In [1]:
import pandas as pd
import numpy as np

## Импорт данных

In [3]:
# Load the feature matrix.
# The file carries an "Unnamed: 0" column (0, 1, 2, ... in the preview), which
# appears to be a row index written out when the CSV was saved rather than a
# real feature. Drop it so it never reaches the regression as a predictor;
# errors="ignore" keeps the cell working if the file is re-exported without it.
# The default RangeIndex is kept on purpose (instead of index_col=0) so that
# element-wise arithmetic between X columns and the default-indexed target
# Series stays aligned row by row.
# NOTE: the recorded outputs of this notebook predate this change; re-run all
# cells to refresh them.
X = pd.read_csv("data/samples.csv").drop(columns=["Unnamed: 0"], errors="ignore")
X.head()

Unnamed: 0,Категория,Цель в долларах,Срок,Год публикации,Close_brent,CAD,CHF,DKK,EUR,GBP,...,Design,Fashion,Film & Video,Food,Journalism,Music,Photography,Publishing,Technology,Theater
0,6035.989239,1000.0,39,2009,34.41,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,3591.033473,80000.0,87,2009,34.41,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,3661.42455,20.0,8,2009,34.41,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4321.245721,99.0,79,2009,34.41,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,6035.989239,1900.0,28,2009,34.41,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


## Построение простой модели линейной регрессии

In [10]:
# Read the targets file and keep only the 'таргет2' column as the regression
# target Series, then preview its first rows.
Y = pd.read_csv("data/targets.csv")['таргет2']
Y.head()

0    625.0
1     22.0
2     35.0
3    145.0
4    387.0
Name: таргет2, dtype: float64

In [11]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares fitted on the whole data set.
# The predictions are written back into X at the end of this cell, so when the
# cell is executed a second time X already contains 'Предсказание'. Exclude
# that column before fitting, otherwise the model would be trained on its own
# previous output and every re-run would give a different result.
X_features = X.drop(columns=['Предсказание'], errors='ignore')

model = LinearRegression()
model.fit(X_features, Y)

# In-sample predictions; stored as a column of X because the error-metrics
# cell reads X['Предсказание'].
X['Предсказание'] = model.predict(X_features)

X.head()

Unnamed: 0,Категория,Цель в долларах,Срок,Год публикации,Close_brent,CAD,CHF,DKK,EUR,GBP,...,Fashion,Film & Video,Food,Journalism,Music,Photography,Publishing,Technology,Theater,Предсказание
0,6035.989239,1000.0,39,2009,34.41,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,3125.804988
1,3591.033473,80000.0,87,2009,34.41,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,5117.789402
2,3661.42455,20.0,8,2009,34.41,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1680.580057
3,4321.245721,99.0,79,2009,34.41,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,4935.864467
4,6035.989239,1900.0,28,2009,34.41,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,2183.699047


In [12]:
# Display the target Series; pandas abbreviates the output to the first and
# last rows and reports the name, length and dtype.
Y

0         625.00
1          22.00
2          35.00
3         145.00
4         387.00
           ...  
331670    269.84
331671    544.77
331672    202.00
331673    265.00
331674    513.60
Name: таргет2, Length: 331675, dtype: float64

## Расчёт ошибок для полученной модели

In [13]:
# In-sample error metrics of the fitted model, each rounded to 3 decimals.
# The residuals are computed once and reused by every metric.
residuals = X['Предсказание'] - Y
squared_residuals = residuals ** 2
mean_squared_residual = squared_residuals.mean()

MSE = np.round(mean_squared_residual, 3)
RMSE = np.round(mean_squared_residual ** 0.5, 3)
MAE = np.round(residuals.abs().mean(), 3)

# R^2 = 1 - SS_res / SS_tot, with SS_tot taken around the mean of Y.
total_sum_of_squares = ((Y - Y.mean()) ** 2).sum()
R_square = np.round(1 - (squared_residuals.sum() / total_sum_of_squares), 3)


print(f"MSE полученной модели: {MSE}")
print(f"RMSE полученной модели: {RMSE}")
print(f"MAE полученной модели: {MAE}")
print(f"R^2 полученной модели: {R_square}")


MSE полученной модели: 9201867212.576
RMSE полученной модели: 95926.363
MAE полученной модели: 13854.018
R^2 полученной модели: 0.017


## Применение кросс-валидации

In [16]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, KFold

# By the time this cell runs, X also holds the 'Предсказание' column produced
# by the baseline model, which was fitted on ALL of Y. Feeding it to the
# cross-validated estimator leaks target information into every fold, so it is
# removed here. errors='ignore' keeps the cell valid when the column is absent.
X_features = X.drop(columns=['Предсказание'], errors='ignore')

model = LinearRegression()

# 10 shuffled folds with a fixed seed so the scores are reproducible.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# cross_validate scores both metrics on the same folds in a single pass,
# instead of refitting the 10 models once per metric.
cv_results = cross_validate(
    model, X_features, Y, cv=cv, scoring=("neg_mean_squared_error", "r2")
)
# scikit-learn reports MSE negated (higher is better); flip the sign back.
mse_scores = -cv_results["test_neg_mean_squared_error"]
r2_scores = cv_results["test_r2"]
print(f"MSE на кросс-валидации: {np.mean(mse_scores):.3f}")
print(f"R2 на кросс-валидации: {np.mean(r2_scores):.3f}")


MSE на кросс-валидации: 9203339501.410
R2 на кросс-валидации: 0.019


## Оценка на тестовых данных

In [17]:
# Hold-out evaluation: fit on 80% of the rows, score on the remaining 20%.
# X carries the 'Предсказание' column written by the baseline cell; it was
# produced by a model that had seen every target value, including those of
# the test rows, so it must not be used as a feature here.
X_features = X.drop(columns=['Предсказание'], errors='ignore')
X_train, X_test, Y_train, Y_test = train_test_split(X_features, Y, test_size=0.2, random_state=42)
final_model = LinearRegression().fit(X_train, Y_train)
y_pred = final_model.predict(X_test)
# Note: this rebinds MSE, replacing the in-sample value computed earlier.
MSE = np.round(((y_pred - Y_test)**2).mean(), 3)
print(f"Test MSE: {MSE:.4f}")


Test MSE: 10499413799.0000
