In [1]:
import pandas as pd
import numpy as np

## Импорт данных

In [2]:
X = pd.read_csv("data/samples.csv")
X.head()

Unnamed: 0,Категория,Цель в долларах,Срок,Год публикации,Close_brent,CAD,CHF,DKK,EUR,GBP,...,Design,Fashion,Film & Video,Food,Journalism,Music,Photography,Publishing,Technology,Theater
0,6035.989239,1000.0,39,2009,34.41,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,3591.033473,80000.0,87,2009,34.41,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,3661.42455,20.0,8,2009,34.41,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4321.245721,99.0,79,2009,34.41,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,6035.989239,1900.0,28,2009,34.41,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [5]:
X.shape

(331675, 32)

In [6]:
Y = pd.read_csv("data/targets_linreg.csv")
# Y = Y['таргет2']
Y.head()
Y.shape

(331675, 1)

## Построение простой модели линейной регрессии

In [7]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

model.fit(X, Y)

X['Предсказание'] = model.predict(X)

X.head()

Unnamed: 0,Категория,Цель в долларах,Срок,Год публикации,Close_brent,CAD,CHF,DKK,EUR,GBP,...,Fashion,Film & Video,Food,Journalism,Music,Photography,Publishing,Technology,Theater,Предсказание
0,6035.989239,1000.0,39,2009,34.41,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,3125.804988
1,3591.033473,80000.0,87,2009,34.41,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,5117.789402
2,3661.42455,20.0,8,2009,34.41,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1680.580057
3,4321.245721,99.0,79,2009,34.41,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,4935.864467
4,6035.989239,1900.0,28,2009,34.41,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,2183.699047


## Расчёт ошибок для полученной модели

In [8]:
MSE = np.round(((X['Предсказание'] - Y['таргет2'])**2).mean(), 3)
RMSE = np.round((((X['Предсказание'] - Y['таргет2'])**2).mean()) ** 0.5, 3)
MAE = np.round((abs(X['Предсказание'] - Y['таргет2']).mean()), 3)
R_square = np.round(1 - (((Y['таргет2'] - X['Предсказание'])**2).sum()/((Y['таргет2'] - Y['таргет2'].mean())**2).sum()), 3)


print(f"MSE полученной модели: {MSE}")
print(f"RMSE полученной модели: {RMSE}")
print(f"MAE полученной модели: {MAE}")
print(f"R^2 полученной модели: {R_square}")


MSE полученной модели: 9201867212.576
RMSE полученной модели: 95926.363
MAE полученной модели: 13854.018
R^2 полученной модели: 0.017


## Оценка на тестовых данных

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
final_model = LinearRegression().fit(X_train, Y_train)
y_pred = final_model.predict(X_test)
MSE = np.round(((y_pred - Y_test)**2).mean(), 3)
print(f"Test MSE: {float(MSE)}")

Test MSE: 10499413799.0


  print(f"Test MSE: {float(MSE)}")


## Применение кросс-валидации

### K-Folds

In [12]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold

cv = KFold(n_splits=10, shuffle=True, random_state=42)

#### - Ручками

In [13]:
losses_train = []
losses_test = []

for train_index, test_index in cv.split(X):
    X_train, X_test = X.values[train_index], X.values[test_index]
    Y_train, Y_test = Y.values[train_index], Y.values[test_index]

    model = LinearRegression()
    model.fit(X_train, Y_train)

    losses_test.append(np.mean((model.predict(X_test)-Y_test)**2)**(1/2))
    losses_train.append(np.mean((model.predict(X_train)-Y_train)**2)**(1/2))

In [14]:
int(np.mean(losses_test)), int(np.mean(losses_train))

(93083, 95888)

#### - Метод cross_val_score из sklearn

In [15]:
model = LinearRegression()

mse_scores = -cross_val_score(model, X, Y, cv = cv, scoring="neg_mean_squared_error")
r2_scores = cross_val_score(model, X, Y, cv = cv, scoring="r2")
print(f"MSE на кросс-валидации: {np.mean(mse_scores):.3f}")
print(f"R2 на кросс-валидации: {np.mean(r2_scores):.3f}")

MSE на кросс-валидации: 9203339501.410
R2 на кросс-валидации: 0.019


## Убрать мультиколинеарность

In [None]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

scaler_x = StandardScaler()

losses_train = []
losses_test = []

for train_index, test_index in cv.split(X):
    X_train, X_test = X.values[train_index], X.values[test_index]
    Y_train, Y_test = Y.values[train_index], Y.values[test_index]

    scaler_x = StandardScaler()
    scaler_x.fit(X_train)
    X_train_transformed = scaler_x.transform(X_train)
    X_test_transformed = scaler_x.transform(X_test)



    model = Lasso()

    model.fit(X_train_transformed, Y_train)

    losses_test.append(np.mean((model.predict(X_test_transformed)-Y_test)**2)**(1/2))
    losses_train.append(np.mean((model.predict(X_train_transformed)-Y_train)**2)**(1/2))







  model = cd_fast.enet_coordinate_descent(


In [None]:
int(np.mean(losses_test)), int(np.mean(losses_train))

(0, 1)

In [None]:


model = LinearRegression()



mse_scores = -cross_val_score(model, X, Y, cv = cv, scoring="neg_mean_squared_error")
r2_scores = cross_val_score(model, X, Y, cv = cv, scoring="r2")
print(f"MSE на кросс-валидации: {np.mean(mse_scores):.3f}")
print(f"R2 на кросс-валидации: {np.mean(r2_scores):.3f}")


MSE на кросс-валидации: 9203339501.410
R2 на кросс-валидации: 0.019


In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
final_model = LinearRegression().fit(X_train, Y_train)
y_pred = final_model.predict(X_test)
MSE = np.round(((y_pred - Y_test)**2).mean(), 3)
print(f"Test MSE: {MSE:.4f}")


Test MSE: 10499413799.0000


## Убрать мультиколинеарность

In [None]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

scaler_x = StandardScaler()

scaler_x.fit(X_train)
X_train_transformed = scaler_x.transform(X_train)
X_test_transformed = scaler_x.transform(X_test)

model_Lasso = Lasso()
model_Lasso.fit(X_train_transformed, Y_train)

mse_scores = -cross_val_score(model_Lasso, X_train_transformed, Y_train, cv = cv, scoring="neg_mean_squared_error")
print(f"MSE на кросс-валидации: {np.mean(mse_scores):.3f}")

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


MSE на кросс-валидации: 8879458836.535


  model = cd_fast.enet_coordinate_descent(
