<a href="https://colab.research.google.com/github/VadbOss/Vadim-s_repa/blob/main/california_house_040522.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [44]:
# Read the train and test CSVs from the Colab sample_data directory.
data, data_test = map(
    pd.read_csv,
    (
        '/content/sample_data/california_housing_train.csv',
        '/content/sample_data/california_housing_test.csv',
    ),
)

In [45]:
# pd.read_csv already yields DataFrames; these constructor calls just re-wrap
# them under the names used by the rest of the notebook.
df, df_test = pd.DataFrame(data), pd.DataFrame(data_test)

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
#df.info()

In [46]:
# Training split: the target column on one side, every other column on the other.
y_train = df['median_house_value']
X_train = df.drop(columns=['median_house_value'])

In [47]:
# Test split: the target column on one side, every other column on the other.
y_test = df_test['median_house_value']
X_test = df_test.drop(columns=['median_house_value'])

In [48]:
# Sanity check: one shape per line for the feature/target splits and the
# full frames they were cut from.
print(
    *(obj.shape for obj in (X_train, X_test, y_train, y_test, df, df_test)),
    sep='\n',
)

(17000, 8)
(3000, 8)
(17000,)
(3000,)
(17000, 9)
(3000, 9)


In [None]:
# Summary statistics of the training frame (describe() defaults).
# The two disabled lines below are alternative checks kept from exploration:
# column dtypes / non-null counts, and the fraction of missing values per column.
#df.info()
#df.isna().sum()/df.shape[0]
df.describe()

In [None]:
#sns.pairplot(df)

In [None]:
# Spearman rank correlation between the columns, drawn as an annotated heatmap
# (values shown with one decimal place).
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(df.corr(method='spearman'), annot=True, fmt='.1f', ax=ax);

# Строим модели

In [50]:
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from sklearn import preprocessing
import lightgbm
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [58]:
# Baseline: ordinary least squares, scored by 5-fold cross-validated MAE.
lr = LinearRegression()
scores = cross_val_score(
    estimator=lr,
    X=X_train,
    y=y_train,
    scoring='neg_mean_absolute_error',  # negated MAE; sign is flipped back below
    cv=5,
    n_jobs=-1,
)
print('LinearRegression cross validation MAE: ', -scores.mean())

LinearRegression cross validation MAE:  53524.10779960257


In [60]:
# Baseline: single decision tree, scored by 5-fold cross-validated MAE.
# random_state is pinned (same seed as the other baseline models) because the
# tree picks at random between equally good splits, so an unseeded tree can
# report a different score on every run.
dt = DecisionTreeRegressor(random_state=47)
scores = cross_val_score(dt, X_train, y_train, cv=5, n_jobs=-1, scoring='neg_mean_absolute_error')
# The scorer returns negated MAE; flip the sign back for reporting.
print('DecisionTreeRegression cross validation MAE: ', -np.mean(scores))

DecisionTreeRegression cross validation MAE:  74002.28511764706


In [62]:
# Baseline: random forest with default hyper-parameters and a fixed seed,
# scored by 5-fold cross-validated MAE.
rf = RandomForestRegressor(random_state=47)
scores = cross_val_score(
    estimator=rf,
    X=X_train,
    y=y_train,
    scoring='neg_mean_absolute_error',  # negated MAE; sign is flipped back below
    cv=5,
    n_jobs=-1,
)
print('RandomForestRegressor cross validation MAE: ', -scores.mean())

RandomForestRegressor cross validation MAE:  62734.098148235295


In [64]:
# Baseline: LightGBM regressor with default hyper-parameters and a fixed seed,
# scored by 5-fold cross-validated MAE.
lgb = lightgbm.LGBMRegressor(random_state=47)
scores = cross_val_score(
    estimator=lgb,
    X=X_train,
    y=y_train,
    scoring='neg_mean_absolute_error',  # negated MAE; sign is flipped back below
    cv=5,
    n_jobs=-1,
)
print('Cross validation MAE: ', -scores.mean())

Cross validation MAE:  55895.86205199895


In [65]:
def cv_params(model, param_grid, X=None, y=None, cv=5,
              scoring='neg_mean_absolute_error'):
  """Grid-search `param_grid` for `model` and return the best parameter set.

  Prints the best cross-validated score (sign-flipped, so a negated-error
  scorer is reported as a positive error) and the winning parameters.

  Args:
    model: unfitted scikit-learn compatible estimator.
    param_grid: dict mapping parameter names to lists of candidate values.
    X, y: training features and target. When both are omitted, the
      notebook-level `X_train` / `y_train` are used, which keeps existing
      two-argument calls working.
    cv: number of folds or a cross-validation splitter. An int makes
      scikit-learn use its default K-fold splitter, which does not shuffle
      rows; pass a splitter object (e.g. a shuffled KFold) if the row order
      of the data is not random.
    scoring: scikit-learn scorer name used to rank candidates.

  Returns:
    dict with the best parameter combination (`best_params_`).

  Raises:
    ValueError: if only one of `X` / `y` is supplied.
  """
  if X is None and y is None:
    # Default to the notebook's training split.
    X, y = X_train, y_train
  elif X is None or y is None:
    # Mixing a caller-supplied half with a global half would pair
    # features and targets that may not belong together.
    raise ValueError("X and y must be passed together (or both omitted)")

  opt_params = GridSearchCV(
      estimator=model,          # model to tune
      param_grid=param_grid,    # candidate parameter values
      scoring=scoring,          # metric used to rank candidates
      cv=cv,                    # cross-validation folds / splitter
      n_jobs=-1)                # parallel jobs; -1 = use all processors

  opt_params.fit(X, y)
  params = opt_params.best_params_
  best_score = opt_params.best_score_

  print(f"Best score: {round(-best_score, 2)}")
  print(f"Best parameters: {params}")

  return params

In [66]:
# Search space for the LightGBM regressor.
lgb_param_grid = dict(
    max_depth=[4, 10, 15, -1],                # maximum tree depth (-1 = no limit)
    num_leaves=[25, 35, 45],                  # maximum number of leaves per tree
    n_estimators=[41, 100, 250, 500, 600],    # number of boosted trees
)

# Tune a freshly seeded estimator and keep the winning parameter set.
lgb_clean = lightgbm.LGBMRegressor(random_state=1)
lgb_params = cv_params(model=lgb_clean, param_grid=lgb_param_grid)

Best score: 53871.03
Best parameters: {'max_depth': 4, 'n_estimators': 250, 'num_leaves': 25}


In [67]:
# Show the LightGBM parameter set that will be used for the final model.
print(lgb_params)

{'max_depth': 4, 'n_estimators': 250, 'num_leaves': 25}


In [68]:
# Search space for the random forest: tree depth cap and number of trees.
rf_param_grid = dict(
    max_depth=[20, 25],
    n_estimators=[500, 800],
)

# Tune a freshly seeded estimator and keep the winning parameter set.
rf_clean = RandomForestRegressor(random_state=1)
rf_params = cv_params(model=rf_clean, param_grid=rf_param_grid)



Best score: 62216.33
Best parameters: {'max_depth': 20, 'n_estimators': 500}


# Итоговая модель на лучших параметрах

In [69]:
# Refit LightGBM on the full training set with the grid-search winners.
# random_state=1 mirrors the estimator that was tuned (lgb_clean), so the model
# evaluated here has the same configuration as the one the CV score was
# measured on. Tuned values are unpacked last, so they win if a key repeats.
lgb = lightgbm.LGBMRegressor(**{'random_state': 1, **lgb_params})
lgb.fit(X_train, y_train)

preds = lgb.predict(X_test)

# Held-out test metrics: MAPE shown as a percentage, MAE in target units.
print(f"MAPE: {round(mean_absolute_percentage_error(y_test, preds)*100, 2)}%")
print(f"MAE: {round(mean_absolute_error(y_test, preds), 2)}")

MAPE: 18.71%
MAE: 32943.83


In [72]:
# Side-by-side table of predictions (rounded to one decimal) and true values.
results = pd.DataFrame({'Model': np.round(preds, 1), 'Actual': y_test})
# drop=True discards the old index outright instead of first turning it into
# a column and then dropping that column by the assumed name 'index', which
# fails with KeyError when the index carries a name.
results.reset_index(drop=True)

Unnamed: 0,Model,Actual
0,422319.4,344700.0
1,205899.3,176500.0
2,264928.3,270500.0
3,271900.3,330000.0
4,77244.1,81700.0
...,...,...
2995,275310.3,225000.0
2996,213805.2,237200.0
2997,63991.4,62000.0
2998,173566.2,162500.0
