# main

https://www.kaggle.com/competitions/home-data-for-ml-course/overview

In [68]:
import pandas as pd 
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [69]:
from pathlib import Path

# Directory holding the competition CSV files; change it here if the data lives elsewhere.
DATA_DIR = Path('/data/notebook_files')

test_data = pd.read_csv(DATA_DIR / 'test.csv')
train_data = pd.read_csv(DATA_DIR / 'train.csv')
sample_submission = pd.read_csv(DATA_DIR / 'sample_submission.csv')

# Keep the test ids for the submission file, then drop the identifier column
# from both frames so it does not end up among the model features.
test_data_ids = test_data['Id'].astype(int).copy()
test_data = test_data.drop('Id', axis=1)
train_data = train_data.drop('Id', axis=1)

In [70]:
# Peek at five randomly drawn training rows
train_data.sample(n=5)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
724,20,RL,86.0,13286,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,2,2009,WD,Normal,320000
323,20,RM,49.0,5820,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,7,2006,WD,Normal,126175
1245,80,RL,78.0,12090,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,1,2007,WD,Abnorml,178000
1123,20,RL,50.0,9405,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,6,2009,WD,Normal,118000
1308,20,RM,100.0,12000,Pave,,Reg,Lvl,AllPub,Inside,...,0,,GdWo,,0,5,2008,WD,Normal,147000


In [71]:
# Column dtypes of the training frame, displayed as a table
train_dtypes = train_data.dtypes
train_dtypes.reset_index()

Unnamed: 0,index,0
0,MSSubClass,int64
1,MSZoning,object
2,LotFrontage,float64
3,LotArea,int64
4,Street,object
...,...,...
75,MoSold,int64
76,YrSold,int64
77,SaleType,object
78,SaleCondition,object


In [72]:
# Peek at five randomly drawn test rows
test_data.sample(n=5)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
1107,20,RL,,25485,Pave,,IR1,Lvl,AllPub,CulDSac,...,0,0,,,,0,5,2007,WD,Normal
21,120,RL,41.0,7132,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,4,2010,WD,Normal
957,20,RL,71.0,8540,Pave,,Reg,Lvl,AllPub,Corner,...,0,0,,,,0,6,2007,WD,Normal
1384,20,RL,60.0,7200,Pave,,Reg,Low,AllPub,Inside,...,0,0,,MnPrv,,0,3,2006,WD,Normal
1241,20,RL,70.0,9100,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,7,2006,WD,Normal


In [73]:
# Column dtypes of the test frame, displayed as a table
test_dtypes = test_data.dtypes
test_dtypes.reset_index()

Unnamed: 0,index,0
0,MSSubClass,int64
1,MSZoning,object
2,LotFrontage,float64
3,LotArea,int64
4,Street,object
...,...,...
74,MiscVal,int64
75,MoSold,int64
76,YrSold,int64
77,SaleType,object


In [74]:
# Peek at five randomly drawn rows of the sample submission to see the expected format
sample_submission.sample(n=5)

Unnamed: 0,Id,SalePrice
1212,2673,154672.422763
517,1978,208232.716273
1255,2716,159851.977739
520,1981,164400.756416
1446,2907,167411.028352


In [75]:
# Count missing values per training column and show only the columns that have any
null_train_data = train_data.isnull().sum()
null_train_data.loc[null_train_data.gt(0)].reset_index()

Unnamed: 0,index,0
0,LotFrontage,259
1,Alley,1369
2,MasVnrType,8
3,MasVnrArea,8
4,BsmtQual,37
5,BsmtCond,37
6,BsmtExposure,38
7,BsmtFinType1,37
8,BsmtFinType2,38
9,Electrical,1


In [76]:
# Count missing values per test column and show only the columns that have any
null_test_data = test_data.isnull().sum()
null_test_data.loc[null_test_data.gt(0)].reset_index()

Unnamed: 0,index,0
0,MSZoning,4
1,LotFrontage,227
2,Alley,1352
3,Utilities,2
4,Exterior1st,1
5,Exterior2nd,1
6,MasVnrType,16
7,MasVnrArea,15
8,BsmtQual,44
9,BsmtCond,45


In [77]:
# Define the categorical (object dtype) and numeric column lists for both frames.
# The target 'SalePrice' is excluded from the numeric lists; per the pandas docs,
# Index.difference also returns the remaining labels sorted where possible.

train_object_columns = train_data.select_dtypes(include='object').columns
train_numeric_columns = train_data.select_dtypes(include=[np.number]).columns
cat_cols_train_data = train_object_columns.tolist()
num_cols_train_data = train_numeric_columns.difference(['SalePrice']).tolist()

test_object_columns = test_data.select_dtypes(include='object').columns
test_numeric_columns = test_data.select_dtypes(include=[np.number]).columns
cat_cols_test_data = test_object_columns.tolist()
num_cols_test_data = test_numeric_columns.difference(['SalePrice']).tolist()

In [78]:
# Check how many missing values each test column contains
null_test_data = test_data.isnull().sum()
null_test_data.loc[null_test_data.gt(0)].reset_index()

Unnamed: 0,index,0
0,MSZoning,4
1,LotFrontage,227
2,Alley,1352
3,Utilities,2
4,Exterior1st,1
5,Exterior2nd,1
6,MasVnrType,16
7,MasVnrArea,15
8,BsmtQual,44
9,BsmtCond,45


In [79]:
# Fill missing values: column mean for numeric features, most frequent label for
# categorical ones. Both imputers learn their statistics from the training frame
# only and are then applied to the test frame.
# NOTE(review): the imputers are fitted on the whole training frame, i.e. before
# the later train/validation split — confirm that this is acceptable.

from sklearn.impute import SimpleImputer

num_imputer = SimpleImputer(strategy='mean').fit(train_data[num_cols_train_data])
cat_imputer = SimpleImputer(strategy='most_frequent').fit(train_data[cat_cols_train_data])

train_data[num_cols_train_data] = num_imputer.transform(train_data[num_cols_train_data])
train_data[cat_cols_train_data] = cat_imputer.transform(train_data[cat_cols_train_data])

test_data[num_cols_test_data] = num_imputer.transform(test_data[num_cols_test_data])
test_data[cat_cols_test_data] = cat_imputer.transform(test_data[cat_cols_test_data])

In [80]:
# Blank out test labels that never occur in the training data, so that they can
# be re-imputed afterwards.
# np.nan is used as the marker (not None) because the categorical imputer was
# created with the default `missing_values=np.nan`; a None stored in an object
# column is not guaranteed to be recognised as missing by it.

for col in cat_cols_test_data:
    if col in cat_cols_train_data:
        train_categories = train_data[col].unique()
        unseen_mask = ~test_data[col].isin(train_categories)
        test_data.loc[unseen_mask, col] = np.nan

In [81]:
# Run the categorical imputer once more to fill the values that were just blanked out
test_cat_imputed = cat_imputer.transform(test_data[cat_cols_test_data])
test_data[cat_cols_test_data] = test_cat_imputed

In [82]:
# After imputation: list the training columns that still contain missing values
null_train_data = train_data.isnull().sum()
null_train_data.loc[null_train_data.gt(0)].reset_index()

Unnamed: 0,index,0


In [83]:
# After imputation: list the test columns that still contain missing values
null_test_data = test_data.isnull().sum()
null_test_data.loc[null_test_data.gt(0)].reset_index()

Unnamed: 0,index,0


In [84]:
# Turn object columns into pandas `category` dtype so that the LightGBM model
# can work with them as categorical features.

train_data[cat_cols_train_data] = train_data[cat_cols_train_data].astype('category')

# Reuse the training dtype for every test column instead of inferring the
# categories from the test values: both frames then map each label to the same
# category code even when the test frame contains fewer distinct labels.
test_category_dtypes = {
    col: train_data[col].dtype if col in cat_cols_train_data else 'category'
    for col in cat_cols_test_data
}
test_data = test_data.astype(test_category_dtypes)

In [85]:
# Sanity check: training columns that are neither numeric nor categorical
train_data_checking = train_data.select_dtypes(exclude=[np.number, 'category'])
train_data_checking.columns.tolist()

[]

In [86]:
# Sanity check: test columns that are neither numeric nor categorical
test_data_checking = test_data.select_dtypes(exclude=[np.number, 'category'])
test_data_checking.columns.tolist()

[]

In [87]:
from sklearn.model_selection import train_test_split

# Target is 'SalePrice'; every other column is a feature
y = train_data['SalePrice']
X = train_data.drop(columns='SalePrice')

# Hold out 20% of the rows for validation, with a fixed seed for a repeatable split
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

print(X_train.shape, X_val.shape)
print(y_train.shape, y_val.shape)

(1168, 79) (292, 79)
(1168,) (292,)


In [88]:
!pip install lightgbm
!pip install optuna

import optuna 
optuna.logging.set_verbosity(optuna.logging.ERROR)
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Optuna objective: cross-validated score of a LightGBM regressor.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean ``neg_mean_absolute_error`` over 5 cross-validation folds on
        ``X_train`` / ``y_train`` (higher is better).
    """
    # NOTE(review): learning_rate is sampled linearly up to 2, which is unusually
    # high for gradient boosting — confirm that this range is intended.
    params_1 = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 0.05, 2),
        "num_leaves": trial.suggest_int("num_leaves", 2, 200),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 200),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.1, 3),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.1, 3)
    }
    model_1 = LGBMRegressor(**params_1, random_state=42, verbose=-1)
    # No fit-time arguments are forwarded: the categorical columns of X_train
    # already have pandas `category` dtype, which LightGBM's default
    # categorical_feature='auto' picks up. This avoids cross_val_score's
    # `fit_params` argument, which newer scikit-learn releases deprecate/remove.
    score_1 = cross_val_score(model_1, X_train, y_train, cv=5,
                              scoring='neg_mean_absolute_error'
                              ).mean()
    return score_1

# Seed the sampler so the hyperparameter search is repeatable from run to run.
# The objective returns a negative MAE, hence direction='maximize'.
study_1 = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_1.optimize(objective, n_trials=100)
print(study_1.best_value)
print(study_1.best_params)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
-16387.742452269886
{'n_estimators': 102, 'learning_rate': 0.06047764653034577, 'num_leaves': 168, 'min_child_samples': 27, 'reg_alpha': 0.24527338935942944, 'reg_lambda': 2.478921458197021}


In [89]:
from sklearn.metrics import mean_absolute_error, r2_score

# Refit a regressor with the best hyperparameters found by the study
best_params_1 = dict(study_1.best_params)
model_1 = LGBMRegressor(random_state=42, verbose=-1, **best_params_1)
model_1.fit(X_train, y_train, categorical_feature=cat_cols_train_data)

# Score it on the held-out validation rows
prediction_model_1 = model_1.predict(X_val)
mae_model_1 = mean_absolute_error(y_val, prediction_model_1)
r2_model_1 = r2_score(y_val, prediction_model_1)

# First line printed is the MAE, second line is R^2
print(f"{mae_model_1:.4f}")
print(f"{r2_model_1:.4f}")

16109.6471
0.8934


In [91]:
# Predict sale prices for the test frame and pair them with the saved test ids
prediction_submission = model_1.predict(test_data)

submission_columns = {
    'Id': test_data_ids,
    'SalePrice': prediction_submission,
}
submission = pd.DataFrame(submission_columns)

# Write the submission file without the pandas index column
submission.to_csv('submission_file_housing_prices.csv', index=False)