In [None]:
from google.colab import files
uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 62 bytes


In [None]:
!kaggle competitions download -c home-data-for-ml-course
!unzip home-data-for-ml-course.zip

Downloading home-data-for-ml-course.zip to /content
  0% 0.00/386k [00:00<?, ?B/s]
100% 386k/386k [00:00<00:00, 77.4MB/s]
Archive:  home-data-for-ml-course.zip
  inflating: data_description.txt    
  inflating: sample_submission.csv   
  inflating: sample_submission.csv.gz  
  inflating: test.csv                
  inflating: test.csv.gz             
  inflating: train.csv               
  inflating: train.csv.gz            


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# PREPROCESSING des données

In [None]:
# Load the training and test tables from the extracted CSV files.
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Count the missing entries of every column (largest first) and print only the affected columns.
missing_values = train_data.isna().sum().sort_values(ascending=False)
print("Valeus manquantes dans les colonnes:\n", missing_values.loc[missing_values > 0])

Valeus manquantes dans les colonnes:
 PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageYrBlt       81
GarageCond        81
GarageType        81
GarageFinish      81
GarageQual        81
BsmtFinType2      38
BsmtExposure      38
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
MasVnrArea         8
Electrical         1
dtype: int64


### On supprime les colonnes avec plus de 50% de données manquantes

In [None]:
# Fraction of missing entries above which a column is discarded.
threshold = 0.5
# Share of missing entries per column.
missing_percentage = train_data.isnull().sum() / len(train_data)
# Labels of the columns whose missing share exceeds the threshold.
columns_to_drop = missing_percentage.index[missing_percentage > threshold]
train_data = train_data.drop(columns=columns_to_drop)
print("Colonnes supprimées :", list(columns_to_drop))

Colonnes supprimées : ['Alley', 'MasVnrType', 'PoolQC', 'Fence', 'MiscFeature']


On traite les colonnes à valeurs numériques : On remplit les 'trous' avec les valeurs médianes

In [None]:
# Median-impute the gaps of every int64/float64 column.
numerical_cols = train_data.select_dtypes(include=['int64', 'float64']).columns
numeric_part = train_data[numerical_cols]
train_data[numerical_cols] = numeric_part.fillna(numeric_part.median())

On fait du one-hot encoding pour traiter les colonnes avec des valeurs non numériques, en créant des colonnes 'dummy'


In [None]:
# One-hot encode the non-numeric columns; the first level of each one is dropped.
train_data = pd.get_dummies(data=train_data, drop_first=True)

In [None]:
# Preview the first five rows of the encoded training table.
train_data.head(n=5)

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,False,False,False,False,True,False,False,False,True,False
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,False,False,False,False,True,False,False,False,True,False
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,False,False,False,False,True,False,False,False,True,False
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,False,False,False,False,True,False,False,False,False,False
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,False,False,False,False,True,False,False,False,True,False


Par exemple : Il y a 3 nouvelles colonnes pour LandContour: [LandContour_HLS, LandContour_Low, LandContour_Lvl]

In [None]:
# Report how many columns the preprocessing produced, followed by their names.
column_names = list(train_data.columns)
print(f'Nombre de colonnes après preprocessing : {len(column_names)}')
print(f"Liste des colonnes : {', '.join(column_names)}")

Nombre de colonnes après preprocessing : 235
Liste des colonnes : Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, YearBuilt, YearRemodAdd, MasVnrArea, BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, 1stFlrSF, 2ndFlrSF, LowQualFinSF, GrLivArea, BsmtFullBath, BsmtHalfBath, FullBath, HalfBath, BedroomAbvGr, KitchenAbvGr, TotRmsAbvGrd, Fireplaces, GarageYrBlt, GarageCars, GarageArea, WoodDeckSF, OpenPorchSF, EnclosedPorch, 3SsnPorch, ScreenPorch, PoolArea, MiscVal, MoSold, YrSold, SalePrice, MSZoning_FV, MSZoning_RH, MSZoning_RL, MSZoning_RM, Street_Pave, LotShape_IR2, LotShape_IR3, LotShape_Reg, LandContour_HLS, LandContour_Low, LandContour_Lvl, Utilities_NoSeWa, LotConfig_CulDSac, LotConfig_FR2, LotConfig_FR3, LotConfig_Inside, LandSlope_Mod, LandSlope_Sev, Neighborhood_Blueste, Neighborhood_BrDale, Neighborhood_BrkSide, Neighborhood_ClearCr, Neighborhood_CollgCr, Neighborhood_Crawfor, Neighborhood_Edwards, Neighborhood_Gilbert, Neighborhood_IDOTRR, Neighborhood_MeadowV, 

In [None]:
# Features: every column except the target (SalePrice) and the Id column.
X = train_data.drop(columns=['SalePrice', 'Id'])
# Target: the sale price.
y = train_data['SalePrice']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Choix des modèles

In [None]:
# Install the xgboost and CatBoost packages through pip (shell commands run from the notebook).
!pip install xgboost
!pip install CatBoost

import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor


Collecting CatBoost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: CatBoost
Successfully installed CatBoost-1.2.7


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Add
from tensorflow.keras.callbacks import EarlyStopping

# Fully-connected regression network: eight hidden ReLU layers narrowing from 1024 to 8 units,
# dropout after the first seven of them, and a single linear output unit.
from tensorflow.keras.layers import Input

model_homemade = Sequential([
    # Declare the input with an explicit Input layer: passing `input_shape` to the first Dense
    # layer makes recent Keras versions emit a UserWarning when the model is built.
    Input(shape=(X_train.shape[1],)),
    Dense(1024, activation='relu'),
    Dropout(0.4),
    Dense(512, activation='relu'),
    Dropout(0.4),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dropout(0.1),
    Dense(8, activation='relu'),
    Dense(1)
])

# Mean squared error as the training loss, mean absolute error as the reported metric.
model_homemade.compile(optimizer='adam', loss='mse', metrics=['mae'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Candidate regressors, keyed by the label used in the comparison report.
# Insertion order is the order in which they are trained and reported.
models = dict([
    ("Notre modèle", model_homemade),
    ("Linear Regression", LinearRegression()),
    ("Random Forest", RandomForestRegressor()),
    ("Gradient Boosting", GradientBoostingRegressor()),
    ("XGBoost", XGBRegressor()),
    ("Support Vector Regression", SVR()),
    ("K-Nearest Neighbors", KNeighborsRegressor()),
    ("Decision Tree", DecisionTreeRegressor()),
    ("LightGBM", lgb.LGBMRegressor(objective='regression')),
    ("CatBoost", CatBoostRegressor(verbose=False)),  # verbose=False silences the training logs
])

In [None]:
# Validation metrics of every candidate, keyed by model label.
model_performance = {}

for label, estimator in models.items():
    # Train the candidate on the training split.
    estimator.fit(X_train, y_train)

    # Score it on the validation split: MAE and RMSE.
    preds = estimator.predict(X_valid)
    model_performance[label] = {
        'MAE': mean_absolute_error(y_valid, preds),
        'RMSE': np.sqrt(mean_squared_error(y_valid, preds)),
    }

# Report both metrics for each model, in training order.
for label, scores in model_performance.items():
    print(f"{label}:\n  MAE: {scores['MAE']}\n  RMSE: {scores['RMSE']}")


[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 20ms/step - loss: 35492364288.0000 - mae: 163464.0625
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001507 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3130
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 149
[LightGBM] [Info] Start training from score 181441.541952
Notre modèle:
  MAE: 100853.359375
  RMSE: 127169.90033809101
Linear Regression:
  MAE: 20535.78962215432
  RMSE: 52200.261201205096
Random Forest:
  MAE: 17661.89366438356
  RMSE: 28918.04539203667
Gradient Boosting:
  MAE: 17417.378224566564
  RMSE: 27650.180467442427
XGBoost:
  MAE: 17330.080078125
  RMSE: 28951.635256061098
Support Vector Regression:
  MAE: 59556.72803222618
  RMSE: 88653.0

## GradBoostRegressor

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Base estimator whose hyper-parameters are tuned below.
gbm = GradientBoostingRegressor()

# Hyper-parameter candidates sampled by the randomized search.
param = {
    'n_estimators': [100, 200, 300,500,1000, 1500, 2000],
    'learning_rate': [0.001, 0.01, 0.1, 0.05, 0.5, 0.04, 0.06],
    'max_depth': list(np.arange(3, 31, 2)),
    'min_samples_split': [2, 5, 10, 20, 30, 50],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 4, 8],
    'loss': ['squared_error', 'absolute_error', 'huber'],
}

# Randomized search with 10-fold cross-validation, scored by R².
# The search is fitted on the training split only: fitting it on the full (X, y) would let the
# hold-out rows influence the chosen hyper-parameters, which makes any score later computed on
# (X_valid, y_valid) optimistic.
gbm_search = RandomizedSearchCV(gbm, param_distributions=param, cv=10, n_jobs=-1, random_state=20, scoring='r2')
gbm_search.fit(X_train, y_train)

print("Best Parameters:", gbm_search.best_params_)
print("Best Score:", gbm_search.best_score_)
# Keep the best configuration, which the search has refitted on the data it was given.
best_model_gbm = gbm_search.best_estimator_

Best Parameters: {'subsample': 0.9, 'n_estimators': 1500, 'min_samples_split': 30, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 9, 'loss': 'squared_error', 'learning_rate': 0.01}
Best Score: 0.8964279092959601


In [None]:
# Fit the tuned gradient-boosting model on the training split.
best_model_gbm.fit(X_train, y_train)

In [None]:
# Evaluate the tuned gradient-boosting model on the validation split.
# The model is already fitted on (X_train, y_train) by the preceding cell, so it is not fitted
# again here: a second identical fit only repeats the training cost and, as the estimator was
# created without a fixed random_state, may replace the model fitted above with a different one.
preds = best_model_gbm.predict(X_valid)

mae = mean_absolute_error(y_valid, preds)
print("Mean Absolute Error:", mae)
rmse = np.sqrt(mean_squared_error(y_valid, preds))
print("Root Mean Squared Error:", rmse)

## CatBoost

In [None]:
# Base CatBoost regressor whose hyper-parameters are tuned below (training logs silenced).
# bagging_temperature, searched below, is a Bayesian-bootstrap setting, so the bootstrap type
# is stated explicitly instead of relying on CatBoost's default choice.
catboost_model = CatBoostRegressor(verbose=False, bootstrap_type='Bayesian')

# Hyper-parameter candidates sampled by the randomized search.
param_catboost = {
    'iterations': [100, 200, 300, 500, 1000, 1500, 2000],
    'learning_rate': [0.001, 0.01, 0.1, 0.05, 0.5, 0.04, 0.06],
    # CatBoost does not accept tree depths above 16; deeper candidates would fail to fit and
    # waste search iterations, so the grid stops at 15.
    'depth': list(range(3, 17, 2)),
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'border_count': [32, 64, 128, 254],
    'random_strength': [1, 2, 3, 4, 5],
    'bagging_temperature': [0, 0.5, 1],
    # 'IncToDec' and 'Iter' are the overfitting-detector types CatBoost documents;
    # 'EBS' is not one of them and made every candidate using it fail.
    'od_type': ['IncToDec', 'Iter'],
    'od_wait': [20, 50, 100],
}

# Randomized search with 10-fold cross-validation, scored by R².
# The search is fitted on the training split only, so the hold-out rows cannot influence the
# chosen hyper-parameters and the validation scores computed later stay honest.
catboost_search = RandomizedSearchCV(
    catboost_model,
    param_distributions=param_catboost,
    cv=10,
    n_jobs=-1,
    random_state=20,
    scoring='r2'
)
catboost_search.fit(X_train, y_train)

print("Best Parameters (CatBoost):", catboost_search.best_params_)
print("Best Score (CatBoost):", catboost_search.best_score_)

# Keep the best configuration, which the search has refitted on the data it was given.
best_model_catboost = catboost_search.best_estimator_

In [None]:
# Fit the tuned CatBoost model on the training split.
best_model_catboost.fit(X_train, y_train)

# Error of that model on the validation split.
preds = best_model_catboost.predict(X_valid)
mae = mean_absolute_error(y_valid, preds)
rmse = np.sqrt(mean_squared_error(y_valid, preds))

print("Mean Absolute Error:", mae)
print("Root Mean Squared Error:", rmse)

## XGBoost

In [None]:
# Base XGBoost regressor whose hyper-parameters are tuned below.
xgb_model = XGBRegressor()

# Hyper-parameter candidates sampled by the randomized search.
param_xgboost = {
    'n_estimators': [100, 200, 300, 500, 1000],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 9],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.5, 1],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [0, 0.1, 1]
}

# Randomized search with 10-fold cross-validation, scored by R².
# The search is fitted on the training split only: fitting it on the full (X, y) would let the
# hold-out rows influence the chosen hyper-parameters, which makes any score later computed on
# (X_valid, y_valid) optimistic.
xgboost_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_xgboost,
    cv=10,
    n_jobs=-1,
    random_state=20,
    scoring='r2'
)
xgboost_search.fit(X_train, y_train)

print("Best Parameters (XGBoost):", xgboost_search.best_params_)
print("Best Score (XGBoost):", xgboost_search.best_score_)

# Keep the best configuration, which the search has refitted on the data it was given.
best_model_xgboost = xgboost_search.best_estimator_


In [None]:
# Fit the tuned XGBoost model on the training split.
best_model_xgboost.fit(X_train, y_train)

# Error of that model on the validation split.
preds = best_model_xgboost.predict(X_valid)
mae = mean_absolute_error(y_valid, preds)
rmse = np.sqrt(mean_squared_error(y_valid, preds))

for metric_label, metric_value in (("Mean Absolute Error:", mae),
                                   ("Root Mean Squared Error:", rmse)):
    print(metric_label, metric_value)

# Estimations sur le dataset de test

In [None]:
# Reload the raw test set and remove the columns that were discarded from the training data.
test_data = pd.read_csv('test.csv')
test_data = test_data.drop(columns_to_drop, axis=1, errors='ignore')
# One-hot encode WITHOUT drop_first. drop_first removes the first level *present in this frame*,
# which is not necessarily the reference level that was dropped from the training data (e.g.
# when that level never occurs in the test set). The dummy of a real category would then be
# missing here and be filled with 0 when the columns are aligned on the training features,
# silently encoding those rows as the reference level. Keeping every dummy lets that alignment
# discard exactly the training reference levels instead.
test_data = pd.get_dummies(test_data)

In [None]:
# Align the test features on the training feature columns: dummies unseen in training are
# discarded, dummies absent from the test set are added as all-zero columns.
X_test = test_data.reindex(columns=X.columns, fill_value=0)

# `fill_value` only applies to newly created columns, so gaps in the numeric test features are
# still NaN at this point. Impute them with the training medians, mirroring the median
# imputation applied to the training data, so the model sees identically prepared inputs.
train_medians = X.select_dtypes(include=['int64', 'float64']).median()
X_test = X_test.fillna(train_medians)

# Predict on the test data
predictions = best_model_catboost.predict(X_test)

In [None]:
# Write the submission file: one predicted SalePrice per test Id, without the index column.
submission_path = 'submission_cat.csv'
output = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': predictions})
output.to_csv(submission_path, index=False)
print(f"Predictions saved to {submission_path}")


# Méthode Stacking

In [None]:
# Stacking method

from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt


# Mount Google Drive under /content/drive.
drive.mount('/content/drive')


# Read the training table from the mounted Drive folder.
train_csv_path = "/content/drive/MyDrive/IAetapplications/train.csv"
data = pd.read_csv(train_csv_path)


# Missing values: numeric columns take their median first, then every remaining gap takes the
# most frequent value of its column.
data = data.fillna(data.median(numeric_only=True))
data = data.fillna(data.mode().iloc[0])


# One-hot encode the categorical variables, then replace the target by log(1 + SalePrice).
data = pd.get_dummies(data, drop_first=True)
data = data.assign(SalePrice=np.log1p(data['SalePrice']))


# Target (log-transformed price) and features (everything except the target and Id columns).
y = data['SalePrice']
X = data.drop(columns=['SalePrice', 'Id'])
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)


# Standardise the features: the scaler is fitted on the training split and reused on validation.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_valid = scaler.transform(X_valid)


# Base model 1: random forest fitted on the scaled training split.
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    random_state=42,
)
rf.fit(X_train, y_train)


# Base model 2: XGBoost regressor fitted on the same data.
xgb = XGBRegressor(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    random_state=42,
)
xgb.fit(X_train, y_train)


# Base model 3: fully-connected network with five hidden ReLU layers (1024 down to 64 units),
# batch normalisation after the first four and dropout after each, and one linear output unit.
from tensorflow.keras.layers import Input

model = Sequential([
    # Declare the input with an explicit Input layer: passing `input_shape` to the first Dense
    # layer makes recent Keras versions emit a UserWarning when the model is built.
    Input(shape=(X_train.shape[1],)),
    Dense(1024, activation='relu'),
    BatchNormalization(), Dropout(0.4),
    Dense(512, activation='relu'), BatchNormalization(), Dropout(0.4),
    Dense(256, activation='relu'), BatchNormalization(), Dropout(0.3),
    Dense(128, activation='relu'), BatchNormalization(), Dropout(0.3),
    Dense(64, activation='relu'), Dropout(0.2),
    Dense(1)
])
# Adam optimiser, mean squared error as the loss, mean absolute error as the reported metric.
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
# Train for 100 epochs; the validation split is only used to report metrics after each epoch.
model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=100, batch_size=64, verbose=1)


# Level-1 training features for the meta-model.
# The meta-model has to learn from predictions the base models made on rows they were NOT fitted
# on. In-sample predictions are much closer to the target than what the base models deliver on
# new data, so a meta-model trained on them learns misleading weights. Out-of-fold predictions
# are used instead: each training row is predicted by models fitted on the other folds.
from sklearn.model_selection import KFold, cross_val_predict
from tensorflow.keras.models import clone_model

oof_folds = KFold(n_splits=5, shuffle=True, random_state=42)
X_train_arr = np.asarray(X_train)
y_train_arr = np.asarray(y_train)

# cross_val_predict fits clones of the estimators, so the `rf` and `xgb` instances already
# fitted on the whole training split are left untouched.
rf_preds_train = cross_val_predict(rf, X_train_arr, y_train_arr, cv=oof_folds)
xgb_preds_train = cross_val_predict(xgb, X_train_arr, y_train_arr, cv=oof_folds)

# Same scheme for the network: a fresh copy of the architecture is trained per fold, with the
# optimiser, loss, epochs and batch size used for `model`. This retrains the network once per
# fold and is therefore the slow part of this cell.
nn_preds_train = np.zeros(len(X_train_arr))
for fit_idx, holdout_idx in oof_folds.split(X_train_arr):
    fold_model = clone_model(model)  # same layers, newly initialised weights
    fold_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
    fold_model.fit(X_train_arr[fit_idx], y_train_arr[fit_idx], epochs=100, batch_size=64, verbose=0)
    nn_preds_train[holdout_idx] = fold_model.predict(X_train_arr[holdout_idx], verbose=0).flatten()


# Level-1 validation features: predictions of the base models fitted on the whole training split.
rf_preds_valid = rf.predict(X_valid)
xgb_preds_valid = xgb.predict(X_valid)
nn_preds_valid = model.predict(X_valid).flatten()


# One column per base model, one row per sample.
stacked_train = np.vstack([rf_preds_train, xgb_preds_train, nn_preds_train]).T
stacked_valid = np.vstack([rf_preds_valid, xgb_preds_valid, nn_preds_valid]).T


# Ridge regression combines the three base predictions into the final prediction.
meta_model = Ridge(alpha=1.0)
meta_model.fit(stacked_train, y_train)


# Meta-model predictions for the validation split (still on the log1p scale).
stacked_preds_valid = meta_model.predict(stacked_valid)


# Back to the original price scale: expm1 undoes the log1p applied to the target.
real_prices = np.expm1(y_valid)
predicted_prices = np.expm1(stacked_preds_valid)


# Errors measured on the original scale.
mae = mean_absolute_error(real_prices, predicted_prices)
rmse = np.sqrt(mean_squared_error(real_prices, predicted_prices))


print(f"MAE : {mae:.2f}\nRMSE : {rmse:.2f}")


# Scatter the predicted prices against the real ones for the validation split; the dashed red
# diagonal marks where a perfect prediction would fall.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(real_prices, predicted_prices, alpha=0.5, label='Prédictions vs Réel')
price_bounds = [real_prices.min(), real_prices.max()]
ax.plot(price_bounds, price_bounds, color='red', linestyle='--', label='Idéal')
ax.set_xlabel('Valeurs réelles (SalePrice)')
ax.set_ylabel('Valeurs prédites (SalePrice)')
ax.set_title('Comparaison des valeurs réelles et prédites (Validation)')
ax.legend()
plt.show()
