In [240]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [241]:
# Load the training data
train_data = pd.read_csv('house-prices-data/train.csv')

# Load the test data
test_data = pd.read_csv('house-prices-data/test.csv')

In [242]:
train_data.shape

(1460, 81)

In [243]:
test_data.shape

(1459, 80)

In [244]:
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [245]:
def replace_na(df):
    # Parcourir chaque colonne du DataFrame
    for col in df.columns:
        # Si le type de la colonne est int64 ou float64
        if df[col].dtype in ['int64', 'float64']:
            df[col].fillna(0, inplace=True)  # Remplacer NaN par 0
        else:
            df[col].fillna('None', inplace=True)  # Remplacer NaN par 'None'


In [246]:

replace_na(train_data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(0, inplace=True)  # Remplacer NaN par 0
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('None', inplace=True)  # Remplacer NaN par 'None'


In [247]:
replace_na(test_data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(0, inplace=True)  # Remplacer NaN par 0
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('None', inplace=True)  # Remplacer NaN par 'None'


In [248]:
train_data.shape

(1460, 81)

In [249]:
test_data.shape

(1459, 80)

In [250]:
ids = test_data['Id']

ids.head()

0    1461
1    1462
2    1463
3    1464
4    1465
Name: Id, dtype: int64

In [251]:
train_data.drop('Id', axis=1, inplace=True)

In [252]:
test_data.drop('Id', axis=1, inplace=True)

In [253]:
test_data.shape

(1459, 79)

In [254]:
y = train_data['SalePrice']

X = train_data.drop('SalePrice', axis=1)

In [255]:
X.shape

(1460, 79)

In [256]:
def preprocess_data(df):
    # Séparer les colonnes numériques et non numériques
    num_cols = df.select_dtypes(include=['int64', 'float64']).columns
    non_num_cols = df.select_dtypes(exclude=['int64', 'float64']).columns

    # 1. Normaliser les colonnes numériques
    scaler = StandardScaler()  # Vous pouvez utiliser MinMaxScaler() si vous préférez
    df[num_cols] = scaler.fit_transform(df[num_cols])

    # 2. Appliquer One-Hot Encoding sur les colonnes non numériques
    df = pd.get_dummies(df, columns=non_num_cols, drop_first=True)

    # 3. Remplacer True par 1 et False par 0 dans toutes les colonnes booléennes
    df = df.applymap(lambda x: 1 if x is True else (0 if x is False else x))

    return df


In [257]:
# Appliquer la fonction sur le DataFrame train_data
X = preprocess_data(X)

# Afficher les premières lignes du DataFrame après transformation
X.head()


  df = df.applymap(lambda x: 1 if x is True else (0 if x is False else x))


Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0.073375,0.212877,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.514104,0.575425,-0.288653,...,0,0,0,0,1,0,0,0,1,0
1,-0.872563,0.645747,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57075,1.171992,-0.288653,...,0,0,0,0,1,0,0,0,1,0
2,0.073375,0.299451,0.07348,0.651479,-0.5172,0.984752,0.830215,0.325915,0.092907,-0.288653,...,0,0,0,0,1,0,0,0,1,0
3,0.309859,0.068587,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.57075,-0.499274,-0.288653,...,0,0,0,0,1,0,0,0,0,0
4,0.073375,0.761179,0.375148,1.374795,-0.5172,0.951632,0.733308,1.366489,0.463568,-0.288653,...,0,0,0,0,1,0,0,0,1,0


In [258]:
test_data = preprocess_data(test_data)

test_data.head()

  df = df.applymap(lambda x: 1 if x is True else (0 if x is False else x))


Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_None,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,-0.874711,0.684849,0.363929,-0.751101,0.400766,-0.340945,-1.072885,-0.563316,0.063936,0.517537,...,0,0,0,0,1,0,0,0,1,0
1,-0.874711,0.715852,0.897861,-0.054877,0.400766,-0.439695,-1.214908,0.047057,1.063714,-0.297689,...,0,0,0,0,1,0,0,0,1,0
2,0.061351,0.498831,0.809646,-0.751101,-0.497418,0.844059,0.678742,-0.563316,0.773668,-0.297689,...,0,0,0,0,1,0,0,0,1,0
3,0.061351,0.622843,0.032064,-0.054877,0.400766,0.876976,0.678742,-0.450284,0.358376,-0.297689,...,0,0,0,0,1,0,0,0,1,0
4,1.465443,-0.462261,-0.971808,1.337571,-0.497418,0.679475,0.394694,-0.563316,-0.386513,-0.297689,...,0,0,0,0,1,0,0,0,1,0


In [259]:

def align_columns(df1, df2):
    # Trouver les colonnes qui sont dans df1 mais pas dans df2
    missing_in_df2 = df1.columns.difference(df2.columns)
    # Trouver les colonnes qui sont dans df2 mais pas dans df1
    missing_in_df1 = df2.columns.difference(df1.columns)
    
    # Ajouter les colonnes manquantes dans df2, initialisées à 0
    for col in missing_in_df2:
        df2[col] = 0
    
    # Ajouter les colonnes manquantes dans df1, initialisées à 0
    for col in missing_in_df1:
        df1[col] = 0
    
    # Réorganiser les colonnes pour que df1 et df2 aient les mêmes colonnes dans le même ordre
    df2 = df2[df1.columns]
    
    return df1, df2


In [260]:

# Exemple d'utilisation avec train_data et test_data (ou df1 et df2)
train_data, test_data = align_columns(X, test_data)

In [261]:
X.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,Exterior1st_None,Exterior2nd_None,Functional_None,KitchenQual_None,MSZoning_None,SaleType_None,Utilities_None
0,0.073375,0.212877,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.514104,0.575425,-0.288653,...,0,1,0,0,0,0,0,0,0,0
1,-0.872563,0.645747,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57075,1.171992,-0.288653,...,0,1,0,0,0,0,0,0,0,0
2,0.073375,0.299451,0.07348,0.651479,-0.5172,0.984752,0.830215,0.325915,0.092907,-0.288653,...,0,1,0,0,0,0,0,0,0,0
3,0.309859,0.068587,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.57075,-0.499274,-0.288653,...,0,0,0,0,0,0,0,0,0,0
4,0.073375,0.761179,0.375148,1.374795,-0.5172,0.951632,0.733308,1.366489,0.463568,-0.288653,...,0,1,0,0,0,0,0,0,0,0


In [262]:
test_data.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,Exterior1st_None,Exterior2nd_None,Functional_None,KitchenQual_None,MSZoning_None,SaleType_None,Utilities_None
0,-0.874711,0.684849,0.363929,-0.751101,0.400766,-0.340945,-1.072885,-0.563316,0.063936,0.517537,...,0,1,0,0,0,0,0,0,0,0
1,-0.874711,0.715852,0.897861,-0.054877,0.400766,-0.439695,-1.214908,0.047057,1.063714,-0.297689,...,0,1,0,0,0,0,0,0,0,0
2,0.061351,0.498831,0.809646,-0.751101,-0.497418,0.844059,0.678742,-0.563316,0.773668,-0.297689,...,0,1,0,0,0,0,0,0,0,0
3,0.061351,0.622843,0.032064,-0.054877,0.400766,0.876976,0.678742,-0.450284,0.358376,-0.297689,...,0,1,0,0,0,0,0,0,0,0
4,1.465443,-0.462261,-0.971808,1.337571,-0.497418,0.679475,0.394694,-0.563316,-0.386513,-0.297689,...,0,1,0,0,0,0,0,0,0,0


In [263]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [264]:
model = LinearRegression()
model.fit(X_train, y_train)

In [265]:
y_pred = model.predict(X_test)


In [266]:
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # Racine carrée de l'erreur quadratique moyenne

In [267]:

print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

Mean Squared Error (MSE): 1.5959072141762173e+27
Root Mean Squared Error (RMSE): 39948807418697.96


In [268]:
predictions = model.predict(test_data)

In [269]:
predictions

array([ 8.68398260e+14, -1.95888568e+14,  1.36462498e+14, ...,
        6.24567051e+13,  4.68571135e+14, -2.07020449e+14])