In [138]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

import shap

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.compose import ColumnTransformer

In [139]:
df = pd.read_csv('../house_prices_selection.csv')
print(df.shape)
df.head()


(1460, 32)


Unnamed: 0,EVI,ZoningScore,MSZoning_RL,LotConfig_CulDSac,LandContour_HLS,LotShape_IR2,Condition1_Norm,Condition1_Feedr,LotShape_Reg,MSZoning_RM,...,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,GrLivArea,TotalBsmtSF,1stFlrSF,BsmtQual,CentralAir,Foundation
0,0.43753,0.835044,True,False,False,False,True,False,True,False,...,7,2003,2003,196.0,1710,856,856,Gd,Y,PConc
1,0.364741,0.835044,True,False,False,False,False,True,True,False,...,6,1976,1976,0.0,1262,1262,1262,Gd,Y,CBlock
2,0.411955,0.835044,True,False,False,False,True,False,False,False,...,7,2001,2002,162.0,1786,920,920,Gd,Y,PConc
3,0.394783,0.835044,True,False,False,False,True,False,False,False,...,7,1915,1970,0.0,1717,756,961,TA,Y,BrkTil
4,0.399533,0.835044,True,False,False,False,True,False,False,False,...,8,2000,2000,350.0,2198,1145,1145,Gd,Y,PConc


In [140]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   EVI                1460 non-null   float64
 1   ZoningScore        1460 non-null   float64
 2   MSZoning_RL        1460 non-null   bool   
 3   LotConfig_CulDSac  1460 non-null   bool   
 4   LandContour_HLS    1460 non-null   bool   
 5   LotShape_IR2       1460 non-null   bool   
 6   Condition1_Norm    1460 non-null   bool   
 7   Condition1_Feedr   1460 non-null   bool   
 8   LotShape_Reg       1460 non-null   bool   
 9   MSZoning_RM        1460 non-null   bool   
 10  Price_per_m^2      1460 non-null   float64
 11  PDI                1460 non-null   float64
 12  SalePrice          1460 non-null   int64  
 13  LotFrontage        1201 non-null   float64
 14  LotArea            1460 non-null   int64  
 15  WoodDeckSF         1460 non-null   int64  
 16  OpenPorchSF        1460 

In [141]:
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = df.select_dtypes(include=['object']).columns.tolist()
boolean_features = df.select_dtypes(include=['bool']).columns.tolist()

print(f"Coloane numerice ({len(numerical_features)}): {numerical_features}")
print(f"\nColoane categorice ({len(categorical_features)}): {categorical_features}")   
print(f"\nColoane booleene ({len(boolean_features)}): {boolean_features}")

Coloane numerice (19): ['EVI', 'ZoningScore', 'Price_per_m^2', 'PDI', 'SalePrice', 'LotFrontage', 'LotArea', 'WoodDeckSF', 'OpenPorchSF', 'FullBath', 'GarageCars', 'GarageArea', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'GrLivArea', 'TotalBsmtSF', '1stFlrSF']

Coloane categorice (5): ['KitchenQual', 'PavedDrive', 'BsmtQual', 'CentralAir', 'Foundation']

Coloane booleene (8): ['MSZoning_RL', 'LotConfig_CulDSac', 'LandContour_HLS', 'LotShape_IR2', 'Condition1_Norm', 'Condition1_Feedr', 'LotShape_Reg', 'MSZoning_RM']


In [142]:
numerical_features.remove('SalePrice') 
print(f"Coloane numerice ({len(numerical_features)}): {numerical_features}")
print(f"\nColoane categorice ({len(categorical_features)}): {categorical_features}")

Coloane numerice (18): ['EVI', 'ZoningScore', 'Price_per_m^2', 'PDI', 'LotFrontage', 'LotArea', 'WoodDeckSF', 'OpenPorchSF', 'FullBath', 'GarageCars', 'GarageArea', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'GrLivArea', 'TotalBsmtSF', '1stFlrSF']

Coloane categorice (5): ['KitchenQual', 'PavedDrive', 'BsmtQual', 'CentralAir', 'Foundation']


In [143]:
missing_values = df.isnull().sum()
missing_percent = (missing_values / len(df)) * 100
missing_df = pd.DataFrame({
    'Column': missing_values.index,
    'Missing_Count': missing_values.values,
    'Percentage': missing_percent.values
})
missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)
print(f"\nValori lipsă în dataset:")
print(missing_df)


Valori lipsă în dataset:
         Column  Missing_Count  Percentage
13  LotFrontage            259   17.739726
29     BsmtQual             37    2.534247
25   MasVnrArea              8    0.547945


In [144]:
df_test = pd.read_csv('../data/Cornescu_Darius_selected_features.csv')
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   SalePrice          1460 non-null   int64  
 1   LotFrontage        1460 non-null   float64
 2   ZoningScore        1460 non-null   float64
 3   EVI                1460 non-null   float64
 4   Price_per_m^2      1460 non-null   float64
 5   LotArea            1460 non-null   int64  
 6   PDI                1460 non-null   float64
 7   MSZoning_RL        1460 non-null   bool   
 8   LotConfig_CulDSac  1460 non-null   bool   
 9   LotShape_IR2       1460 non-null   bool   
 10  LandContour_HLS    1460 non-null   bool   
 11  Condition1_Norm    1460 non-null   bool   
 12  Condition1_Feedr   1460 non-null   bool   
 13  LotShape_Reg       1460 non-null   bool   
 14  MSZoning_RM        1460 non-null   bool   
dtypes: bool(8), float64(5), int64(2)
memory usage: 91.4 KB


In [145]:
missing_values = df_test.isnull().sum()
missing_percent = (missing_values / len(df_test)) * 100
missing_df = pd.DataFrame({
    'Column': missing_values.index,
    'Missing_Count': missing_values.values,
    'Percentage': missing_percent.values
})
missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)
print(f"\nValori lipsă în dataset:")
print(missing_df)


Valori lipsă în dataset:
Empty DataFrame
Columns: [Column, Missing_Count, Percentage]
Index: []


In [146]:
for cols in boolean_features:
    df[cols] = df[cols].astype(int)

numerical_features.extend(boolean_features)
print(f"Coloane numerice (inclusiv boolean) ({len(numerical_features)}): {numerical_features}")
print(f"\nColoane categorice ({len(categorical_features)}): {categorical_features}")

Coloane numerice (inclusiv boolean) (26): ['EVI', 'ZoningScore', 'Price_per_m^2', 'PDI', 'LotFrontage', 'LotArea', 'WoodDeckSF', 'OpenPorchSF', 'FullBath', 'GarageCars', 'GarageArea', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'GrLivArea', 'TotalBsmtSF', '1stFlrSF', 'MSZoning_RL', 'LotConfig_CulDSac', 'LandContour_HLS', 'LotShape_IR2', 'Condition1_Norm', 'Condition1_Feedr', 'LotShape_Reg', 'MSZoning_RM']

Coloane categorice (5): ['KitchenQual', 'PavedDrive', 'BsmtQual', 'CentralAir', 'Foundation']


In [147]:
X = df.drop('SalePrice', axis=1)
Y = df['SalePrice']
print(f"Features shape: {X.shape}")
print(f"Target shape: {Y.shape}")

Features shape: (1460, 31)
Target shape: (1460,)


In [148]:
df_clean = df.dropna()
X_clean = df_clean.drop('SalePrice', axis=1)
Y_clean = df_clean['SalePrice']
print(f"\nDataset după eliminarea valorilor lipsă: {df_clean.shape}")
print(f"Rânduri eliminate: {len(df) - len(df_clean)}")


Dataset după eliminarea valorilor lipsă: (1164, 32)
Rânduri eliminate: 296


In [149]:
numeric_features_clean = X_clean.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features_clean = X_clean.select_dtypes(include=['object']).columns.tolist()

In [150]:
preprocessor_dropna = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features_clean),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features_clean)
    ])

pipeline_dropna = Pipeline([
    ('preprocessor', preprocessor_dropna),
    ('regressor', LinearRegression())
])

X_train_dropna, X_test_dropna, Y_train_dropna, Y_test_dropna = train_test_split( X_clean, Y_clean, test_size=0.2, random_state=42)

pipeline_dropna.fit(X_train_dropna, Y_train_dropna)
Y_pred_dropna = pipeline_dropna.predict(X_test_dropna)

mae_dropna = mean_absolute_error(Y_test_dropna, Y_pred_dropna)
rmse_dropna = np.sqrt(mean_squared_error(Y_test_dropna, Y_pred_dropna))
r2_dropna = r2_score(Y_test_dropna, Y_pred_dropna)

print(f"\nREZULTATE ABORDAREA 1 (dropna):")
print(f"MAE: ${mae_dropna:,.2f}")
print(f"RMSE: ${rmse_dropna:,.2f}")
print(f"R² Score: {r2_dropna:.4f}")




REZULTATE ABORDAREA 1 (dropna):
MAE: $21,732.42
RMSE: $35,344.64
R² Score: 0.8358


In [151]:
numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=7)),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

preprocessor_imputer = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

pipeline_imputer = Pipeline([
    ('preprocessor', preprocessor_imputer),
    ('regressor', LinearRegression())
])

X_train_imputer, X_test_imputer, Y_train_imputer, Y_test_imputer = train_test_split( X, Y, test_size=0.2, random_state=42 )

pipeline_imputer.fit(X_train_imputer, Y_train_imputer)
y_pred_imputer = pipeline_imputer.predict(X_test_imputer)

mae_imputer = mean_absolute_error(Y_test_imputer, y_pred_imputer)
rmse_imputer = np.sqrt(mean_squared_error(Y_test_imputer, y_pred_imputer))
r2_imputer = r2_score(Y_test_imputer, y_pred_imputer)

print(f"\nREZULTATE ABORDAREA 2 (SimpleImputer):")
print(f"MAE: ${mae_imputer:,.2f}")
print(f"RMSE: ${rmse_imputer:,.2f}")
print(f"R² Score: {r2_imputer:.4f}")


REZULTATE ABORDAREA 2 (SimpleImputer):
MAE: $21,892.38
RMSE: $34,297.28
R² Score: 0.8466


**Rezultatele celor două abordări sunt apropiate, însă diferă în moduri relevante.** Abordarea 1 (dropna) oferă un MAE puțin mai bun, dar un RMSE mai slab și un scor R² mai mic. În schimb, Abordarea 2 (cu *KNNImputer*) are un MAE ușor mai mare, însă obține cel mai mic RMSE și cel mai ridicat R², indicând o capacitate mai bună de a explica variația datelor și o performanță globală mai stabilă.

**Per ansamblu, metoda cu imputare pare mai potrivită pentru acest set de date**, deoarece păstrează mai multe informații și conduce la un model cu performanță generală superioară.


# Cu o mica modificare la LotFrontage, incerc sa copiez din setul meu de date coloana sa vad daca obtin ceva diferente


In [152]:
df['LotFrontage'] = df_test['LotFrontage']
for cols in boolean_features:
    df[cols] = df[cols].astype(int)

numerical_features.extend(boolean_features)
print(f"Coloane numerice (inclusiv boolean) ({len(numerical_features)}): {numerical_features}")
print(f"\nColoane categorice ({len(categorical_features)}): {categorical_features}")

Coloane numerice (inclusiv boolean) (34): ['EVI', 'ZoningScore', 'Price_per_m^2', 'PDI', 'LotFrontage', 'LotArea', 'WoodDeckSF', 'OpenPorchSF', 'FullBath', 'GarageCars', 'GarageArea', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'GrLivArea', 'TotalBsmtSF', '1stFlrSF', 'MSZoning_RL', 'LotConfig_CulDSac', 'LandContour_HLS', 'LotShape_IR2', 'Condition1_Norm', 'Condition1_Feedr', 'LotShape_Reg', 'MSZoning_RM', 'MSZoning_RL', 'LotConfig_CulDSac', 'LandContour_HLS', 'LotShape_IR2', 'Condition1_Norm', 'Condition1_Feedr', 'LotShape_Reg', 'MSZoning_RM']

Coloane categorice (5): ['KitchenQual', 'PavedDrive', 'BsmtQual', 'CentralAir', 'Foundation']


In [153]:
X = df.drop('SalePrice', axis=1)
Y = df['SalePrice']
print(f"Features shape: {X.shape}")
print(f"Target shape: {Y.shape}")

Features shape: (1460, 31)
Target shape: (1460,)


In [154]:
df_clean = df.dropna()
X_clean = df_clean.drop('SalePrice', axis=1)
Y_clean = df_clean['SalePrice']
print(f"\nDataset după eliminarea valorilor lipsă: {df_clean.shape}")
print(f"Rânduri eliminate: {len(df) - len(df_clean)}")

numeric_features_clean = X_clean.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features_clean = X_clean.select_dtypes(include=['object']).columns.tolist()


Dataset după eliminarea valorilor lipsă: (1415, 32)
Rânduri eliminate: 45


In [155]:
preprocessor_dropna = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features_clean),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features_clean)
    ])

pipeline_dropna = Pipeline([
    ('preprocessor', preprocessor_dropna),
    ('regressor', LinearRegression())
])

X_train_dropna, X_test_dropna, Y_train_dropna, Y_test_dropna = train_test_split( X_clean, Y_clean, test_size=0.2, random_state=42)

pipeline_dropna.fit(X_train_dropna, Y_train_dropna)
Y_pred_dropna = pipeline_dropna.predict(X_test_dropna)

mae_dropna = mean_absolute_error(Y_test_dropna, Y_pred_dropna)
rmse_dropna = np.sqrt(mean_squared_error(Y_test_dropna, Y_pred_dropna))
r2_dropna = r2_score(Y_test_dropna, Y_pred_dropna)

print(f"\nREZULTATE ABORDAREA 1 (dropna):")
print(f"MAE: ${mae_dropna:,.2f}")
print(f"RMSE: ${rmse_dropna:,.2f}")
print(f"R² Score: {r2_dropna:.4f}")




REZULTATE ABORDAREA 1 (dropna):
MAE: $20,366.21
RMSE: $32,530.50
R² Score: 0.8451


In [156]:
numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=7)),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

preprocessor_imputer = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

pipeline_imputer = Pipeline([
    ('preprocessor', preprocessor_imputer),
    ('regressor', LinearRegression())
])

X_train_imputer, X_test_imputer, Y_train_imputer, Y_test_imputer = train_test_split( X, Y, test_size=0.2, random_state=42 )

pipeline_imputer.fit(X_train_imputer, Y_train_imputer)
y_pred_imputer = pipeline_imputer.predict(X_test_imputer)

mae_imputer = mean_absolute_error(Y_test_imputer, y_pred_imputer)
rmse_imputer = np.sqrt(mean_squared_error(Y_test_imputer, y_pred_imputer))
r2_imputer = r2_score(Y_test_imputer, y_pred_imputer)

print(f"\nREZULTATE ABORDAREA 2 (SimpleImputer):")
print(f"MAE: ${mae_imputer:,.2f}")
print(f"RMSE: ${rmse_imputer:,.2f}")
print(f"R² Score: {r2_imputer:.4f}")


REZULTATE ABORDAREA 2 (SimpleImputer):
MAE: $21,900.55
RMSE: $34,308.36
R² Score: 0.8465
