In [28]:
import pandas as pd
import seaborn as sns
import numpy as np 
import matplotlib.pyplot as plt 
import plotly.express as ptx
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, make_scorer

In [3]:
df = pd.read_csv('Cornescu_Darius_new_features.csv')

In [4]:
df.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,SalePrice,LotArea_m^2,Price_per_m^2,ZoningScore,SubClassScore,LotAreaNorm,PDI,Price_per_m^2_norm,EVI
count,1168.0,951.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0
mean,56.849315,70.343849,10689.642123,181441.541952,993.099822,224.816818,0.724731,0.299207,0.043888,0.392821,0.207414,0.380968
std,42.531862,24.897021,10759.366198,77263.583862,999.577398,131.121509,0.191092,0.223852,0.05029,0.090498,0.129599,0.103603
min,20.0,21.0,1300.0,34900.0,120.7739,14.966588,0.0,0.105263,0.0,0.056594,0.0,0.047745
25%,20.0,59.0,7587.25,130000.0,704.878287,148.742572,0.795605,0.105263,0.029387,0.360931,0.132223,0.329611
50%,50.0,70.0,9600.0,165000.0,891.8688,191.302979,0.795605,0.263158,0.038795,0.378316,0.174289,0.370041
75%,70.0,80.0,11700.0,214925.0,1086.9651,253.468459,0.795605,0.368421,0.048611,0.432743,0.235733,0.413595
max,190.0,313.0,215245.0,745000.0,19996.906235,1026.711897,1.0,1.0,1.0,0.84731,1.0,0.90578


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1168 entries, 0 to 1167
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   MSSubClass          1168 non-null   int64  
 1   MSZoning            1168 non-null   object 
 2   LotFrontage         951 non-null    float64
 3   LotArea             1168 non-null   int64  
 4   Street              1168 non-null   object 
 5   Alley               74 non-null     object 
 6   LotShape            1168 non-null   object 
 7   LandContour         1168 non-null   object 
 8   Utilities           1168 non-null   object 
 9   LotConfig           1168 non-null   object 
 10  LandSlope           1168 non-null   object 
 11  Neighborhood        1168 non-null   object 
 12  Condition1          1168 non-null   object 
 13  Condition2          1168 non-null   object 
 14  SalePrice           1168 non-null   int64  
 15  LotArea_m^2         1168 non-null   float64
 16  Price_

In [6]:
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])

LotFrontage     217
Alley          1094
dtype: int64


In [7]:
df= df.drop(columns=['Alley'])
df.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'SalePrice', 'LotArea_m^2',
       'Price_per_m^2', 'ZoningScore', 'SubClassScore', 'LotAreaNorm', 'PDI',
       'Price_per_m^2_norm', 'EVI'],
      dtype='object')

In [8]:
df['LotFrontage'] = df.groupby(['Neighborhood'])['LotFrontage'].transform(
    lambda x: x.fillna(x.mean())
)

In [9]:
df.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,SalePrice,LotArea_m^2,Price_per_m^2,ZoningScore,SubClassScore,LotAreaNorm,PDI,Price_per_m^2_norm,EVI
count,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0
mean,56.849315,71.036142,10689.642123,181441.541952,993.099822,224.816818,0.724731,0.299207,0.043888,0.392821,0.207414,0.380968
std,42.531862,22.871579,10759.366198,77263.583862,999.577398,131.121509,0.191092,0.223852,0.05029,0.090498,0.129599,0.103603
min,20.0,21.0,1300.0,34900.0,120.7739,14.966588,0.0,0.105263,0.0,0.056594,0.0,0.047745
25%,20.0,60.0,7587.25,130000.0,704.878287,148.742572,0.795605,0.105263,0.029387,0.360931,0.132223,0.329611
50%,50.0,70.462865,9600.0,165000.0,891.8688,191.302979,0.795605,0.263158,0.038795,0.378316,0.174289,0.370041
75%,70.0,80.0,11700.0,214925.0,1086.9651,253.468459,0.795605,0.368421,0.048611,0.432743,0.235733,0.413595
max,190.0,313.0,215245.0,745000.0,19996.906235,1026.711897,1.0,1.0,1.0,0.84731,1.0,0.90578


In [10]:
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

if 'SalePrice' in numeric_cols:
    numeric_cols.remove('SalePrice')

print(f"\nVariabile numerice ({len(numeric_cols)}): {numeric_cols[:10]}")
print(f"Variabile categorice ({len(categorical_cols)}): {categorical_cols}")


Variabile numerice (11): ['MSSubClass', 'LotFrontage', 'LotArea', 'LotArea_m^2', 'Price_per_m^2', 'ZoningScore', 'SubClassScore', 'LotAreaNorm', 'PDI', 'Price_per_m^2_norm']
Variabile categorice (10): ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2']


In [11]:
for col in categorical_cols:
    n_unique = df[col].nunique()
    print(f"{col}: {n_unique} categorii unice")

MSZoning: 5 categorii unice
Street: 2 categorii unice
LotShape: 4 categorii unice
LandContour: 4 categorii unice
Utilities: 2 categorii unice
LotConfig: 5 categorii unice
LandSlope: 3 categorii unice
Neighborhood: 25 categorii unice
Condition1: 9 categorii unice
Condition2: 8 categorii unice


In [12]:
max_categories = 10
cols_to_drop= [col for col in categorical_cols if df[col].nunique() > max_categories]
print(f"\nColoane eliminate: {cols_to_drop}")


Coloane eliminate: ['Neighborhood']


In [13]:
df_subset = df.copy()
df_subset = df_subset.drop(columns=cols_to_drop)
categorical_cols = [col for col in categorical_cols if col not in cols_to_drop]
print(df_subset.columns)

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Condition1', 'Condition2', 'SalePrice', 'LotArea_m^2', 'Price_per_m^2',
       'ZoningScore', 'SubClassScore', 'LotAreaNorm', 'PDI',
       'Price_per_m^2_norm', 'EVI'],
      dtype='object')


In [14]:
df_subset_encoded = pd.get_dummies(df_subset, columns=categorical_cols, drop_first=True)

print(f"Coloane după encoding: {df_subset_encoded.shape[1]}")
print(f"Noi coloane create: {df_subset_encoded.shape[1] - df_subset.shape[1]}")

Coloane după encoding: 45
Noi coloane create: 24


Am ales să folosesc modelul Decision Tree, fiindcă datele nu prezintă o relație liniară clară. Arborii de decizie pot surprinde mai bine legături neliniare și pot oferi rezultate mai bune în astfel de situații.

In [15]:
correlations = df_subset_encoded.corr()['SalePrice'].abs().sort_values(ascending=False)
print(correlations.head(15))

SalePrice             1.000000
LotFrontage           0.322751
ZoningScore           0.310521
EVI                   0.299640
MSZoning_RM           0.279680
LotArea               0.266204
LotArea_m^2           0.266204
LotAreaNorm           0.266204
Price_per_m^2         0.265210
Price_per_m^2_norm    0.265210
LotShape_Reg          0.251103
PDI                   0.241289
MSZoning_RL           0.224214
LotConfig_CulDSac     0.150660
LandContour_HLS       0.144847
Name: SalePrice, dtype: float64


In [16]:
trashhold = 0.1

in_use_features = correlations[correlations > trashhold].index.to_list()
in_use_features.remove('SalePrice')

print(f"\nNumăr de variabile selectate (corelație > {trashhold}): {len(in_use_features)}")
print(f"Variabile selectate: {in_use_features[:10]}")


Număr de variabile selectate (corelație > 0.1): 17
Variabile selectate: ['LotFrontage', 'ZoningScore', 'EVI', 'MSZoning_RM', 'LotArea', 'LotArea_m^2', 'LotAreaNorm', 'Price_per_m^2', 'Price_per_m^2_norm', 'LotShape_Reg']


In [17]:
X = df_subset_encoded[in_use_features]
Y = df_subset_encoded['SalePrice']
print(f"\nDimensiuni finale:")
print(f"  X (features): {X.shape}")
print(f"  y (target): {Y.shape}")


Dimensiuni finale:
  X (features): (1168, 17)
  y (target): (1168,)


In [36]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=21)

In [37]:
decision_tree = DecisionTreeRegressor(max_depth=10, random_state=42, min_samples_split=8)
decision_tree.fit(X_train, Y_train)
Y_predicated = decision_tree.predict(X_test)

r2  = r2_score(Y_test, Y_predicated)
mse = mean_squared_error(Y_test, Y_predicated)
rmse= np.sqrt(mse)
mae = mean_absolute_error(Y_test, Y_predicated)

print(f"R²:   {r2:.3f}")
print(f"MSE:  {mse:,.0f}")
print(f"RMSE: {rmse:,.0f}")
print(f"MAE:  {mae:,.0f}")

R²:   0.856
MSE:  815,740,405
RMSE: 28,561
MAE:  13,717


In [38]:
from sklearn.model_selection import GridSearchCV


param_grid = {
    "criterion": ["squared_error", "friedman_mse", "absolute_error"],
    "max_depth": [None, 3, 5, 7, 10, 12, 15],
    "min_samples_split": [2, 4, 6, 8, 10],
    "min_samples_leaf": [1, 2, 4, 6],
    "splitter": ["best", "random"]
}
scoring = {
    "R2": make_scorer(r2_score),
    "MSE": make_scorer(mean_squared_error, greater_is_better=False),  
    "MAE": make_scorer(mean_absolute_error, greater_is_better=False)  
}

decision_tree = DecisionTreeRegressor(random_state=42)
grid = GridSearchCV(
    estimator=decision_tree,
    param_grid=param_grid,
    scoring=scoring,
    cv=5,
    refit="R2"
)

grid.fit(X_train, Y_train)

print("\nBest parameters found:")
print(grid.best_params_)


Best parameters found:
{'criterion': 'squared_error', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}


In [39]:
best_dt = grid.best_estimator_
y_pred = best_dt.predict(X_test)

r2  = r2_score(Y_test, y_pred)
mse = mean_squared_error(Y_test, y_pred)
mae = mean_absolute_error(Y_test, y_pred)
rmse = np.sqrt(mse)

print(f"\nRezultate model optim:")
print(f"R²:   {r2:.3f}")
print(f"MSE:  {mse:,.0f}")
print(f"RMSE: {rmse:,.0f}")
print(f"MAE:  {mae:,.0f}")


Rezultate model optim:
R²:   0.879
MSE:  683,962,045
RMSE: 26,153
MAE:  12,612


### **7. Interpretarea rezultatelor**

Modelul **Decision Tree** s-a descurcat foarte bine încă de la început, obținând un **R² de 0.856** și o eroare medie de aproximativ **13.700 $**.

După reglarea hiperparametrilor cu **GridSearchCV**, performanța a crescut și mai mult — modelul a ajuns la un **R² de 0.879**, cu erori mai mici (RMSE = 26.000, MAE = 12.600).

Pe scurt, fine-tuning-ul a făcut modelul mai precis și mai stabil în predicții.
Cei mai importanți factori care influențează prețul au fost **LotFrontage**, **ZoningScore**, **EVI**, **LotArea** și **LotShape_Reg** — practic, contează cel mai mult mărimea terenului și zona în care se află proprietatea.
