In [2]:
import pandas as pd

In [4]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [6]:
test["SalePrice"] = None

In [8]:
combined = pd.concat([train, test], axis=0).reset_index(drop=True)

In [10]:
combined.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [12]:
combined.isnull().sum()

Id                  0
MSSubClass          0
MSZoning            4
LotFrontage       486
LotArea             0
                 ... 
MoSold              0
YrSold              0
SaleType            1
SaleCondition       0
SalePrice        1459
Length: 81, dtype: int64

In [14]:
combined.dtypes.value_counts()

object     44
int64      26
float64    11
Name: count, dtype: int64

In [29]:
missing_counts = combined.isnull().sum()
missing_counts[missing_counts > 0].sort_values(ascending=False)

SalePrice       1459
LotFrontage      486
GarageYrBlt      159
MasVnrArea        23
MSZoning           4
BsmtFullBath       2
BsmtHalfBath       2
Functional         2
Utilities          2
Exterior2nd        1
BsmtFinSF1         1
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
Exterior1st        1
KitchenQual        1
GarageCars         1
GarageArea         1
SaleType           1
Electrical         1
dtype: int64

In [31]:
none_fill_cols = [
    'PoolQC',       # Havuz kalitesi (havuz yoksa boş)
    'MiscFeature',  # Ekstra özellik (yoksa boş)
    'Alley',        # Arka sokak girişi (olmayan ev çok)
    'Fence',        # Çit (yoksa boş)
    'FireplaceQu',  # Şömine kalitesi (şömine yoksa boş)
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',  # Garaj özellikleri
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',  # Bodrum özellikleri
    'MasVnrType',   # Taş kaplama türü
]

for col in none_fill_cols:
    combined[col] = combined[col].fillna("None")

In [33]:
zero_fill_cols = [
    'MasVnrArea',
    'GarageYrBlt',
    'GarageArea',
    'GarageCars',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF'
]

for col in zero_fill_cols:
    combined[col] = combined[col].fillna(0)

In [37]:
mode_fill_cols = ['Electrical', 'KitchenQual', 'Exterior1st', 'Exterior2nd', 'SaleType', 'Functional', 'Utilities']

for col in mode_fill_cols:
    mode = combined[col].mode()[0] 
    combined[col] = combined[col].fillna(mode)

In [39]:
combined["LotFrontage"] = combined.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median())
)

In [49]:
combined.isnull().sum().sort_values(ascending=False).head()

SalePrice       1459
MSZoning           4
BsmtFullBath       2
BsmtHalfBath       2
KitchenAbvGr       0
dtype: int64

In [51]:
combined.dtypes.value_counts()

object     44
int64      26
float64    11
Name: count, dtype: int64

In [53]:
combined["MSZoning"] = combined["MSZoning"].fillna(combined["MSZoning"].mode()[0])

In [55]:
combined["BsmtFullBath"] = combined["BsmtFullBath"].fillna(0)
combined["BsmtHalfBath"] = combined["BsmtHalfBath"].fillna(0)

In [57]:
combined.isnull().sum().sort_values(ascending=False).head()

SalePrice      1459
CentralAir        0
GarageYrBlt       0
GarageType        0
FireplaceQu       0
dtype: int64

In [59]:
combined["TotalBathrooms"] = (
    combined["FullBath"] +
    (0.5 * combined["HalfBath"]) +
    combined["BsmtFullBath"] +
    (0.5 * combined["BsmtHalfBath"])
)

In [61]:
combined["TotalSF"] = (
    combined["TotalBsmtSF"] +
    combined["1stFlrSF"] +
    combined["2ndFlrSF"]
)

In [63]:
combined["HasGarage"] = combined["GarageArea"].apply(lambda x: 1 if x > 0 else 0)

In [65]:
combined["IsRemodeled"] = (combined["YearBuilt"] != combined["YearRemodAdd"]).astype(int)

In [67]:
combined["AgeOfHouse"] = combined["YrSold"] - combined["YearBuilt"]

In [69]:
combined[["TotalBathrooms", "TotalSF", "HasGarage", "IsRemodeled", "AgeOfHouse"]].head()

Unnamed: 0,TotalBathrooms,TotalSF,HasGarage,IsRemodeled,AgeOfHouse
0,3.5,2566.0,1,0,5
1,2.5,2524.0,1,0,31
2,3.5,2706.0,1,1,7
3,2.0,2473.0,1,1,91
4,3.5,3343.0,1,0,8


In [75]:
train_data = combined[combined["SalePrice"].notnull()]
combined["SalePrice"] = pd.to_numeric(combined["SalePrice"])

In [79]:
train_data = combined[combined["SalePrice"].notnull()]
correlation_matrix = train_data.corr(numeric_only=True)
correlation_with_price = correlation_matrix["SalePrice"].sort_values(ascending=False)
correlation_with_price.head(15)

SalePrice         1.000000
OverallQual       0.790982
TotalSF           0.782260
GrLivArea         0.708624
GarageCars        0.640409
TotalBathrooms    0.631731
GarageArea        0.623431
TotalBsmtSF       0.613581
1stFlrSF          0.605852
FullBath          0.560664
TotRmsAbvGrd      0.533723
YearBuilt         0.522897
YearRemodAdd      0.507101
MasVnrArea        0.472614
Fireplaces        0.466929
Name: SalePrice, dtype: float64

In [81]:
cat_cols = combined.select_dtypes(include="object").columns.tolist()

In [83]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for col in cat_cols:
    if combined[col].nunique() <= 10:
        combined[col] = le.fit_transform(combined[col].astype(str))

In [85]:
combined = pd.get_dummies(combined, columns=[col for col in cat_cols if combined[col].nunique() > 10])

In [87]:
combined.dtypes.value_counts()

int64      69
bool       56
float64    14
Name: count, dtype: int64

In [89]:
train_data = combined[combined["SalePrice"].notnull()].copy()
test_data = combined[combined["SalePrice"].isnull()].drop("SalePrice", axis=1).copy()

In [91]:
X = train_data.drop("SalePrice", axis=1)
y = train_data["SalePrice"]

In [93]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [99]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

In [100]:
y_pred_rf = rf_model.predict(X_valid)

mae_rf = mean_absolute_error(y_valid, y_pred_rf)
mse_rf = mean_squared_error(y_valid, y_pred_rf)
r2_rf = r2_score(y_valid, y_pred_rf)


In [101]:
mae_rf, mse_rf, r2_rf

(17898.43065068493, 907789729.6915044, 0.8816491365333456)

In [105]:
from sklearn.ensemble import GradientBoostingRegressor

gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)

In [107]:
y_pred_gb = gb_model.predict(X_valid)

mae_gb = mean_absolute_error(y_valid, y_pred_gb)
mse_gb = mean_squared_error(y_valid, y_pred_gb)
r2_gb = r2_score(y_valid, y_pred_gb)

In [109]:
mae_gb, mse_gb, r2_gb

(16450.313131726773, 753064237.0279987, 0.9018210938248787)

In [111]:
final_predictions = gb_model.predict(test_data)

In [113]:
original_test = pd.read_csv("test.csv")

submission = pd.DataFrame({
    "Id": original_test["Id"],
    "SalePrice": final_predictions
})

In [115]:
submission.to_csv("submission.csv", index=False)