In [66]:
import pandas as pd
import numpy as np

# Location of the raw home-price CSV inside the project's GitHub repository.
url = "https://raw.githubusercontent.com/SudeKaraca228/turkiye-house-price-prediction/refs/heads/main/data/raw/home_price.csv"

# Read the CSV straight from the URL into a DataFrame.
df = pd.read_csv(filepath_or_buffer=url)

# Show the (pandas-truncated) text representation of the loaded frame.
print(df)



       Net_Metrekare  Brüt_Metrekare  Oda_Sayısı      Bulunduğu_Kat  \
0                120           150.0         4.0              4.Kat   
1                100           125.0         4.0              3.Kat   
2                 89            95.0         3.0              4.Kat   
3                 40            55.0         2.0              6.Kat   
4                140           150.0         4.0  Düz Giriş (Zemin)   
...              ...             ...         ...                ...   
20321            130           140.0         4.0              4.Kat   
20322             95           115.0         3.0              2.Kat   
20323            125           155.0         4.0              1.Kat   
20324             50            70.0         2.0       Yüksek Giriş   
20325            110           130.0         3.0              2.Kat   

      Eşya_Durumu Binanın_Yaşı     Isıtma_Tipi      Fiyat  Şehir  \
0          Eşyalı  21 Ve Üzeri  Kombi Doğalgaz   950000.0  adana   
1          

# Data Cleaning

## Handling Outliers and Duplicate Rows

In [67]:
import matplotlib.pyplot as plt

# Column groups used by this cell and by the later preprocessing cells.
numerical_columns = ["Net_Metrekare", "Brüt_Metrekare", "Oda_Sayısı", "Fiyat", "Binanın_Kat_Sayısı", "Banyo_Sayısı"]
categorial_columns = ["Bulunduğu_Kat","Eşya_Durumu","Binanın_Yaşı","Isıtma_Tipi","Şehir","Kullanım_Durumu","Yatırıma_Uygunluk","Takas"]

# Hard upper bounds: rows at or above these values are discarded outright.
MAX_AREA = 10000
MAX_PRICE = 7000000000

# Remove exact duplicate rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')

# Keep only rows below the hard bounds.
# NOTE: a comparison against NaN evaluates to False, so rows with a missing
# value in any of these three columns are removed here as well.
df = df[
    (df["Net_Metrekare"] < MAX_AREA)
    & (df["Brüt_Metrekare"] < MAX_AREA)
    & (df["Fiyat"] < MAX_PRICE)
]

# Report IQR-based outliers per numerical column. Outliers are only
# visualised and counted here; no value is removed or clipped.
for col in numerical_columns:
    # Give every column its own figure: DataFrame.boxplot without an explicit
    # `ax` draws on the current axes, so repeated calls in a loop would stack
    # all boxplots on top of each other in a single plot.
    fig, ax = plt.subplots()
    df.boxplot(column=col, ax=ax)
    ax.set_title(f"Boxplot of {col}")
    plt.show()

    # Tukey fences: values beyond 1.5 * IQR from the quartiles are outliers.
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    limit_1 = q1 - 1.5 * iqr
    limit_2 = q3 + 1.5 * iqr

    # Vectorised count; NaN compares False on both sides, so missing values
    # are not counted as outliers.
    counter = int(((df[col] < limit_1) | (df[col] > limit_2)).sum())

    print(f"The number of outliers in {col} : {counter}")

     



The number of outliers in Net_Metrekare : 975
The number of outliers in Brüt_Metrekare : 1261
The number of outliers in Oda_Sayısı : 1072
The number of outliers in Fiyat : 1816
The number of outliers in Binanın_Kat_Sayısı : 1793
The number of outliers in Banyo_Sayısı : 459


## Imputation

In [68]:
from sklearn.preprocessing import MinMaxScaler , OrdinalEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Tapu_Durumu is dropped because the column is about half empty.
df = df.drop(columns="Tapu_Durumu")

# The encoder is kept outside the pipeline because the frame mixes categorical
# and numerical columns. It is fitted on the full frame (before the split),
# which leaks the category vocabulary of the test rows into training; this is
# accepted here as a minor leak. Missing categories stay NaN so that the
# imputer below can fill them.
oe = OrdinalEncoder(encoded_missing_value=np.nan)
df[categorial_columns] = oe.fit_transform(df[categorial_columns])

# The split is only used to fit the scaler/imputer on the training portion.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=44)

preprocessing_steps = [
    ('scaler', MinMaxScaler()),
    # min_value/max_value bound the *imputed* values to [0, 1], i.e. to the
    # range seen in the training data after min-max scaling. Without these
    # bounds an imputed categorical code can round to a value outside the
    # encoder's valid codes, which makes OrdinalEncoder.inverse_transform pick
    # a wrong category (negative index), fail (index too large) or decode to
    # NaN. Observed (non-missing) values are not affected by these bounds.
    ('imputer', IterativeImputer(estimator=BayesianRidge(), max_iter=10, random_state=0,
                                 min_value=0, max_value=1)),
]

preprocessing_pipeline = Pipeline(preprocessing_steps)
preprocessing_pipeline.set_output(transform='pandas')

preprocessing_pipeline.fit(df_train)

# Scale + impute every row (train and test) with the train-fitted pipeline.
df = preprocessing_pipeline.transform(df)

# Undo the min-max scaling so the values are back in their original units.
minmax_pipeline = preprocessing_pipeline.named_steps['scaler']
df_array = minmax_pipeline.inverse_transform(df)

# Round to whole numbers: categorical codes must be integers for decoding.
# An explicit int64 is used because plain `int` can map to a 32-bit integer
# on some platforms, which cannot hold prices in the billions.
df = pd.DataFrame(data=df_array, columns=df.columns, index=df.index).round().astype("int64")

# Turn the integer codes back into the original category labels.
df[categorial_columns] = oe.inverse_transform(df[categorial_columns])

# Safety net: drop any row that still contains a missing value after decoding.
df = df.dropna()

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly instead of being wrapped in print() (which would also print "None").
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20109 entries, 0 to 20325
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Net_Metrekare       20109 non-null  int64 
 1   Brüt_Metrekare      20109 non-null  int64 
 2   Oda_Sayısı          20109 non-null  int64 
 3   Bulunduğu_Kat       20109 non-null  object
 4   Eşya_Durumu         20109 non-null  object
 5   Binanın_Yaşı        20109 non-null  object
 6   Isıtma_Tipi         20109 non-null  object
 7   Fiyat               20109 non-null  int64 
 8   Şehir               20109 non-null  object
 9   Binanın_Kat_Sayısı  20109 non-null  int64 
 10  Kullanım_Durumu     20109 non-null  object
 11  Yatırıma_Uygunluk   20109 non-null  object
 12  Takas               20109 non-null  object
 13  Banyo_Sayısı        20109 non-null  int64 
dtypes: int64(6), object(8)
memory usage: 2.3+ MB
None
