In [1]:
#Kütüphane import etme
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.preprocessing import StandardScaler

In [2]:
#Dataset import etme ve boşluklarından ayırma
train_data=pd.read_csv("./train.csv",na_values=["NA", "N/A", "None", "-", " "])
test_data=pd.read_csv("./test.csv",na_values=["NA", "N/A", "None", "-", " "])
submission_data=pd.read_csv("./submission.csv")
train_data.columns = train_data.columns.str.strip()
test_data.columns = test_data.columns.str.strip()

In [3]:
print(train_data.head())
print(train_data.info())

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [4]:
# "LotFrontage" sütunundaki sayısal olmayan değerleri NaN olarak değiştirdim
train_data['LotFrontage'] = pd.to_numeric(train_data['LotFrontage'], errors='coerce')
test_data['LotFrontage'] = pd.to_numeric(test_data['LotFrontage'], errors='coerce')

imputer_mean = SimpleImputer(strategy="mean")  # Eksik değerleri ortalama ile doldur
imputer_most_frequent=SimpleImputer(strategy="most_frequent") #Eksik değerleri en çok tekrar edenle doldur
imputer_constant=SimpleImputer(strategy="constant",fill_value="Unknown") #Eksik değerleri Unknown ile doldur


# Imputer işlemi
train_data['LotFrontage'] = imputer_mean.fit_transform(train_data[['LotFrontage']])
test_data['LotFrontage'] = imputer_mean.transform(test_data[['LotFrontage']])

train_data['MasVnrArea'] = imputer_most_frequent.fit_transform(train_data[['MasVnrArea']])
test_data['MasVnrArea'] = imputer_most_frequent.transform(test_data[['MasVnrArea']])

train_data['FireplaceQu'] = imputer_constant.fit_transform(train_data[['FireplaceQu']]).flatten()
test_data['FireplaceQu'] = imputer_constant.fit_transform(test_data[['FireplaceQu']]).flatten()

train_data['BsmtQual'] = imputer_most_frequent.fit_transform(train_data[['BsmtQual']]).flatten()
test_data['BsmtQual'] = imputer_most_frequent.transform(test_data[['BsmtQual']]).flatten()

train_data['BsmtCond'] = imputer_most_frequent.fit_transform(train_data[['BsmtCond']]).flatten()
test_data['BsmtCond'] =imputer_most_frequent.transform(test_data[['BsmtCond']]).flatten()

train_data['BsmtExposure'] = imputer_most_frequent.fit_transform(train_data[['BsmtExposure']]).flatten()
test_data['BsmtExposure'] = imputer_most_frequent.transform(test_data[['BsmtExposure']]).flatten()

train_data['BsmtFinType1'] = imputer_most_frequent.fit_transform(train_data[['BsmtFinType1']]).flatten()
test_data['BsmtFinType1'] = imputer_most_frequent.transform(test_data[['BsmtFinType1']]).flatten()

train_data['BsmtFinType2'] = imputer_most_frequent.fit_transform(train_data[['BsmtFinType2']]).flatten()
test_data['BsmtFinType2'] = imputer_most_frequent.transform(test_data[['BsmtFinType2']]).flatten()

train_data['Electrical'] = imputer_most_frequent.fit_transform(train_data[['Electrical']]).flatten()
test_data['Electrical'] = imputer_most_frequent.transform(test_data[['Electrical']]).flatten()

train_data['GarageQual'] = imputer_most_frequent.fit_transform(train_data[['GarageQual']]).flatten()
test_data['GarageQual'] = imputer_most_frequent.transform(test_data[['GarageQual']]).flatten()

train_data['GarageCond'] = imputer_most_frequent.fit_transform(train_data[['GarageCond']]).flatten()
test_data['GarageCond'] = imputer_most_frequent.transform(test_data[['GarageCond']]).flatten()

train_data['GarageType'] = imputer_most_frequent.fit_transform(train_data[['GarageType']]).flatten()
test_data['GarageType'] = imputer_most_frequent.transform(test_data[['GarageType']]).flatten()

train_data['GarageYrBlt'] = imputer_most_frequent.fit_transform(train_data[['GarageYrBlt']]).flatten()
test_data['GarageYrBlt'] = imputer_most_frequent.transform(test_data[['GarageYrBlt']]).flatten()

train_data['GarageFinish'] = imputer_most_frequent.fit_transform(train_data[['GarageFinish']]).flatten()
test_data['GarageFinish'] = imputer_most_frequent.transform(test_data[['GarageFinish']]).flatten()

columns_to_impute = [
    'MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 
    'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'Functional', 
    'GarageCars', 'GarageArea', 'SaleType' ]
test_data[columns_to_impute] = imputer_most_frequent.fit_transform(test_data[columns_to_impute])

In [5]:
#Çok fazla Na değerleri olan sütünları atma işlemi
train_data = train_data.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature','MasVnrType'], axis=1)
test_data = test_data.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature','MasVnrType'], axis=1)

In [6]:
print(train_data)

        Id  MSSubClass MSZoning  LotFrontage  LotArea Street LotShape  \
0        1          60       RL         65.0     8450   Pave      Reg   
1        2          20       RL         80.0     9600   Pave      Reg   
2        3          60       RL         68.0    11250   Pave      IR1   
3        4          70       RL         60.0     9550   Pave      IR1   
4        5          60       RL         84.0    14260   Pave      IR1   
...    ...         ...      ...          ...      ...    ...      ...   
1455  1456          60       RL         62.0     7917   Pave      Reg   
1456  1457          20       RL         85.0    13175   Pave      Reg   
1457  1458          70       RL         66.0     9042   Pave      Reg   
1458  1459          20       RL         68.0     9717   Pave      Reg   
1459  1460          20       RL         75.0     9937   Pave      Reg   

     LandContour Utilities LotConfig  ... EnclosedPorch 3SsnPorch ScreenPorch  \
0            Lvl    AllPub    Inside  ... 

In [7]:
print(test_data)

        Id  MSSubClass MSZoning  LotFrontage  LotArea Street LotShape  \
0     1461          20       RH         80.0    11622   Pave      Reg   
1     1462          20       RL         81.0    14267   Pave      IR1   
2     1463          60       RL         74.0    13830   Pave      IR1   
3     1464          60       RL         78.0     9978   Pave      IR1   
4     1465         120       RL         43.0     5005   Pave      IR1   
...    ...         ...      ...          ...      ...    ...      ...   
1454  2915         160       RM         21.0     1936   Pave      Reg   
1455  2916         160       RM         21.0     1894   Pave      Reg   
1456  2917          20       RL        160.0    20000   Pave      Reg   
1457  2918          85       RL         62.0    10441   Pave      Reg   
1458  2919          60       RL         74.0     9627   Pave      Reg   

     LandContour Utilities LotConfig  ... OpenPorchSF EnclosedPorch 3SsnPorch  \
0            Lvl    AllPub    Inside  ... 

In [8]:
#Categorical olan datalar için Label Encoder işlemini uyguladım
categorical_columns = train_data.select_dtypes(include=['object']).columns
label_encoder = LabelEncoder()
for col in categorical_columns:
    train_data[col] = label_encoder.fit_transform(train_data[col])
    test_data[col] = label_encoder.transform(test_data[col])
print("Encoded train_data sample:\n", train_data.head())
print("\nEncoded test_data sample:\n", test_data.head())

Encoded train_data sample:
    Id  MSSubClass  MSZoning  LotFrontage  LotArea  Street  LotShape  \
0   1          60         3         65.0     8450       1         3   
1   2          20         3         80.0     9600       1         3   
2   3          60         3         68.0    11250       1         0   
3   4          70         3         60.0     9550       1         0   
4   5          60         3         84.0    14260       1         0   

   LandContour  Utilities  LotConfig  ...  EnclosedPorch  3SsnPorch  \
0            3          0          4  ...              0          0   
1            3          0          2  ...              0          0   
2            3          0          4  ...              0          0   
3            3          0          0  ...            272          0   
4            3          0          2  ...              0          0   

   ScreenPorch  PoolArea  MiscVal  MoSold  YrSold  SaleType  SaleCondition  \
0            0         0        0       

In [9]:
#X_train, X_test, y_train , y_test kısımlarını oluşturdum

#Train.csv dosyasından X_train ve y_train olarak ayırdım
X_train=train_data.iloc[:,:-1]
y_train=train_data.iloc[:,-1]

#Test.csv dosyasından X_test,Submission.csv dosyasından y_test oluşturdum
X_test=test_data.iloc[:,:]
y_test=submission_data.iloc[:,-1]


In [10]:
#Lineer regresyon uyguladım
regressor=LinearRegression()
regressor.fit(X_train,y_train)

In [11]:
y_pred=regressor.predict(X_test)

#Modelin ne kadar tutarlı olduğunu ölçmek için aşağıdaki test işlemlerini yaptım
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
r2= r2_score(y_test,y_pred)
print(f"R2 score: {r2}")

# Test veri setinde tahmin
test_predictions = regressor.predict(test_data)
output = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': test_predictions})
print(output)


Mean Squared Error: 526757381.8216281
R2 score: 0.9134990272036315
        Id      SalePrice
0     1461  103707.384710
1     1462  153021.050781
2     1463  164926.834041
3     1464  185639.369305
4     1465  187631.967543
...    ...            ...
1454  2915   64155.470036
1455  2916   54604.026501
1456  2917  139694.673103
1457  2918  112745.507320
1458  2919  244291.248148

[1459 rows x 2 columns]
