<a href="https://colab.research.google.com/github/OleksiiLatypov/House_Prices/blob/main/notebook43213f28ae.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'house-prices-advanced-regression-techniques:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F5407%2F868283%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240422%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240422T190223Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D90988575038c55f7fac1a7e73396e9723a67c4375e3455ee290c006fc28e84bed0d61ff2f2f1bc26e7980722fcd40b9cd39bde9817cb11531e17c64c6b499a37c33b09aedaecf1f1b4fbf00b04f9cb9914fdc2ce86d40f89f37525e3a31ec9cc749d43bc1ad633c5e1ef97ca10a5e7cfbfc502299c11043492b31a1873e44541b1b972141e063359da832908b601b24b57290e1b1eb0b7cb344250675f76f1e5bd5dcc0846bc2ec7be521a1d554bc5bd68c557a82370b6d4090580591a1568b93328eec95423b057c50f44ee702ce6ed29dba0c6551aa29e0742e6ec59fb62765ecd5dc02bcf8a485179bef7aa6bef13c19b0f4d8c3bef43d181c7ba15dc7423'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading house-prices-advanced-regression-techniques, 203809 bytes compressed
Downloaded and uncompressed: house-prices-advanced-regression-techniques
Data source import complete.


In [43]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/test.csv
/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/train.csv


In [44]:
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

In [45]:
train.shape

(1460, 81)

In [46]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [None]:
#train = train.drop('Id', axis=1)

In [47]:
to_drop = ['GarageYrBlt','1stFlrSF','GarageArea','TotRmsAbvGrd']
train = train.drop(to_drop, axis=1)
test = test.drop(to_drop, axis=1)

In [48]:
corr=train.select_dtypes('number').corr()

cat_features = train.select_dtypes(exclude='number')
amount = cat_features.isna().sum().sort_values(ascending=False)
correlation_matrix = corr.corr()
correlation_with_target = correlation_matrix['SalePrice'].abs().sort_values(ascending=False)
highly_correlated = [col for col in correlation_with_target.index if abs(correlation_with_target[col]) > 0.5]
highly_correlated

['SalePrice',
 'OverallQual',
 'GarageCars',
 'GrLivArea',
 'TotalBsmtSF',
 'YearBuilt',
 'FullBath',
 'MasVnrArea',
 'YearRemodAdd',
 'Fireplaces',
 'OpenPorchSF',
 'LotFrontage',
 'WoodDeckSF',
 'BsmtFinSF1']

In [49]:
missing = train.isna().sum().sort_values(ascending=False)
percentage = (missing / len(train)) * 100
missing_to_drop = [col for col in percentage.index if percentage[col]>45]
missing_to_drop

['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType', 'FireplaceQu']

In [50]:
train = train[highly_correlated]

In [51]:
highly_correlated_no_saleprice = highly_correlated.remove('SalePrice')

In [52]:
test = test[highly_correlated]

In [53]:
test.columns

Index(['OverallQual', 'GarageCars', 'GrLivArea', 'TotalBsmtSF', 'YearBuilt',
       'FullBath', 'MasVnrArea', 'YearRemodAdd', 'Fireplaces', 'OpenPorchSF',
       'LotFrontage', 'WoodDeckSF', 'BsmtFinSF1'],
      dtype='object')

In [None]:
train.drop(columns='Id',inplace= True)
mask1=train['TotalBsmtSF']<2050
mask2=train['TotalBsmtSF']>100
mask3=train['GrLivArea']<2800
mask4=train['GarageCars']<3.8
mask5=train['OverallQual']>1.8
DF=train[mask1&mask2 &mask3&mask4&mask5]

In [None]:
# x_cat = train.select_dtypes('object')

# # Step 1: Fill Missing Values with Mode
# x_cat_filled = x_cat.fillna(x_cat.mode().iloc[0])

# # Step 2: One-Hot Encoding
# x_cat_encoded = pd.get_dummies(x_cat_filled, dtype='int')

In [60]:
# Select numerical columns from the train dataset
x_num = train.select_dtypes(exclude='object')

# Calculate the mode for each numerical column
mode_values = x_num.mode().iloc[0]

# Fill missing values in each numerical column with its mode value
train = x_num.fillna(mode_values)


In [None]:
#train_final = pd.concat([x_num_filled, x_cat_encoded], axis=1)

In [68]:
X = train.drop('SalePrice', axis=1)
y = train['SalePrice']

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [70]:
X_train.head()

Unnamed: 0,OverallQual,GarageCars,GrLivArea,TotalBsmtSF,YearBuilt,FullBath,MasVnrArea,YearRemodAdd,Fireplaces,OpenPorchSF,LotFrontage,WoodDeckSF,BsmtFinSF1
254,5,1,1314,1314,1957,1,0.0,1957,0,0,70.0,250,922
1066,6,2,1571,799,1993,2,0.0,1994,1,40,59.0,0,0
638,5,0,796,796,1910,1,0.0,1950,0,0,67.0,328,0
799,5,1,1768,731,1937,1,252.0,1950,2,0,60.0,0,569
380,5,1,1691,1026,1924,2,0.0,1950,1,0,50.0,0,218


In [71]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)
lr_prediction = lr.predict(X_test)


In [72]:
print('MAE: ', mean_absolute_error(y_test, lr_prediction))
print('MSE: ', mean_squared_error(y_test, lr_prediction))
print('R2 score', r2_score(y_test, lr_prediction))
print('RMSE:', np.sqrt(mean_squared_error(y_test, lr_prediction)))

MAE:  24288.720708275174
MSE:  1429340942.9110107
R2 score 0.8136531740238488
RMSE: 37806.62564830417


In [73]:
from sklearn.ensemble import RandomForestRegressor
clf_random_tree = RandomForestRegressor(max_depth=30, random_state=1,n_estimators=2000)
clf_random_tree.fit(X_train,y_train)

random_pred = clf_random_tree.predict(X_test)

In [74]:
print('MAE: ', mean_absolute_error(y_test, random_pred))
print('MSE: ', mean_squared_error(y_test, random_pred))
print('R2 score', r2_score(y_test, random_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, random_pred)))

MAE:  18643.697021531854
MSE:  891512890.4351099
R2 score 0.8837711896008075
RMSE: 29858.213115240334


In [None]:
!pip install catboost

In [108]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor


# Define CatBoostRegressor model
clf_catboost = CatBoostRegressor(iterations=1000,
                                 learning_rate=0.1,
                                 depth=4,

                                 l2_leaf_reg=1,
                                 loss_function='RMSE',
                                 random_seed=0)

# Train the model
clf_catboost.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=100)

# Predict on test data
catboost_pred = clf_catboost.predict(X_test)


0:	learn: 72233.1648391	test: 82639.4611847	best: 82639.4611847 (0)	total: 3.26ms	remaining: 3.26s
100:	learn: 19685.5195371	test: 27276.9296107	best: 27276.9296107 (100)	total: 204ms	remaining: 1.81s
200:	learn: 15925.8736864	test: 26523.1740114	best: 26481.9843901 (180)	total: 348ms	remaining: 1.38s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 26468.77592
bestIteration = 209

Shrink model to first 210 iterations.


In [109]:
print('MAE: ', mean_absolute_error(y_test, catboost_pred))
print('MSE: ', mean_squared_error(y_test, catboost_pred))
print('R2 score', r2_score(y_test, catboost_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, catboost_pred)))

MAE:  17971.39667650037
MSE:  700596098.7629899
R2 score 0.9086614988934201
RMSE: 26468.775921129974


In [81]:
from xgboost import XGBRegressor

xgb = XGBRegressor(n_estimators=1000, learning_rate=0.01)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)


In [83]:
print('MAE: ', mean_absolute_error(y_test, xgb_pred))
print('MSE: ', mean_squared_error(y_test, xgb_pred))
print('R2 score', r2_score(y_test, xgb_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, xgb_pred)))

MAE:  18597.094499143837
MSE:  778741142.2231311
R2 score 0.8984735301748382
RMSE: 27905.93381743623


In [110]:
test.head()

Unnamed: 0,OverallQual,GarageCars,GrLivArea,TotalBsmtSF,YearBuilt,FullBath,MasVnrArea,YearRemodAdd,Fireplaces,OpenPorchSF,LotFrontage,WoodDeckSF,BsmtFinSF1
0,5,1.0,896,882.0,1961,1,0.0,1961,0,0,80.0,140,468.0
1,6,1.0,1329,1329.0,1958,1,108.0,1958,0,36,81.0,393,923.0
2,5,2.0,1629,928.0,1997,2,0.0,1998,1,34,74.0,212,791.0
3,6,2.0,1604,926.0,1998,2,20.0,1998,1,36,78.0,360,602.0
4,8,2.0,1280,1280.0,1992,2,0.0,1992,0,82,43.0,0,263.0


In [None]:
test_cat = test.select_dtypes('object')

# Step 1: Fill Missing Values with Mode
test_cat_filled = test_cat.fillna(test_cat.mode().iloc[0])

# Step 2: One-Hot Encoding
test_cat_encoded = pd.get_dummies(test_cat_filled, dtype='int')

In [111]:
test_num = test.select_dtypes(exclude='object')
test = test_num.fillna(test_num.median())

In [None]:
#test_final = pd.concat([test_num_filled, test_cat_encoded], axis=1)

In [113]:
test.head()

Unnamed: 0,OverallQual,GarageCars,GrLivArea,TotalBsmtSF,YearBuilt,FullBath,MasVnrArea,YearRemodAdd,Fireplaces,OpenPorchSF,LotFrontage,WoodDeckSF,BsmtFinSF1
0,5,1.0,896,882.0,1961,1,0.0,1961,0,0,80.0,140,468.0
1,6,1.0,1329,1329.0,1958,1,108.0,1958,0,36,81.0,393,923.0
2,5,2.0,1629,928.0,1997,2,0.0,1998,1,34,74.0,212,791.0
3,6,2.0,1604,926.0,1998,2,20.0,1998,1,36,78.0,360,602.0
4,8,2.0,1280,1280.0,1992,2,0.0,1992,0,82,43.0,0,263.0


In [115]:
test_pred = xgb.predict(test)

In [117]:
test_prediction_lgbm = pd.DataFrame(test_pred, columns=['SalePrice'])
test_prediction_lgbm

Unnamed: 0,SalePrice
0,119298.273438
1,155857.093750
2,179093.843750
3,190789.843750
4,197876.718750
...,...
1454,78277.671875
1455,86376.023438
1456,185014.031250
1457,117104.750000
