# House Price Prediction

---

# Installs

In [None]:
!pip install -q autoviz
!pip install -q -U --pre pycaret

![image.png](attachment:1c80e9ca-991b-4e76-942d-e42f9d425dee.png)

# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
from autoviz.classify_method import data_cleaning_suggestions ,data_suggestions
from pycaret  import regression
from sklearn.model_selection import cross_val_score

In [None]:
# load the competition train and test CSVs from the Kaggle input directory
train = pd.read_csv('/kaggle/input/home-data-for-ml-course/train.csv')
test  = pd.read_csv('/kaggle/input/home-data-for-ml-course/test.csv')

In [None]:
# preview the first rows of the training data
train.head()

In [None]:
# preview the first rows of the test data
test.head()

# EDA

In [None]:
# autoviz helper run on the raw training data
# NOTE(review): presumably displays per-column data-quality suggestions - confirm against the autoviz docs
data_cleaning_suggestions(train)

In [None]:
# same autoviz helper, run on the raw test data
# NOTE(review): presumably displays per-column data-quality suggestions - confirm against the autoviz docs
data_cleaning_suggestions(test)

In [None]:
# list the training columns that contain at least one missing value
null_cols = train.columns[train.isna().any(axis=0)]
null_cols

In [None]:
# fill missing values in the numeric columns with that column's mean;
# object (string) columns are left untouched here
for col in null_cols:
    if train[col].dtype != object:
        # assign the filled column back instead of calling
        # fillna(inplace=True) on the column selection: that chained form
        # acts on an intermediate object, is deprecated, and leaves `train`
        # unchanged when pandas Copy-on-Write is enabled
        train[col] = train[col].fillna(train[col].mean())

In [None]:
# re-check after the mean imputation: columns listed here still contain missing values
null_cols = train.columns[train.isnull().any()]
null_cols

In [None]:
# drop, in place, every training column that still contains a missing value
train.dropna(axis = 1, inplace = True)

In [None]:
# re-run the autoviz helper on the cleaned training data
# NOTE(review): presumably displays per-column data-quality suggestions - confirm against the autoviz docs
data_cleaning_suggestions(train)

In [None]:
def detect_outliers(data):
    """Report the share of IQR outliers in each numeric column.

    A value counts as an outlier when it lies more than 1.5 * IQR below the
    first quartile or above the third quartile of its column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to analyse. Only numeric, non-boolean columns are examined;
        every other column is skipped. Columns are expected to be free of
        NaN: np.quantile propagates NaN, which makes both bounds NaN and the
        reported percentage 0.

    Returns
    -------
    pandas.DataFrame
        One row per examined column (index = column name) with a single
        'Outlier_percentage' column, sorted from highest to lowest. The
        frame is empty when no column qualifies.
    """
    outlier_percents = {}
    for column in data.columns:
        values = data[column]
        # quantiles are only meaningful for numeric data; booleans are
        # excluded because quantile interpolation is not defined for them
        if not pd.api.types.is_numeric_dtype(values) or pd.api.types.is_bool_dtype(values):
            continue

        q1 = np.quantile(values, 0.25)
        q3 = np.quantile(values, 0.75)
        iqr = q3 - q1
        upper_bound = q3 + (1.5 * iqr)
        lower_bound = q1 - (1.5 * iqr)

        is_outlier = (values > upper_bound) | (values < lower_bound)
        outlier_percents[column] = int(is_outlier.sum()) / len(values) * 100

    # build the result once, after the loop; this also yields a well-formed
    # empty frame when no column was examined
    outlier_dataframe = pd.Series(
        outlier_percents, dtype=float, name='Outlier_percentage'
    ).to_frame()

    return outlier_dataframe.sort_values(by = 'Outlier_percentage', ascending = False)

# show the outlier percentage per column of the cleaned training data
detect_outliers(train)

In [None]:
# list the columns that remain in the training data
train.columns

In [None]:
# list the test columns that contain at least one missing value
null_cols = test.columns[test.isna().any(axis=0)]
null_cols

In [None]:
# fill missing values in the numeric columns with that column's mean;
# object (string) columns are left untouched here
for col in null_cols:
    if test[col].dtype != object:
        # assign the filled column back instead of calling
        # fillna(inplace=True) on the column selection: that chained form
        # acts on an intermediate object, is deprecated, and leaves `test`
        # unchanged when pandas Copy-on-Write is enabled
        test[col] = test[col].fillna(test[col].mean())

In [None]:
# re-check after the mean imputation: columns listed here still contain missing values
null_cols = test.columns[test.isnull().any()]
null_cols

In [None]:
# drop, in place, every test column that still contains a missing value
# NOTE(review): train and test are filtered independently, so the two frames
# may end up with different column sets - confirm before modelling
test.dropna(axis = 1, inplace = True)

In [None]:
# (rows, columns) of the cleaned training data
train.shape

In [None]:
# (rows, columns) of the cleaned test data
test.shape

# Correlation

In [None]:
# annotated heatmap of the pairwise correlations between numeric columns
plt.figure(figsize=(50,30))
# restrict to numeric columns explicitly: since pandas 2.0 DataFrame.corr()
# no longer drops non-numeric columns silently and raises on string data,
# and `train` still holds object columns at this point
sns.heatmap(train.select_dtypes(include='number').corr(),annot=True)

In [None]:
# separate the target (SalePrice) from the predictor columns
y = train['SalePrice']
X = train.drop(columns='SalePrice')

# Regressor Model Comparison

In [None]:
# initialise the PyCaret regression experiment with X as the data and y as the target
# NOTE(review): session_id=42 presumably fixes the random seed for reproducibility - confirm in the PyCaret docs
regression.setup(X,target=y ,session_id=42)

In [None]:
# compare the regressors available in PyCaret on the experiment set up above
# NOTE(review): presumably cross-validates each model and shows a ranked score table - confirm in the PyCaret docs
regression.compare_models()

In [None]:
# inspect the dtype of every remaining training column
train.dtypes

# CatBoost Model

In [None]:
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from pycaret.regression import *

In [None]:
# standalone CatBoost regressor: 100 boosting iterations, learning rate 0.1, tree depth 6
model = CatBoostRegressor(iterations=100, learning_rate=0.1, depth=6)

In [None]:
# names of the object-dtype (string) columns left in the training data
obj_cols = train.select_dtypes(include=['object']).columns
obj_cols

In [None]:
# fit the standalone CatBoost model on the feature matrix and target held by
# the PyCaret experiment (get_config('X') / get_config('y')), declaring the
# listed columns as categorical features
# NOTE(review): the column list is hard-coded; it presumably mirrors `obj_cols` - confirm the two stay in sync
# NOTE(review): `model` is not referenced again in the visible cells; later steps use `cat`
model.fit(get_config('X'), get_config('y'), cat_features=['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'Heating',
       'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive',
       'SaleType', 'SaleCondition'])

In [None]:
# split train into a 90% modelling sample and the remaining 10% hold-back
data = train.sample(frac=0.9, random_state=786)

# drop the sampled rows by their ORIGINAL index labels. Resetting the index of
# `data` before this step would turn data.index into 0..n-1, so the drop would
# remove the first 90% of train by position and the two sets would overlap
data_unseen = train.drop(data.index).reset_index(drop=True)
data = data.reset_index(drop=True)

# NOTE(review): the PyCaret experiment was set up on the full `train`
# (via X / y), so these rows are not truly unseen by `cat` - confirm intent

print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))

In [None]:
# train a CatBoost regressor inside the PyCaret experiment and keep it as `cat`
cat = create_model('catboost')

In [None]:
# the trained model object is stored in the variable 'cat'
print(cat)

In [None]:
# default plot_model view for the trained model
# NOTE(review): presumably the residuals plot - confirm in the PyCaret docs
plot_model(cat)

In [None]:
# 'error' plot for the trained model
# NOTE(review): presumably predicted vs. actual values - confirm in the PyCaret docs
plot_model(cat, plot = 'error')

In [None]:
# 'feature' plot for the trained model
# NOTE(review): presumably feature importance - confirm in the PyCaret docs
plot_model(cat, plot='feature')

In [None]:
# open PyCaret's model evaluation view for the trained model
# NOTE(review): presumably an interactive widget for browsing the available plots - confirm
evaluate_model(cat)

# Predictions

In [None]:
# score the model without passing new data
# NOTE(review): with no `data` argument PyCaret presumably predicts on the hold-out split created by setup() - confirm
predict_model(cat)

# Predictions on Unseen Data

In [None]:
# predict on the held-back `data_unseen` rows and preview the result
unseen_predictions = predict_model(cat, data=data_unseen)
unseen_predictions.head()

In [None]:
# (rows, columns) of the predictions on the held-back rows
unseen_predictions.shape

In [None]:
# (rows, columns) of the training data, for comparison with the predictions
train.shape

In [None]:
# reload the raw test set from disk; this discards the imputation and column drops applied to `test` earlier
test = pd.read_csv('/kaggle/input/home-data-for-ml-course/test.csv')

In [None]:
# (rows, columns) of the freshly reloaded test data
test.shape

In [None]:
# column names present in each frame
train_cols = set(train.columns)
test_cols = set(test.columns)

# columns that exist only in the reloaded test set
extra_cols = test_cols.difference(train_cols)

# discard them so that test keeps only columns that train also has
test = test.drop(columns=list(extra_cols))

In [None]:
# (rows, columns) of the test data after removing the test-only columns
test.shape

In [None]:
# (rows, columns) of the training data, for comparison with the aligned test data
train.shape

# Predictions on Test Data

In [None]:
# predict on the competition test set with the trained model and display the result
testpreds = predict_model(cat, data=test)
testpreds

In [None]:
pred_cols = set(testpreds.columns)
test_cols = set(test.columns)

# find the columns that predict_model added: present in testpreds but not in test
extra_cols = pred_cols - test_cols
extra_cols

# Submission

In [None]:
# load the sample submission file as the template for the submission and check its shape
sub = pd.read_csv('/kaggle/input/home-data-for-ml-course/sample_submission.csv')
sub.shape

In [None]:
# remove, in place, the SalePrice column that came with the sample submission
sub.drop('SalePrice', axis = 1, inplace = True)

In [None]:
# take the predictions out of testpreds (pop also removes the column from testpreds)
column_to_move = testpreds.pop('prediction_label')

# attach them to the submission frame as SalePrice
# NOTE(review): pandas aligns this assignment on index labels, so it assumes
# sub and testpreds share the same row index and order - confirm
sub['SalePrice'] = column_to_move

In [None]:
# display the assembled submission frame
sub

In [None]:
# write the submission to submission.csv in the current working directory, without the index column
sub.to_csv('submission.csv' ,index = False)

In [None]:
# read the submission back from the same relative path it was written to, so
# this check does not depend on the working directory being /kaggle/working
sub = pd.read_csv('submission.csv')
sub

# Thank You - Do Upvote

---
