# House Price Prediction

---

# Installs

In [None]:
!pip install -q autoviz
!pip install -q -U --pre pycaret

# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
from autoviz.classify_method import data_cleaning_suggestions ,data_suggestions
from pycaret  import regression
from sklearn.model_selection import cross_val_score

In [None]:
# Load the training and test CSVs from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/home-data-for-ml-course/train.csv')
test  = pd.read_csv('/kaggle/input/home-data-for-ml-course/test.csv')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

# EDA

In [None]:
# autoviz helper run on the raw training frame; presumably reports per-column
# data-quality issues and suggested fixes -- confirm against the autoviz docs.
data_cleaning_suggestions(train)

In [None]:
# Same autoviz helper on the raw test frame; presumably reports per-column
# data-quality issues and suggested fixes -- confirm against the autoviz docs.
data_cleaning_suggestions(test)

In [None]:
# Names of the training columns that contain at least one missing value.
null_cols = train.columns[train.isna().any(axis="index")]
null_cols

In [None]:
# Fill missing values in numeric training columns with that column's mean.
# Non-numeric columns are left untouched here.
for col in null_cols:
    if pd.api.types.is_numeric_dtype(train[col]):
        # Assign the result back instead of `train[col].fillna(..., inplace=True)`:
        # the in-place call acts on a temporary Series and does not update
        # `train` when pandas Copy-on-Write is active.
        train[col] = train[col].fillna(train[col].mean())

In [None]:
# Re-check: training columns that still contain missing values.
null_cols = train.columns[train.isna().any(axis="index")]
null_cols

In [None]:
# Collect the names of all object-dtype (categorical) training columns.
cat_cols = list(train.select_dtypes(include=['object']).columns)
cat_cols

In [None]:
from sklearn.preprocessing import LabelEncoder

# One shared encoder instance; the cells below refit it for every column.
le = LabelEncoder()

In [None]:
# Label-encode every categorical training column in place.
for col in cat_cols:
    # Missing entries become their own "missing" category; the shared encoder
    # is refit on the column and the column is replaced by its integer codes.
    train[col] = le.fit_transform(train[col].fillna("missing"))

In [None]:
# Re-run the autoviz check now that the training frame has been imputed and
# label-encoded by the cells above.
data_cleaning_suggestions(train)

In [None]:
def detect_outliers(data):
    """Report the share of IQR outliers in every numeric column of *data*.

    A value counts as an outlier when it lies more than 1.5 * IQR below the
    first quartile or above the third quartile of its own column.

    Args:
        data: DataFrame to inspect. Non-numeric and boolean columns are
            skipped.

    Returns:
        DataFrame indexed by column name with a single column,
        ``Outlier_percentage`` (0-100), sorted in descending order. The
        frame is empty when *data* has no numeric columns.
    """
    n_rows = len(data)
    outlier_percents = {}
    for column in data.columns:
        series = data[column]
        is_numeric = pd.api.types.is_numeric_dtype(series)
        if not is_numeric or pd.api.types.is_bool_dtype(series):
            continue
        # Series.quantile ignores NaN, so a few missing values do not turn
        # both bounds into NaN (which would silently report 0% outliers).
        q1, q3 = series.quantile([0.25, 0.75])
        iqr = q3 - q1
        upper_bound = q3 + 1.5 * iqr
        lower_bound = q1 - 1.5 * iqr
        outlier_count = int(((series > upper_bound) | (series < lower_bound)).sum())
        # Guard the division so an empty frame reports 0% instead of raising.
        outlier_percents[column] = outlier_count / n_rows * 100 if n_rows else 0.0

    # Build the result once, after the loop; this also covers the case where
    # no numeric column was found.
    outlier_dataframe = pd.DataFrame(
        {'Outlier_percentage': pd.Series(outlier_percents, dtype=float)}
    )
    return outlier_dataframe.sort_values(by='Outlier_percentage', ascending=False)

# Show the per-column outlier percentages for the training frame.
detect_outliers(train)

In [None]:
# Names of the test columns that contain at least one missing value.
null_cols = test.columns[test.isna().any(axis="index")]
null_cols

In [None]:
# Fill missing values in numeric test columns with that column's mean.
# Non-numeric columns are left untouched here.
# NOTE(review): the fill value is the test-set mean; consider reusing the
# training means so both frames are imputed with the same statistics.
for col in null_cols:
    if pd.api.types.is_numeric_dtype(test[col]):
        # Assign the result back instead of `test[col].fillna(..., inplace=True)`:
        # the in-place call acts on a temporary Series and does not update
        # `test` when pandas Copy-on-Write is active.
        test[col] = test[col].fillna(test[col].mean())

In [None]:
# Re-check: test columns that still contain missing values.
null_cols = test.columns[test.isna().any(axis="index")]
null_cols

In [None]:
# Encode the test set's categorical columns with the *training* vocabulary.
# Refitting the encoder on the test data would give a category a different
# integer code whenever the two frames do not contain the same set of labels.
# `train` has already been overwritten with integer codes at this point, so
# the original training labels are re-read from the raw CSV.
raw_train_cats = pd.read_csv(
    '/kaggle/input/home-data-for-ml-course/train.csv', usecols=cat_cols
).fillna("missing")

for col in cat_cols:
    # Replace missing values with "missing", as was done for the training set.
    test[col] = test[col].fillna("missing")

    # Fit on the training labels only, so the codes match the training codes.
    le.fit(raw_train_cats[col])
    code_of = {label: code for code, label in enumerate(le.classes_)}

    # Labels that never occurred in training get the sentinel code -1
    # (LabelEncoder.transform would raise on them).
    test[col] = test[col].map(code_of).fillna(-1).astype(int)

In [None]:
# Final check: test columns that still contain missing values.
null_cols = test.columns[test.isna().any(axis="index")]
null_cols

In [None]:
# Re-run the autoviz check now that the test frame has been imputed and
# label-encoded by the cells above.
data_cleaning_suggestions(test)

In [None]:
# (rows, columns) of the processed training frame.
train.shape

In [None]:
# (rows, columns) of the processed test frame.
test.shape

# Correlation

In [None]:
# Pairwise correlations of all training columns, drawn as an annotated
# heatmap on a large canvas so the cell labels stay legible.
corr_matrix = train.corr()
plt.figure(figsize=(50, 30))
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Separate the target (SalePrice) from the predictor columns.
y = train['SalePrice']
X = train.drop(columns=['SalePrice'])

# Regressor Model Comparison

In [None]:
from pycaret.regression import *

In [None]:
# Initialise the PyCaret regression experiment with X as the feature frame and
# y passed separately as the target; session_id is fixed at 42
# (presumably the random seed for reproducibility -- confirm in the PyCaret docs).
regression.setup(X,target=y ,session_id=42)

In [None]:
# Run PyCaret's model comparison with its default settings; presumably trains
# and ranks the available regressors -- confirm in the PyCaret docs.
regression.compare_models()

# Ensemble Model

In [None]:
# Hold out 10% of the training rows as "unseen" data.
# The sample must keep its original index until the complement has been
# taken: resetting it first would make `train.drop(data.index)` remove the
# first 90% of rows by position instead of the rows that were sampled.
data = train.sample(frac=0.9, random_state=786)
data_unseen = train.drop(data.index).reset_index(drop=True)
data = data.reset_index(drop=True)

print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))

In [None]:
# Define the base models
cb = create_model('catboost')
lgbm = create_model('lightgbm')
gbr = create_model('gbr')
rf = create_model('rf')
et = create_model('et')

In [None]:
# Create ensemble model
from sklearn.ensemble import VotingRegressor

ensemble_model = VotingRegressor(estimators=[('lgbm', lgbm), ('gbr', gbr), ('rf', rf), ('et', et), ('cb', cb)], weights=[1, 1, 1, 1, 1])

In [None]:
from sklearn.model_selection import cross_val_score

# 20-fold cross-validation scores of the voting ensemble on X and y; no
# `scoring` is passed, so the estimator's default scorer is used.
scores = cross_val_score(estimator=ensemble_model, X=X, y=y, cv=20)

In [None]:
# Train the ensemble model on the full dataset
ensemble_model.fit(X, y)

In [None]:
# No `data` argument is passed. NOTE(review): presumably PyCaret then scores
# the hold-out split created by setup() -- confirm. The ensemble was fit on all
# of X above, so verify those rows are really unseen before trusting the metrics.
preds = predict_model(ensemble_model)

In [None]:
# Display the frame returned by predict_model.
preds

In [None]:
# PyCaret plot with its default plot type (presumably residuals for
# regression -- confirm in the PyCaret docs).
plot_model(ensemble_model)

In [None]:
# PyCaret plot of type 'error' for the ensemble.
plot_model(ensemble_model, plot = 'error')

In [None]:
# Display the processed test frame before predicting on it.
test

# Predictions

In [None]:
# Score the processed test frame with the fitted ensemble; the submission cell
# below reads the 'prediction_label' column from the result.
testpreds = predict_model(ensemble_model, data = test)
testpreds

# Submission

In [None]:
# Load the sample submission file and show its (rows, columns) shape.
sub = pd.read_csv('/kaggle/input/home-data-for-ml-course/sample_submission.csv')
sub.shape

In [None]:
# Remove the sample file's SalePrice column in place; the remaining columns
# are kept as they are.
sub.drop(columns='SalePrice', inplace=True)
sub

In [None]:
# Separate column from source dataset
column_to_move = testpreds.pop('prediction_label')
sub['SalePrice'] = column_to_move
sub

In [None]:
# Write the submission without the index column, then read it back to check
# what was written.
# NOTE(review): the file is written relative to the current working directory
# but re-read from /kaggle/working; this only matches when the notebook runs
# with /kaggle/working as its working directory.
sub.to_csv('submission.csv' ,index = False)
sub = pd.read_csv('/kaggle/working/submission.csv')
sub

# Thank You - Do Upvote

---
