In [375]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra`
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [376]:
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.simplefilter("ignore")


## 1. Loading training data

In [377]:
# Load the training CSV into `dataset`.
dataset = pd.read_csv(r'../input/home-data-for-ml-course/train.csv')

# Show up to 200 columns when displaying wide frames. The fully-qualified key is
# used because a bare 'max_columns' pattern can match more than one pandas option
# (e.g. 'styler.render.max_columns'), in which case set_option raises OptionError.
pd.set_option('display.max_columns', 200)

In [378]:
# Work on a copy named `train` so that `dataset` keeps the frame exactly as loaded.
train=dataset.copy()

In [379]:
# Preview the first rows of the training frame.
train.head()

## 2. EDA 

## 2. a) Exploring features

In [380]:
# Summary statistics, transposed so that each described column becomes a row.
train.describe().T

In [381]:
# (number of rows, number of columns) of the training frame.
train.shape

In [382]:
# True when at least one column has a non-zero count of missing values.
missing_per_column = train.isna().sum()
missing_per_column.any()

In [383]:
# Names of the columns stored with an integer or floating-point dtype.
num_features = train.select_dtypes(include=['int', 'float']).columns
num_features

In [384]:
# Names of the columns stored with the object or bool dtype.
cat_features = train.select_dtypes(include=['object', 'bool']).columns
cat_features

In [385]:
# Missing-value count for each numeric feature.
train.loc[:, num_features].isna().sum()

In [386]:
# Number of distinct values in each numeric feature; the plotting cells below
# use a threshold of 10 distinct values to pick the plot type.
train[num_features].nunique()

## 2. b) Plotting features

In [387]:
# Count plots for the numeric features that take fewer than 10 distinct values.
low_cardinality = [col for col in num_features if train[col].nunique() < 10]
for col in low_cardinality:
    plt.title(f'Countplot of {col}')
    sns.countplot(y=train[col])
    plt.show()

In [388]:
# Histograms (with a KDE overlay) for the numeric features that take 10 or more distinct values.
high_cardinality = [col for col in num_features if train[col].nunique() >= 10]
for col in high_cardinality:
    plt.title(f'Distribution of  {col}')
    plt.xticks(rotation=90)
    plt.xlabel(col)
    sns.histplot(x=train[col], kde=True)
    plt.show()

In [389]:
# Missing-value count for each categorical feature.
train[cat_features].isnull().sum() 

In [390]:
# Number of distinct values in each categorical feature.
train[cat_features].nunique()

In [391]:
# One horizontal count plot per categorical feature.
for col_name, col_values in train[cat_features].items():
    plt.title(f'Countplot of {col_name}')
    sns.countplot(y=col_values)
    plt.show()

In [392]:
# Drop the Id column together with the columns treated here as more than 50% missing.
# NOTE(review): the 50% figure comes from the original comment, not from code in this notebook.
unwanted_columns = ['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature']
train = train.drop(columns=unwanted_columns)

In [393]:
# Preview the training frame again after the column drop above.
train.head()

## 3. Train - Test Split 


In [394]:
# SalePrice is the dependent variable; everything left in `train` is a predictor.
target = train['SalePrice']
train = train.drop(columns='SalePrice')

In [395]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; random_state pins the shuffle.
full_train_X, full_val_X, train_y, val_y = train_test_split(
    train,
    target,
    test_size=.2,
    random_state=0,
)

In [396]:
# Categorical columns: everything stored with the object dtype.
cat_cols = full_train_X.select_dtypes(include='object').columns.tolist()

# Numeric columns: every integer and floating-point dtype.
# select_dtypes is used rather than `dtype == ('int', 'float')` because comparing
# a dtype with a tuple is an equality test, not a membership test, so it cannot
# pick up both the integer and the float columns.
num_cols = full_train_X.select_dtypes(include='number').columns.tolist()

# Combined feature list (categorical first, then numeric) used to build the model frames.
my_cols = cat_cols + num_cols


In [397]:
# Restrict both splits to the selected feature columns, copying so the originals stay untouched.
train_X = full_train_X.loc[:, my_cols].copy()
val_X = full_val_X.loc[:, my_cols].copy()



## 4. Pipelines and ColumnTransformer

In [398]:
# scikit-learn building blocks for the preprocessing stage.
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric branch: median imputation -> standard scaling -> PCA down to 7 components.
numeric_steps = [
    ('num_imputer', SimpleImputer(strategy='median')),
    ('num_scaler', StandardScaler()),
    ('pca', PCA(n_components=7)),
]
num_transformers = Pipeline(steps=numeric_steps)

# Categorical branch: most-frequent imputation -> one-hot encoding.
# handle_unknown='ignore' keeps transform from failing on categories not seen during fit.
categorical_steps = [
    ('cat_imputer', SimpleImputer(strategy='most_frequent')),
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore')),
]
cat_transformer = Pipeline(steps=categorical_steps)

# Route each column list to its branch; the two outputs are concatenated.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformers, num_cols),
    ('cat', cat_transformer, cat_cols),
])




## 5. Model building using Pipelines

In [399]:
from xgboost import XGBRegressor

# Full XGBoost configuration kept in one mapping so every setting is visible at a glance.
# NOTE(review): several keys (e.g. gpu_id, predictor) may be version-specific -
# confirm they are accepted by the installed xgboost release.
xgb_params = {
    'base_score': 0.5,
    'booster': 'dart',
    'colsample_bylevel': 1,
    'colsample_bynode': 1,
    'colsample_bytree': 1,
    'enable_categorical': False,
    'gamma': 0.1,
    'gpu_id': -1,
    'importance_type': None,
    'interaction_constraints': '',
    'learning_rate': 0.3,
    'max_delta_step': 0,
    'max_depth': 12,
    'min_child_weight': 4,
    'monotone_constraints': '()',
    'n_estimators': 300,
    'n_jobs': -1,
    'num_parallel_tree': 1,
    'predictor': 'auto',
    'random_state': 0,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'scale_pos_weight': 1,
    'subsample': 1,
    'tree_method': 'exact',
    'validate_parameters': 1,
    'verbosity': None,
}
xgb_model = XGBRegressor(**xgb_params)

In [400]:
from sklearn.base import clone

# Final XGBoost pipeline: preprocessing followed by the regressor.
# The pipeline gets its own unfitted copy of the preprocessor. Pipeline.fit fits
# its steps in place, so sharing one ColumnTransformer instance between several
# pipelines would let a later fit of one pipeline replace the fitted state that
# the others rely on.
xgb_pipe = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('model', xgb_model),
])

In [401]:
# Fit the preprocessing steps and the XGBoost model on the training split.
xgb_pipe.fit(train_X,train_y) 

In [402]:
# Predict SalePrice for the held-out validation split.
xgb_pred=xgb_pipe.predict(val_X)

In [403]:
from sklearn.metrics import r2_score, mean_absolute_error

# scikit-learn metrics take (y_true, y_pred). MAE is symmetric in its arguments,
# but R^2 is not: with the arguments swapped it is normalised by the variance of
# the predictions instead of the variance of the true prices.
mae_score = mean_absolute_error(val_y, xgb_pred)
R2_score = r2_score(val_y, xgb_pred)

print(f'XGB MAE : {mae_score}')
print(f'XGB R2 Score: {R2_score}')

In [404]:
from sklearn.ensemble import RandomForestRegressor

# Random Forest regressor: 300 trees, depth capped at 12, at least 3 samples per leaf.
rfr_model = RandomForestRegressor(
    n_estimators=300,
    max_depth=12,
    min_samples_split=2,
    min_samples_leaf=3,
    random_state=0,
)

In [405]:
from sklearn.base import clone

# Final Random Forest pipeline: preprocessing followed by the regressor.
# The pipeline gets its own unfitted copy of the preprocessor. Pipeline.fit fits
# its steps in place, so refitting this pipeline on other data would otherwise
# also change the preprocessor used by any pipeline sharing the same instance.
rfr_pipe = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('model', rfr_model),
])

In [406]:
# Fit the preprocessing steps and the Random Forest model on the training split.
rfr_pipe.fit(train_X,train_y)

In [407]:
# Predict SalePrice for the held-out validation split.
rfr_pred=rfr_pipe.predict(val_X)

In [408]:
# scikit-learn metrics take (y_true, y_pred); the order matters for R^2,
# which is not symmetric in its arguments.
rfr_mae_score = mean_absolute_error(val_y, rfr_pred)
rfr_r2_score = r2_score(val_y, rfr_pred)

print(f'RFR R2 Score: {rfr_r2_score}')
print(f'RFR MAE Score : {rfr_mae_score}')

In [409]:
import lightgbm as lgb

# LightGBM regressor: 500 boosting rounds at learning rate 0.35, seeded for repeatability.
lgb_model = lgb.LGBMRegressor(
    n_estimators=500,
    learning_rate=.35,
    random_state=0,
)

In [410]:
from sklearn.base import clone

# Final LightGBM pipeline: preprocessing followed by the regressor.
# The pipeline gets its own unfitted copy of the preprocessor. Pipeline.fit fits
# its steps in place, so a shared ColumnTransformer instance would be overwritten
# whenever any other pipeline using it is fitted again.
lgb_pipe = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('model', lgb_model),
])

In [411]:
# Fit the preprocessing steps and the LightGBM model on the training split.
lgb_pipe.fit(train_X,train_y)

In [412]:
# Predict SalePrice for the held-out validation split.
lgb_pred=lgb_pipe.predict(val_X)

In [413]:
# scikit-learn metrics take (y_true, y_pred); the order matters for R^2,
# which is not symmetric in its arguments.
lgb_mae_score = mean_absolute_error(val_y, lgb_pred)
lgb_r2_score = r2_score(val_y, lgb_pred)

print(f'LGB R2 Score: {lgb_r2_score}')
print(f'LGB MAE Score : {lgb_mae_score}')

## 6. Plotting predicted SalePrice of all models

In [414]:
# Distribution of the XGBoost pipeline's validation predictions.
# Title spelling corrected ("Distribution", "XGBoost"); plt.show() renders the
# figure explicitly, matching the other plotting cells in this notebook.
plt.title("Distribution of Predicted SalePrice using XGBoost")
sns.histplot(xgb_pred, kde=True)
plt.show()


In [415]:
 # Distribution of the Random Forest pipeline's validation predictions.
# Title spelling corrected ("Distribution"); plt.show() renders the figure
# explicitly, matching the other plotting cells in this notebook.
plt.title("Distribution of Predicted SalePrice using Random Forest")
sns.histplot(rfr_pred, kde=True)
plt.show()

In [416]:
# Distribution of the LightGBM pipeline's validation predictions.
# Title corrected: "Distribution" was misspelled and the model is LightGBM,
# not "LightGBM Forest". plt.show() renders the figure explicitly.
plt.title("Distribution of Predicted SalePrice using LightGBM")
sns.histplot(lgb_pred, kde=True)
plt.show()

## 7. Loading test dataset

In [417]:
# Load the test CSV into `test_data`; its Id column is used later to build the submission frame.
test_data=pd.read_csv(r'../input/home-data-for-ml-course/test.csv')

In [418]:
# Work on a copy named `test` so that `test_data` keeps the frame exactly as loaded.
test=test_data.copy()

## 8. EDA on Test data

In [419]:
# Summary statistics of the test frame, transposed so that each described column becomes a row.
test.describe().T

In [420]:
# Number of distinct values in every test column.
test.nunique()

In [421]:
# Column names of the test frame grouped by dtype: int/float versus object.
test_num_features = test.select_dtypes(include=['int', 'float']).columns
test_cat_features = test.select_dtypes(include=['object']).columns



## 9. Plotting test data features

In [422]:
# Count plots for the numeric test features that take fewer than 10 distinct values.
test_low_cardinality = [col for col in test_num_features if test[col].nunique() < 10]
for col in test_low_cardinality:
    plt.title(f'Countplot of {col}')
    plt.yticks(rotation=90)
    sns.countplot(y=test[col])
    plt.show()


In [423]:
# Histograms (with a KDE overlay) for the numeric test features that take 10 or more distinct values.
test_high_cardinality = [col for col in test_num_features if test[col].nunique() >= 10]
for col in test_high_cardinality:
    plt.title(f'Distribution of  {col}')
    plt.xticks(rotation=90)
    plt.xlabel(col)
    sns.histplot(x=test[col], kde=True)
    plt.show()

In [424]:
# One horizontal count plot per categorical test feature.
for col_name, col_values in test[test_cat_features].items():
    plt.title(f'Countplot of {col_name}')
    sns.countplot(y=col_values)
    plt.show()

In [425]:
# Missing-value count for each numeric test feature.
test[test_num_features].isnull().sum()

In [426]:
# Missing-value count for each categorical test feature.
test[test_cat_features].isna().sum()

In [427]:
# Drop the same five columns that were removed from the training frame,
# so the test frame lines up with the features the pipelines were built on.
test_unwanted_columns = ['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature']
test = test.drop(columns=test_unwanted_columns)


## 10. Final model building on test data

In [428]:
# Refit rfr_pipe (the Random Forest pipeline) on the whole training dataset:
# `train` is the full feature frame with SalePrice removed, `target` the full SalePrice series.
rfr_pipe.fit(train,target)

In [429]:
# Use rfr_pipe (the Random Forest pipeline) to predict SalePrice for the test data.
test_pred=rfr_pipe.predict(test) 

In [430]:
# Distribution of the SalePrice predictions for the test data.
# Title spelling corrected ("Distribution"); plt.show() renders the figure
# explicitly, matching the other plotting cells in this notebook.
plt.title('Distribution of Test SalePrice Prediction')
sns.histplot(test_pred, kde=True)
plt.show()

## 11. Submission

In [431]:
# Pair each test Id (taken from the frame as loaded, before Id was dropped) with its predicted price.
submission = pd.DataFrame({
    'Id': test_data['Id'],
    'SalePrice': test_pred,
})
submission

In [432]:
# Write the submission frame to submission.csv in the working directory;
# index=False keeps the row index out of the file so it holds only Id and SalePrice.
submission.to_csv('submission.csv',index=False)