In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's train and test splits from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/home-data-for-ml-course/train.csv")
test = pd.read_csv("/kaggle/input/home-data-for-ml-course/test.csv")
# Stack train on top of test so that every cleaning decision is applied to both.
# ignore_index=True gives the stacked frame a unique 0..n-1 index; without it the
# row labels of `test` repeat those of `train`, so any label-based operation on
# `combined` (drop, join, .loc) would match two rows per label.
combined = pd.concat([train, test], axis=0, ignore_index=True)
train.shape, test.shape, combined.shape

In [None]:
# Separate the raw training frame into target and features; `train` itself is left untouched.
train_y = train['SalePrice'].copy()
train_X = train.drop(columns='SalePrice')

In [None]:
combined.head()

## View and clean the data

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='darkgrid', font_scale=1.4)

### Missing-data status

In [None]:
# Fraction of missing values per column, showing the ten columns with the largest share.
combined.isna().mean(axis=0).sort_values(ascending=False).head(10)

In [None]:
# Columns removed from the combined frame before any further analysis.
# NOTE(review): presumably these are the columns with the largest missing share
# in the table above — confirm against that output.
usless_feats = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
combined_drop_missed = combined.drop(columns=usless_feats)
combined_drop_missed.shape

In [None]:
def splitTrainTestFromCombined(df, n_train=None):
    """Split a stacked train+test frame into train features, train target and test features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first ``n_train`` rows are training rows and whose remaining
        rows are test rows. Must contain a 'SalePrice' column.
    n_train : int, optional
        Number of leading rows that belong to the training set. When omitted,
        the row count of the notebook-level ``train`` frame is used.

    Returns
    -------
    tuple
        ``(train_X, train_y, test_X)``: the training rows without 'SalePrice',
        the 'SalePrice' column of the training rows, and the test rows without
        'SalePrice'. ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If ``df`` has no 'SalePrice' column.
    """
    if n_train is None:
        # Fall back to the notebook-level frame so existing one-argument calls keep working.
        n_train = train.shape[0]
    # Copy the slice before popping so the target is never removed from a view of `df`.
    train_X = df.iloc[:n_train, :].copy()
    train_y = train_X.pop('SalePrice')
    # The target is unknown for the test rows; drop the column instead of
    # handing it back as if it were a feature.
    test_X = df.iloc[n_train:, :].drop(columns='SalePrice')
    return train_X, train_y, test_X

### Numerical features

In [None]:
# Numerical columns only, leaving out the row identifier and the target.
num_feats = (
    combined_drop_missed
    .select_dtypes(include='number')
    .drop(columns=['Id', 'SalePrice'])
)
num_feats.columns

In [None]:
# Histogram of every numerical feature, `col_num` plots per row.
col_num = 5
# Ceil-divide so the grid has only as many rows as the plots need; reserving one
# row per feature left most of the figure blank.
row_num = (len(num_feats.columns) + col_num - 1) // col_num
fig = plt.figure(figsize=(30,row_num * col_num))
for index,col in enumerate(num_feats.columns):
    plt.subplot(row_num, col_num, index + 1)
    # NOTE(review): distplot is deprecated in recent seaborn releases; consider
    # histplot once the target seaborn version is confirmed.
    sns.distplot(combined_drop_missed[col].dropna(), kde=False)
plt.tight_layout()

In [None]:
# Numerical columns removed from the frame at this stage.
# NOTE(review): the name suggests these were judged to be dominated by a single
# value in the histograms above — confirm against the plots.
num_only_one_value_feats = ['BsmtFinSF2', 'LowQualFinSF', 'BsmtHalfBath', 'KitchenAbvGr', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']
combined_drop_less_num = combined_drop_missed.drop(columns=num_only_one_value_feats)
combined_drop_less_num.shape

## Regression view of the numerical features

In [None]:
# Numerical features not listed in num_only_one_value_feats, kept in their
# original column order. A set difference loses that order, so the plots would
# be arranged differently from run to run.
dropped_num = set(num_only_one_value_feats)
reset_num_feats = [col for col in num_feats.columns if col not in dropped_num]

train_X, train_y, _ = splitTrainTestFromCombined(combined_drop_less_num)
# Features and target side by side, built once instead of once per subplot.
train_xy = train_X.join(train_y)

# One regression plot of SalePrice against each feature, `col_num` plots per row.
col_num = 5
# Ceil-divide so the grid has only as many rows as the plots need.
row_num = (len(reset_num_feats) + col_num - 1) // col_num
fig = plt.figure(figsize=(30,row_num * col_num))
for index,col in enumerate(reset_num_feats):
    plt.subplot(row_num, col_num, index + 1)
    sns.regplot(data=train_xy, x=col, y='SalePrice')
plt.tight_layout()

In [None]:
# Sale year and month are removed from the frame at this stage.
# NOTE(review): the name suggests they showed no trend against SalePrice in the
# regression plots above — confirm against the plots.
no_regression_feats = ['YrSold', 'MoSold']
combined_drop_noreg_num = combined_drop_less_num.drop(columns=no_regression_feats)
combined_drop_noreg_num.shape

### Categorical features

In [None]:
# Every column that is not numerical is treated as categorical.
cat_feats = combined_drop_missed.select_dtypes(exclude='number')
cat_feats.columns

In [None]:
# Category frequencies of every non-numerical feature, `col_num` plots per row.
col_num = 5
# Ceil-divide so the grid has only as many rows as the plots need; reserving one
# row per feature left most of the figure blank.
row_num = (len(cat_feats.columns) + col_num - 1) // col_num
fig = plt.figure(figsize=(30,row_num * col_num))
for index,col in enumerate(cat_feats.columns):
    plt.subplot(row_num, col_num, index + 1)
    sns.countplot(data=combined_drop_missed, x=col)
plt.tight_layout()

In [None]:
# Categorical columns removed from the frame at this stage.
# NOTE(review): the name suggests these were judged to be dominated by a single
# category in the count plots above — confirm against the plots.
cat_only_one_value_feats = ['Street', 'Utilities', 'LandSlope', 'Condition2', 'RoofMatl', 'Heating', 'GarageCond']
combined_drop_less_cat = combined_drop_noreg_num.drop(columns=cat_only_one_value_feats)
combined_drop_less_cat.shape

### View correlation

In [None]:
# Pairwise correlation of the remaining numerical columns; only the row identifier is left out.
num_feats = combined_drop_less_cat.select_dtypes(include=['number']).drop(['Id'], axis=1)

plt.figure(figsize=(14,12))
correlation = num_feats.corr()
# Cells with a correlation below 0.8 are masked, so only strongly positively
# correlated pairs are drawn. `linewidths` is heatmap's documented parameter for
# the cell borders; the previous `linewidth` spelling is not a heatmap parameter
# and was only passed through to matplotlib.
sns.heatmap(correlation, mask = correlation < 0.8, linewidths=0.5, cmap='Reds')

In [None]:
combined_drop_less_cat.iloc[:train.shape[0],:]

In [None]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
import eli5
from eli5.sklearn import PermutationImportance

def displayFeatureWeight(combined_df, top=15):
    """Show the permutation importance of the features in a stacked train+test frame.

    The training rows are extracted with ``splitTrainTestFromCombined``, every
    non-numerical column is replaced by integer codes, an ``LGBMRegressor`` is
    fitted on one part of a train/validation split and the permutation
    importance is measured on the other part.

    Parameters
    ----------
    combined_df : pandas.DataFrame
        Stacked train+test frame containing a 'SalePrice' column. Not modified.
    top : int, optional
        Number of features to list in the output (default 15, as before).

    Returns
    -------
    The object returned by ``eli5.show_weights``.
    """
    train_X, train_y, _ = splitTrainTestFromCombined(combined_df)
    # Encode on an explicit copy so the integer codes are never written into a
    # slice of the caller's frame.
    train_X = train_X.copy()
    for col in train_X.select_dtypes(exclude='number').columns:
        # factorize() returns (codes, uniques); only the codes are needed.
        train_X[col] = train_X[col].factorize()[0]

    t_X, v_X, t_y, v_y = train_test_split(
        train_X,
        train_y,
        random_state=21
    )

    # Fit on the training part and permute on the held-out part, so the
    # importance is measured on rows the model has not seen.
    perm = PermutationImportance(LGBMRegressor().fit(t_X, t_y), random_state=1).fit(v_X, v_y)
    return eli5.show_weights(perm, feature_names = train_X.columns.tolist(), top=top)

displayFeatureWeight(combined_drop_less_cat)

In [None]:
# Columns removed from the frame at this stage.
# NOTE(review): presumably one member of each strongly correlated pair from the
# heatmap above, chosen with the help of the permutation importances — confirm.
high_correlation_feats = ['1stFlrSF', 'TotRmsAbvGrd', 'GarageYrBlt', 'GarageArea']
combined_droped = combined_drop_less_cat.drop(columns=high_correlation_feats)
combined_droped.shape

### Remove outlier data

In [None]:
# Features inspected for outliers against SalePrice.
topRelatedFeatures = ['OverallQual', 'GrLivArea', 'BsmtFinSF1', 'TotalBsmtSF', 'YearBuilt', 'GarageCars', 'OverallCond', 'OpenPorchSF', 'Fireplaces', 'LotArea', 'Neighborhood', 'YearRemodAdd']
train_X, train_y, _ = splitTrainTestFromCombined(combined_droped)
# Features and target side by side, built once instead of once per subplot.
train_xy = train_X.join(train_y)

col_num = 5
# Ceil-divide so the grid has only as many rows as the plots need; reserving one
# row per feature left most of the figure blank.
row_num = (len(topRelatedFeatures) + col_num - 1) // col_num
plt.figure(figsize=(30, row_num * col_num))

for index, col in enumerate(topRelatedFeatures):
    plt.subplot(row_num, col_num, index + 1)
    if train_X[col].dtype == 'object':
        # A regression line needs a numerical x, so string columns get a plain scatter.
        sns.scatterplot(data=train_xy, x=col, y="SalePrice")
    else:
        sns.regplot(data=train_xy, x=col, y="SalePrice")
plt.tight_layout()

In [None]:
def dropByCondition(df, condition):
    """Return ``df`` without the rows for which ``condition`` is true.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. Not modified.
    condition : pandas.Series or sequence of bool
        Row mask, one entry per row of ``df``; True marks a row to remove.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the rows where ``condition`` is false.
    """
    # Filter with the mask itself rather than dropping by index label: a label
    # drop also removes every other row that shares a label with a matching row.
    if isinstance(condition, pd.Series):
        keep = ~condition.astype(bool)
    else:
        keep = ~np.asarray(condition, dtype=bool)
    return df.loc[keep]

In [None]:
# Rebuild the training frame (features plus SalePrice) from the column-filtered
# data; test_X is kept for the submission step.
# NOTE(review): this rebinds the notebook-level `train`, whose row count
# splitTrainTestFromCombined reads. The row count is unchanged here, but
# re-running earlier cells out of order after this point is fragile.
train_X, train_y, test_X = splitTrainTestFromCombined(combined_droped)
train = train_X.join(train_y)

In [None]:
train_truncated = train.copy()

In [None]:
# Outliers: OverallQual of 10 but sold for less than 200,000.
train_truncated = dropByCondition(train_truncated, (train_truncated['OverallQual'] == 10) & (train_truncated['SalePrice'] < 200000))
train_truncated.shape

In [None]:
# Outliers: GrLivArea above 4000 but sold for less than 200,000.
train_truncated = dropByCondition(train_truncated, (train_truncated['GrLivArea'] > 4000) & (train_truncated['SalePrice'] < 200000))
train_truncated.shape

In [None]:
# Outliers: BsmtFinSF1 strictly between 0 and 1000 but sold for more than 600,000.
train_truncated = dropByCondition(train_truncated, (train_truncated['BsmtFinSF1'] > 0) & (train_truncated['BsmtFinSF1'] < 1000) & (train_truncated['SalePrice'] > 600000))
train_truncated.shape

In [None]:
# Outliers: every remaining row sold for more than 600,000.
# NOTE(review): this condition also covers every row matched by the BsmtFinSF1
# filter in the previous cell, which makes that filter redundant.
train_truncated = dropByCondition(train_truncated, train_truncated['SalePrice'] > 600000)
train_truncated.shape

In [None]:
# Outliers: LotArea above 150,000 but sold for less than 400,000.
train_truncated = dropByCondition(train_truncated, (train_truncated['LotArea']> 150000) & (train_truncated['SalePrice'] < 400000))
train_truncated.shape

## Add new features

In [None]:
def addHasBsmtFinSF1Feat(df):
    """Return a copy of ``df`` with a boolean 'HasBsmtFinSF1' column.

    The flag is True where 'BsmtFinSF1' is non-zero. A missing value counts as
    zero: a plain ``!= 0`` comparison evaluates to True for NaN, which would flag
    rows with an unknown area as having one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'BsmtFinSF1' column. Not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra column.
    """
    my_df = df.copy()
    my_df['HasBsmtFinSF1'] = my_df['BsmtFinSF1'].fillna(0) != 0
    return my_df

In [None]:
def addHasBasementFeat(df):
    """Return a copy of ``df`` with a boolean 'HasBasement' column.

    The flag is True where 'TotalBsmtSF' is non-zero. A missing value counts as
    zero: a plain ``!= 0`` comparison evaluates to True for NaN, which would flag
    rows with an unknown area as having a basement.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'TotalBsmtSF' column. Not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra column.
    """
    my_df = df.copy()
    my_df['HasBasement'] = my_df['TotalBsmtSF'].fillna(0) != 0
    return my_df

In [None]:
def addHasOpenPorchSFFeat(df):
    """Return a copy of ``df`` with a boolean 'HasOpenPorchSF' column.

    The flag is True where 'OpenPorchSF' is non-zero. A missing value counts as
    zero: a plain ``!= 0`` comparison evaluates to True for NaN, which would flag
    rows with an unknown area as having a porch.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'OpenPorchSF' column. Not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra column.
    """
    my_df = df.copy()
    my_df['HasOpenPorchSF'] = my_df['OpenPorchSF'].fillna(0) != 0
    return my_df

In [None]:
def addNewFeats(df):
    """Return a copy of ``df`` with the three presence-flag columns added.

    Applies ``addHasBsmtFinSF1Feat``, ``addHasBasementFeat`` and
    ``addHasOpenPorchSFFeat`` in that order. ``df`` itself is not modified.
    """
    enriched = df.copy()
    for add_flag in (addHasBsmtFinSF1Feat, addHasBasementFeat, addHasOpenPorchSFFeat):
        enriched = add_flag(enriched)
    return enriched

In [None]:
train_with_new_feats = addNewFeats(train_truncated)
train_with_new_feats.shape

In [None]:
# Final features/target split of the cleaned and enriched training rows.
train_y = train_with_new_feats['SalePrice'].copy()
train_X = train_with_new_feats.drop(columns='SalePrice')
train_X.shape

## Pipeline for encoding and imputing missing values

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Newer scikit-learn releases name the dense-output switch `sparse_output` and no
# longer accept `sparse`; older releases only know `sparse`. Try the current
# name first and fall back, so the notebook runs on either.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Categorical columns: fill gaps with the most frequent category, then one-hot
# encode. Categories unseen during fit are ignored at transform time.
categorical_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("oh-encode", one_hot_encoder),
    ]
)

# Numerical columns: fill gaps with the column mean, then standardize.
numeric_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="mean")),
        ("scale", StandardScaler()),
    ]
)

In [None]:
from sklearn.compose import ColumnTransformer

# 'Id' is a row identifier, not a property of the house. The exploratory cells
# already exclude it, so keep it out of the model inputs as well.
num_feats = train_X.select_dtypes(include='number').columns.drop('Id', errors='ignore')
cat_feats = train_X.select_dtypes(exclude='number').columns

# Columns are selected by name, so any column not listed here (including 'Id')
# is dropped from the transformed output.
full_processor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipeline, num_feats),
        ("categorical", categorical_pipeline, cat_feats),
    ]
)

ready_train_X = full_processor.fit_transform(train_X)
ready_train_X.shape

In [None]:
# The target must not be imputed: a training row without a price carries no
# label. The previous imputer also ran after the integer cast, so it could never
# see a missing value. Fail loudly instead of inventing prices.
if train_y.isna().any():
    raise ValueError("train_y contains missing SalePrice values")
# Column vector of integer prices; later cells flatten it with .ravel().
ready_train_y = train_y.to_numpy().reshape(-1, 1).astype(int)

## Quick testing with the model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from lightgbm import LGBMRegressor

# Hold out part of the preprocessed training data for a quick sanity check.
X_split_train, X_split_test, y_split_train, y_split_test = train_test_split(
    ready_train_X,
    ready_train_y,
    random_state=21
)

# Baseline: LightGBM regressor with default hyper-parameters.
clf = LGBMRegressor()
# ravel() flattens the (n, 1) target into the 1-D shape passed to fit.
clf.fit(X_split_train, y_split_train.ravel())
preds = clf.predict(X_split_test)
# Mean squared logarithmic error on the held-out part (not its square root).
mean_squared_log_error(y_split_test, preds)

## GridSearchCV

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV

# Hyper-parameter grid: 3 * 2 * 2 * 2 * 2 = 48 combinations.
param_grid = {
    'max_depth' : [8, 10, 12],
    'learning_rate' : [0.01, 0.1],
    'n_estimators' : [500, 1000],
    'feature_fraction' : [0.6, 0.8],
    'min_child_samples' : [15, 20]
}

lgb_model = LGBMRegressor()

# Exhaustive search with 5-fold cross-validation on all available cores;
# refit=True retrains the best combination on the full data afterwards.
# NOTE(review): no `scoring` is passed, so the estimator's default score decides
# the winner rather than the log-error used in the quick test — confirm this is intended.
grid_cv = GridSearchCV(
    lgb_model,
    param_grid,
    n_jobs=-1,
    cv=5,
    refit=True
)

# 48 combinations x 5 folds = 240 fits, plus the final refit.
_ = grid_cv.fit(ready_train_X, ready_train_y.ravel())

In [None]:
grid_cv.best_score_, grid_cv.best_params_

## Setup model and train

In [None]:
# Final model: LightGBM with the best hyper-parameters found by the grid search,
# trained on the complete preprocessed training set.
# NOTE(review): grid_cv was created with refit=True and fitted on the same data,
# so grid_cv.best_estimator_ should already be an equivalent fitted model —
# confirm and consider reusing it instead of training again.
clf = LGBMRegressor(
    **grid_cv.best_params_
)

clf.fit(ready_train_X, ready_train_y.ravel())

## Submit

In [None]:
# Give the test rows the same engineered flags as the training rows, then reuse
# the already-fitted preprocessor (transform only, no refit on the test data).
transformed_test = addNewFeats(test_X)
ready_test = full_processor.transform(transformed_test)

predection = clf.predict(ready_test)
# Ids come from the untouched `test` frame loaded at the top.
# NOTE(review): assumes test_X still has the same rows in the same order as `test` — confirm.
my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': predection})
my_submission

In [None]:
my_submission.to_csv('submission.csv', index=False)