<h2 style="font-weight: bold">House Prices Competition</h2>

<h4>This is my second published notebook on Kaggle, so it's no wonder that it's about the House Prices Competition 😄😄<br><br>I will start with a simple EDA and then move on to more advanced EDA, data visualization and pre-processing. I will also test different approaches and regression techniques to improve my score.<br></h4>

* <h5 style="font-weight: 700">Your feedback is very welcome</h5>
* <h5 style="font-weight: 700">If you find this notebook useful, please don't forget to upvote it!</h5>


In [None]:
# Required packages
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from xgboost import XGBRegressor

In [None]:
# Competition files live in a single Kaggle input directory
DATA_DIR = '../input/house-prices-advanced-regression-techniques'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# Preview the first rows of the training set
train.head()

#  **Exploratory Data Analysis**

In [None]:
# Report the size (rows x columns) of both data sets
for label, frame in (("Training Set", train), ("Test Set", test)):
    n_rows, n_columns = frame.shape
    print(f"{label} :\n Number of rows : {n_rows}, Number of Columns : {n_columns}")

In [None]:
# Split the columns by dtype: object columns are categorical, all others numeric
is_object = (train.dtypes == 'object').to_numpy()
# numeric columns, without the Id identifier
num_cols = train.columns[~is_object].drop('Id')
num_train = train[num_cols]
# categorical columns
cat_cols = train.columns[is_object]
cat_train = train[cat_cols]

print("Total Numerical Cols : ", len(num_cols))
print("Total Categorical Cols : ", len(cat_cols))

In [None]:
# summary statistics of the numerical variables
train.describe()

In [None]:
# dtype and non-null count of every column (numerical and categorical)
train.info()

In [None]:
# let's clean visualizations :)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Distribution of SalePrice, with a fitted normal curve for comparison.
# histplot replaces the deprecated sns.distplot; the normal curve is drawn
# explicitly so that the legend entry is attached to the right line.
(mu, sigma) = norm.fit(train['SalePrice'])
ax = sns.histplot(train['SalePrice'], stat='density', kde=True)
x_min, x_max = ax.get_xlim()
grid = np.linspace(x_min, x_max, 200)
# raw string: '\m' and '\s' are not valid escape sequences in a normal string
ax.plot(grid, norm.pdf(grid, mu, sigma), color='black',
        label=r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma))
ax.set_xlim(x_min, x_max)  # undo the margin added by plotting the curve
ax.legend(loc='best')
ax.set_ylabel('Density')  # the histogram is normalised, so this is not a count
ax.set_title('SalePrice distribution');

In [None]:
# log(1 + SalePrice): the target after the log transform, reused later as `price`
price = np.log1p(train["SalePrice"])

# Distribution of the transformed target, with a fitted normal curve for comparison.
# histplot replaces the deprecated sns.distplot; the normal curve is drawn
# explicitly so that the legend entry is attached to the right line.
(mu, sigma) = norm.fit(price)
ax = sns.histplot(price, stat='density', kde=True)
x_min, x_max = ax.get_xlim()
grid = np.linspace(x_min, x_max, 200)
# raw string: '\m' and '\s' are not valid escape sequences in a normal string
ax.plot(grid, norm.pdf(grid, mu, sigma), color='black',
        label=r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma))
ax.set_xlim(x_min, x_max)  # undo the margin added by plotting the curve
ax.legend(loc='best')
ax.set_xlabel('log(1 + SalePrice)')
ax.set_ylabel('Density')  # the histogram is normalised, so this is not a count
ax.set_title('log(1 + SalePrice) distribution');

In [None]:
# Number of sales per year, most common year first, with the count above each bar
fig, ax = plt.subplots(figsize=(18, 10))
plots = train["YrSold"].value_counts().plot(kind="bar", ax=ax)
for patch in plots.patches:
    height = patch.get_height()
    x_center = patch.get_x() + patch.get_width() / 2
    plots.annotate(f"{height:.0f}", (x_center, height),
                   ha='center', va='center', size=15,
                   xytext=(0, 8), textcoords='offset points')
plt.title("Houses Sold over the Years")
plt.ylabel("Number")
plt.show()

In [None]:
# Number of sales per sale type, most common type first, with the count above each bar
fig, ax = plt.subplots(figsize=(18, 10))
plots = train["SaleType"].value_counts().plot(kind="bar", ax=ax)
for patch in plots.patches:
    height = patch.get_height()
    x_center = patch.get_x() + patch.get_width() / 2
    plots.annotate(f"{height:.0f}", (x_center, height),
                   ha='center', va='center', size=15,
                   xytext=(0, 8), textcoords='offset points')
plt.title("Most purchased Sale Type")
plt.ylabel("Frequency")
plt.xlabel("Sale Type")
plt.show()

In [None]:
### Feature importances

In [None]:
# Rank the features with a quick LightGBM model fitted on label-encoded data.
# Missing categorical values become their own "NULL" level.
tmp_df = train.copy()
tmp_df[cat_cols] = tmp_df[cat_cols].fillna('NULL')
# label encoding of every categorical column
for col in cat_cols:
    tmp_df[col] = LabelEncoder().fit_transform(tmp_df[col])
# train data
X_train = tmp_df.drop(['SalePrice', 'Id'], axis=1)
y_train = tmp_df['SalePrice']
lgb_train = lgb.Dataset(X_train, y_train)
params = {'objective': 'regression', 'metric': 'rmse'}
gbm = lgb.train(params, lgb_train)
# importance table, most important feature first
feat_importances = pd.DataFrame(
    {'importance': gbm.feature_importance()},
    index=X_train.columns,
).sort_values('importance', ascending=False)
# Label each row from the sorted index itself. Iterating the original column
# order here would attach the numeric/categorical label to the wrong feature.
feat_importances['dtype'] = [
    'numeric' if feat in num_cols else 'categorical'
    for feat in feat_importances.index
]
feat_importances.head()

### Numeric features

In [None]:
# correlation between numerical values
# Only numeric columns are correlated: DataFrame.corr() raises on object
# columns in pandas >= 2.0 instead of silently dropping them.
numeric_frame = train.drop('Id', axis=1).select_dtypes(include='number')
plt.figure(figsize=(10, 9))
sns.heatmap(numeric_frame.corr(), cmap='YlGnBu')
plt.title('Correlation between numeric features');

In [None]:
# Correlation of every numeric column with SalePrice, sorted in ascending order
saleprice_corr = num_train.corr()['SalePrice']
ax = saleprice_corr.sort_values().plot(kind='barh', figsize=(14,9))
ax.set_title('Correlation coefficient with SalePrice', fontsize=14)
ax.set_xlabel('correlation coefficient')
ax.set_ylabel('features')

In [None]:
# plot importance of numeric features
# Drop the target by name rather than by position, so the selection does not
# depend on SalePrice being the last numeric column.
numeric_feature_names = [col for col in num_cols if col != 'SalePrice']
num_feat_importances = feat_importances.loc[numeric_feature_names].sort_values('importance', ascending=False)
ax = num_feat_importances.plot.bar(figsize=(14,5))
ax.set_title('Importance of Numeric Features', fontsize=14)
ax.set_xlabel('features')
ax.set_ylabel('count')

#  **Preprocessing**

In [None]:
# cross validation
def rmsle_cv(model, X=None, y=None, n_splits=5, random_state=42):
    """Return the per-fold RMSE of `model` on the log-transformed sale price.

    Because the target is log1p(SalePrice), the RMSE computed here is the
    RMSLE of the untransformed prices.

    Parameters
    ----------
    model : estimator accepted by sklearn's cross_val_score.
    X : feature table; defaults to the training frame without SalePrice.
        (The original body read an undefined `train_data` name.)
    y : target; defaults to the notebook-level `price` (log1p of SalePrice).
    n_splits, random_state : forwarded to KFold.

    Returns
    -------
    Array with one RMSE value per fold.
    """
    if X is None:
        X = train.drop(columns=['SalePrice'])
    if y is None:
        y = price

    # Pass the splitter itself. Calling .get_n_splits() on it only returns the
    # integer number of folds, which silently discards shuffle/random_state.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    neg_mse = cross_val_score(
        model,
        X,
        y,
        scoring="neg_mean_squared_error",
        cv=kf,
    )
    return np.sqrt(-neg_mse)

In [None]:
# Numerical columns  (int64/float64)
def get_numerical_cols(threshold=15, df=None, exclude=('SalePrice', 'Id')):
    """Return the names of numeric columns that have more than `threshold` distinct values.

    Parameters
    ----------
    threshold : a column is kept only if its number of distinct values is
        strictly greater than this (previously the argument was ignored and
        15 was hard-coded).
    df : frame to inspect; defaults to the notebook-level `train`.
    exclude : column names that are never returned. The target and the row
        identifier are excluded by default so that neither is used as a
        model feature.

    Returns
    -------
    List of column names, in the frame's column order.
    """
    if df is None:
        df = train
    numeric = df.select_dtypes(include='number')
    return [
        name
        for name in numeric.columns
        if name not in exclude and numeric[name].nunique() > threshold
    ]

In [None]:
# Category columns
def get_cat_cols(type='O', threshold=15):
    """Return the `train` columns of dtype `type` with at most `threshold` distinct values.

    Columns are returned in the order in which they appear in `train`.
    """
    return [
        name
        for name in train.columns
        if train[name].dtype == type and train[name].nunique() <= threshold
    ]

In [None]:
# A column that is missing in most rows carries little usable information
field_missing = train['MiscFeature'].isna().sum()
# The '%' format spec multiplies by 100; the old ':.2f}%' printed the raw fraction
print(f'Missing field in MiscFeature column {field_missing} ({field_missing/len(train):.2%})')

In [None]:
def feature_selection_util(type='O', cat_threshold=15, missing_threshold=100, is_cat=True):
    """Select candidate feature columns that have few missing values.

    Parameters
    ----------
    type : dtype passed to get_cat_cols when `is_cat` is true.
    cat_threshold : distinct-value threshold forwarded to get_cat_cols
        (is_cat=True) or get_numerical_cols (is_cat=False).
    missing_threshold : a column is kept only if it has fewer than this many
        missing values in `train` (previously ignored; 100 was hard-coded).
    is_cat : choose categorical (True) or numerical (False) candidates.

    Returns
    -------
    List of selected column names.
    """
    if is_cat:
        columns = get_cat_cols(type=type, threshold=cat_threshold)
    else:
        columns = get_numerical_cols(threshold=cat_threshold)
    return [
        feature
        for feature in columns
        if train[feature].isna().sum() < missing_threshold
    ]

In [None]:
def _categorical_transformer():
    """Build the impute-then-one-hot pipeline used for both categorical groups."""
    return Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                           ('ohe', OneHotEncoder(handle_unknown='ignore'))])


# string-valued categories
str_cat_features = feature_selection_util()
str_cat_transformer = _categorical_transformer()

# integer-valued categories
int_cat_features = feature_selection_util(type='int64')
int_cat_transformer = _categorical_transformer()

# numerical columns: impute with the most frequent value, then standardise
numerical_features = feature_selection_util(is_cat=False)
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scale', StandardScaler()),
])

# one transformer per column group, combined into a single preprocessing step
preprocessing = ColumnTransformer(transformers=[
    ('str_cat', str_cat_transformer, str_cat_features),
    ('int_cat', int_cat_transformer, int_cat_features),
    ('num_col', numerical_transformer, numerical_features),
])


#  **Training and Predicting**

In [None]:
def cross_validate(model,scoring='neg_root_mean_squared_error'):
    """Return the mean cross-validation score of `model` on the training set.

    The features are every column of `train` except SalePrice and the target
    is the untransformed SalePrice. The value is reported on the scale of
    `scoring`, so with the default "neg_" scorer it is negative.
    """
    target = train['SalePrice']
    features = train.drop(columns=['SalePrice'])
    fold_scores = cross_val_score(model, features, target, scoring=scoring)
    return fold_scores.mean()

In [None]:
# Random forest baseline: preprocessing and regressor chained in one pipeline
model = RandomForestRegressor(random_state=42)
rfr_steps = [('preprocessing', preprocessing), ('model', model)]
model_RFR = Pipeline(steps=rfr_steps)
cv_score_RFR = cross_validate(model_RFR)

print(cv_score_RFR)

In [None]:
# Gradient-boosted trees behind the same preprocessing step.
# subsample / colsample_bytree make training stochastic, so the seed is fixed
# to keep the cross-validation score and the submission reproducible.
model = XGBRegressor(n_estimators=350, learning_rate=0.05, max_depth=4,
                     subsample=0.7, colsample_bytree=0.5, random_state=42)
model_XGB = Pipeline(steps=[('preprocessing', preprocessing),
                       ('model', model)])
cv_score_XGB = cross_validate(model_XGB)

print(cv_score_XGB)

In [None]:
# Fit the XGBoost pipeline on the complete training set ...
y = train['SalePrice']
X = train.drop(columns=['SalePrice'])
model_XGB.fit(X, y)

# ... then predict the sale price of every house in the test set
predictions = model_XGB.predict(test)

In [None]:
# Submission table: one predicted SalePrice per test Id
output = pd.DataFrame({'Id': test['Id'], 'SalePrice': predictions})
# to_csv returns None when given a path, so its result must not be assigned
# back to `output` (that replaced the DataFrame with None)
output.to_csv('submission.csv', index=False)
print('done')