#### Imports

In [None]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

## Explore and analyze data

In [None]:
# Load the house-prices training and test tables from the competition input folder.
X_Train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
X_Test  = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

# Disabled starter code, kept for reference only: it refers to `X_full`, a name
# that is not defined anywhere in the visible notebook. SalePrice therefore stays
# inside X_Train until features and target are separated further down.
# X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)
# y = X_full.SalePrice
# X_full.drop(['SalePrice'], axis=1, inplace=True)


In [None]:
# Preview the first rows of the training table.
X_Train.head()

#### How many features have null values

In [None]:
# Boolean flag per column: True when the column holds at least one missing value.
has_nulls = X_Train.isnull().any()
num_null_cols = has_nulls[has_nulls].index.tolist()
print(f"Number of features have null values: {len(num_null_cols)}")
print()

print("Each feature with the corresponding null value count")
# Last expression of the cell: shown as the per-column missing-value count.
X_Train.isnull().sum()

In [None]:
# Name of the column the model is trained to predict.
Target = 'SalePrice'

#### Checking the type of the null values

In [None]:
a = [np.nan, None, [], {}, 'NaN', 'Null','NULL','None','NA','?','-', '.','', ' ', '   ']

# Everything after the first two entries (np.nan and None) is a placeholder that
# .isnull() would NOT flag; the slice is the same for every column, so take it once.
placeholder_tokens = a[2:]

# Per column: name, count of real nulls, count of cells equal to a placeholder token.
for c in X_Train.columns:
    column = X_Train[c]
    placeholder_hits = sum(1 for x in column if x in placeholder_tokens)
    print(c, column.isnull().sum(), placeholder_hits)

#### Draw a histogram of each numeric feature

In [None]:
# One histogram per column that DataFrame.hist can plot, on a 20x20 inch figure.
X_Train.hist(figsize=(20,20))

In [None]:
# Column dtypes and non-null counts of the training table.
X_Train.info()

## Impute missing values

In [None]:
# Percentage of missing values per training column (indexed by column name).
percent_missing = X_Train.isnull().sum() * 100 / len(X_Train)
missing_value_df = pd.DataFrame({'column_name': X_Train.columns,
                                 'percent_missing': percent_missing})

# Select with boolean masks instead of looping over integer positions:
# missing_value_df is indexed by column name, so `series[i]` with an integer
# only worked through pandas' deprecated positional fallback.
missing_pct = missing_value_df['percent_missing']

# Columns with some, but at most 5 %, missing values get imputed ...
impute_lst = missing_value_df.loc[(missing_pct > 0) & (missing_pct <= 5), 'column_name'].tolist()
# ... and columns with more than 5 % missing values get dropped.
throw_lst = missing_value_df.loc[missing_pct > 5, 'column_name'].tolist()


# Complete the codes below by uncommenting and changing the values of features_to_impute and features_to_throw. 
# Each should be a list of feature names (e.g. ['LotFrontage','Alley',...]). Do not change the variable names. 
# There are hidden tests which will grade above three questions.

features_to_impute = impute_lst
features_to_throw = throw_lst

print("Features to Impute: ")
print(len(features_to_impute), features_to_impute)
print()
print("Features to Throw: ")
print(len(features_to_throw), features_to_throw)

In [None]:
# Remove the high-missing-rate features and the row identifier in one call.
X_Train.drop(columns=[*features_to_throw, 'Id'], inplace=True)

In [None]:
# Preview the training table after the drops above.
X_Train.head()

In [None]:
# Number of columns remaining in the training table ("lift" presumably means "left").
print(f"Num of features lift: {X_Train.shape[1]}")

In [None]:
# Fill the remaining gaps in the training table column by column.
# Each fill value is computed from the training column itself and kept in a
# named variable: the median for MasVnrArea, the most frequent value (mode)
# for every other column handled here.
MasVnrArea_median=X_Train['MasVnrArea'].median()
X_Train['MasVnrArea']=X_Train['MasVnrArea'].fillna(MasVnrArea_median)

# .mode() returns a Series (there can be ties); [0] takes its first entry.
MasVnrType_mode=X_Train['MasVnrType'].mode()[0]
X_Train['MasVnrType']=X_Train['MasVnrType'].fillna(MasVnrType_mode)


######## 'BsmtQual' and 'BsmtCond' #############
BsmtQual_mode=X_Train['BsmtQual'].mode()[0]
X_Train['BsmtQual']=X_Train['BsmtQual'].fillna(BsmtQual_mode)

BsmtCond_mode=X_Train['BsmtCond'].mode()[0]
X_Train['BsmtCond']=X_Train['BsmtCond'].fillna(BsmtCond_mode)

######## 'BsmtExposure' and 'BsmtFinType1' #############
BsmtExposure_mode=X_Train['BsmtExposure'].mode()[0]
X_Train['BsmtExposure']=X_Train['BsmtExposure'].fillna(BsmtExposure_mode)

BsmtFinType1_mode=X_Train['BsmtFinType1'].mode()[0]
X_Train['BsmtFinType1']=X_Train['BsmtFinType1'].fillna(BsmtFinType1_mode)

######## 'BsmtFinType2' and 'Electrical' #############
BsmtFinType2_mode=X_Train['BsmtFinType2'].mode()[0]
X_Train['BsmtFinType2']=X_Train['BsmtFinType2'].fillna(BsmtFinType2_mode)

Electrical_mode=X_Train['Electrical'].mode()[0]
X_Train['Electrical']=X_Train['Electrical'].fillna(Electrical_mode)

# Last expression of the cell: preview of the imputed training table.
X_Train.head()

In [None]:
# Sanity check after imputation: column count and remaining nulls per column.
print(f"Num of features lift: {X_Train.shape[1]}")
print(X_Train.isnull().sum())
print("\nAwesome!!")

## Apply the same imputation to the test set

In [None]:
# Percentage of missing values per test column (indexed by column name).
percent_missing_test = X_Test.isnull().sum() * 100 / len(X_Test)
missing_value_df_test = pd.DataFrame({'column_name': X_Test.columns,
                                 'percent_missing': percent_missing_test})

# Select with boolean masks instead of looping over integer positions:
# missing_value_df_test is indexed by column name, so `series[i]` with an
# integer only worked through pandas' deprecated positional fallback.
missing_pct_test = missing_value_df_test['percent_missing']

# Some, but at most 5 %, missing -> impute; more than 5 % missing -> drop.
impute_lst_2 = missing_value_df_test.loc[
    (missing_pct_test > 0) & (missing_pct_test <= 5), 'column_name'
].tolist()
throw_lst_2 = missing_value_df_test.loc[missing_pct_test > 5, 'column_name'].tolist()


features_to_impute_test = impute_lst_2
features_to_throw_test = throw_lst_2

print("Features to Impute: ")
print(len(features_to_impute_test), features_to_impute_test)
print()
print("Features to Throw: ")
print(len(features_to_throw_test), features_to_throw_test)

In [None]:
# Remove the high-missing-rate test features and the row identifier in one call.
X_Test.drop(columns=[*features_to_throw_test, 'Id'], inplace=True)

print(f"Num of features lift: {X_Test.shape[1]}")

#### Decide whether to impute with the mean (symmetric distribution) or the median (skewed distribution)

In [None]:
# One figure per test feature that needs imputing, titled with the feature name.
for feature in features_to_impute_test:
    axes = X_Test[feature].hist()
    axes.set_title(feature)
    plt.show()

In [None]:
# For every test feature to impute: name, number of distinct values, dtype.
for feature in features_to_impute_test:
    column = X_Test[feature]
    distinct_count = len(column.unique())
    print(feature, distinct_count, column.dtype)

In [None]:
# Fill the remaining gaps in the test table column by column.
# Each fill value is computed from the test column itself and kept in a named
# variable: median for the area/square-footage columns, mean for GarageArea,
# and the most frequent value (mode) for everything else.
# (The TotalBsmtSF imputation used to appear twice in a row; the second copy
# could not change anything and has been removed.)
MasVnrArea_median_test=X_Test['MasVnrArea'].median()
X_Test['MasVnrArea']=X_Test['MasVnrArea'].fillna(MasVnrArea_median_test)

# .mode() returns a Series (there can be ties); [0] takes its first entry.
MasVnrType_mode_test=X_Test['MasVnrType'].mode()[0]
X_Test['MasVnrType']=X_Test['MasVnrType'].fillna(MasVnrType_mode_test)

MSZoning_mode_test=X_Test['MSZoning'].mode()[0]
X_Test['MSZoning']=X_Test['MSZoning'].fillna(MSZoning_mode_test)

Utilities_mode_test=X_Test['Utilities'].mode()[0]
X_Test['Utilities']=X_Test['Utilities'].fillna(Utilities_mode_test)

Exterior1st_mode_test=X_Test['Exterior1st'].mode()[0]
X_Test['Exterior1st']=X_Test['Exterior1st'].fillna(Exterior1st_mode_test)

Exterior2nd_mode_test=X_Test['Exterior2nd'].mode()[0]
X_Test['Exterior2nd']=X_Test['Exterior2nd'].fillna(Exterior2nd_mode_test)

BsmtFinSF1_median_test=X_Test['BsmtFinSF1'].median()
X_Test['BsmtFinSF1']=X_Test['BsmtFinSF1'].fillna(BsmtFinSF1_median_test)

BsmtFinSF2_median_test=X_Test['BsmtFinSF2'].median()
X_Test['BsmtFinSF2']=X_Test['BsmtFinSF2'].fillna(BsmtFinSF2_median_test)

BsmtUnfSF_median_test=X_Test['BsmtUnfSF'].median()
X_Test['BsmtUnfSF']=X_Test['BsmtUnfSF'].fillna(BsmtUnfSF_median_test)

TotalBsmtSF_median_test=X_Test['TotalBsmtSF'].median()
X_Test['TotalBsmtSF']=X_Test['TotalBsmtSF'].fillna(TotalBsmtSF_median_test)

BsmtFullBath_mode_test=X_Test['BsmtFullBath'].mode()[0]
X_Test['BsmtFullBath']=X_Test['BsmtFullBath'].fillna(BsmtFullBath_mode_test)

BsmtHalfBath_mode_test=X_Test['BsmtHalfBath'].mode()[0]
X_Test['BsmtHalfBath']=X_Test['BsmtHalfBath'].fillna(BsmtHalfBath_mode_test)

KitchenQual_mode_test=X_Test['KitchenQual'].mode()[0]
X_Test['KitchenQual']=X_Test['KitchenQual'].fillna(KitchenQual_mode_test)

Functional_mode_test=X_Test['Functional'].mode()[0]
X_Test['Functional']=X_Test['Functional'].fillna(Functional_mode_test)

GarageCars_mode_test=X_Test['GarageCars'].mode()[0]
X_Test['GarageCars']=X_Test['GarageCars'].fillna(GarageCars_mode_test)

# GarageArea is the one column filled with the mean rather than the median.
GarageArea_mean_test=X_Test['GarageArea'].mean()
X_Test['GarageArea']=X_Test['GarageArea'].fillna(GarageArea_mean_test)

SaleType_mode_test=X_Test['SaleType'].mode()[0]
X_Test['SaleType']=X_Test['SaleType'].fillna(SaleType_mode_test)

BsmtQual_mode_test=X_Test['BsmtQual'].mode()[0]
X_Test['BsmtQual']=X_Test['BsmtQual'].fillna(BsmtQual_mode_test)

BsmtCond_mode_test=X_Test['BsmtCond'].mode()[0]
X_Test['BsmtCond']=X_Test['BsmtCond'].fillna(BsmtCond_mode_test)

BsmtExposure_mode_test=X_Test['BsmtExposure'].mode()[0]
X_Test['BsmtExposure']=X_Test['BsmtExposure'].fillna(BsmtExposure_mode_test)

BsmtFinType1_mode_test=X_Test['BsmtFinType1'].mode()[0]
X_Test['BsmtFinType1']=X_Test['BsmtFinType1'].fillna(BsmtFinType1_mode_test)

BsmtFinType2_mode_test=X_Test['BsmtFinType2'].mode()[0]
X_Test['BsmtFinType2']=X_Test['BsmtFinType2'].fillna(BsmtFinType2_mode_test)


# Last expression of the cell: preview of the imputed test table.
X_Test.head()

In [None]:
# Sanity check after imputation: column count and remaining nulls per test column.
print(f"Num of features lift: {X_Test.shape[1]}")
print(X_Test.isnull().sum())
print("\nAwesome!!!!")

## Encode Categorical Features

In [None]:
# Names of the test-table columns whose dtype is 'object'; these are the
# columns treated as categorical in the encoding step.
cat_lst = [name for name in X_Test.columns if X_Test[name].dtype == 'object']

print(cat_lst)
print()
print(f"Num of Categorical Features to encode: {len(cat_lst)}")

##### This function one-hot encodes the categorical features of the concatenated train + test dataframe, so that both parts end up with the same dummy columns

<small>Note that the Function implementation is inspired by krishnaik06</small>

In [None]:
def onehot_encoding_categorical_concat(multcolumns, df=None, prefix=False):
    """One-hot encode the given categorical columns of a dataframe.

    Every column named in ``multcolumns`` is replaced by its dummy columns
    (first category dropped). The dummy columns are appended after the
    untouched columns, in the order the fields were given. The name of each
    processed field is printed as progress output.

    Args:
        multcolumns: Iterable of column names to encode. An empty iterable
            returns the frame unchanged (it used to come back with every
            column duplicated).
        df: Dataframe to encode. Defaults to the module-level ``final_df``,
            which is the frame the function has always operated on.
        prefix: When True, dummy columns are named ``<column>_<category>``.
            The default (False) keeps the historical naming by bare category
            value, under which different columns that share a category label
            produce identically named dummy columns.

    Returns:
        A new dataframe. The input dataframe is left unmodified, so the
        function can be called again on the same frame.
    """
    frame = final_df if df is None else df
    fields_to_encode = list(multcolumns)

    encoded_parts = []
    for fields in fields_to_encode:
        print(fields)
        encoded_parts.append(
            pd.get_dummies(
                frame[fields],
                drop_first=True,
                prefix=fields if prefix else None,
            )
        )

    # Drop on a copy instead of in place so the caller's frame stays intact.
    untouched = frame.drop(columns=fields_to_encode)

    return pd.concat([untouched, *encoded_parts], axis=1)

In [None]:
# Keep an independent copy of the imputed training table before encoding.
Main_Train = X_Train.copy()

# Row/column counts of both tables going into the concatenation below.
print(f"Train Data Shape : {X_Train.shape}", f"\nTest Data Shape  : {X_Test.shape}")

#### Concat Train and Test data frames

In [None]:
# Stack the training rows on top of the test rows (row-wise concat).
# NOTE(review): presumably X_Test has no SalePrice column, so its rows get NaN
# there; the original row labels are kept, so index values repeat — confirm.
final_df = pd.concat([X_Train, X_Test], axis=0)

In [None]:
# Preview the combined train + test table.
final_df.head()

In [None]:
# (rows, columns) of the combined table before encoding.
final_df.shape

In [None]:
# Replace every categorical column listed in cat_lst by its dummy columns.
final_df = onehot_encoding_categorical_concat(cat_lst)

In [None]:
# Shape and preview of the combined table after one-hot encoding.
print(final_df.shape)
print("\nNow we one hot encoded the entire dataframe 'train + test' 😉")
final_df.head()

Finally, remove all the duplicate columns!

In [None]:
# Keep only the first column for each column name; later same-named columns are dropped.
# NOTE(review): the dummy columns come from pd.get_dummies called without a
# prefix, so they are named after bare category values. Dummies from different
# features that share a category label therefore look like "duplicates" here
# and all but the first are discarded — confirm this loss is intended.
final_df =final_df.loc[:,~final_df.columns.duplicated()]
print(final_df.shape)
final_df.head(10)

#### Now splitting final_df back into train and test

In [None]:
# final_df was built by stacking X_Train on top of X_Test, so the first
# len(X_Train) rows are the training rows and the remaining rows are the test rows.
# Deriving the boundary from X_Train avoids a hard-coded row count, and .copy()
# makes both parts independent frames rather than slices of final_df.
n_train_rows = len(X_Train)
X_Train_df = final_df.iloc[:n_train_rows, :].copy()
X_Test_df = final_df.iloc[n_train_rows:, :].copy()

In [None]:
# After the concatenation the test rows carry a SalePrice column too; it must
# be removed before predicting. Rebind the name instead of dropping in place:
# X_Test_df was taken as a slice of final_df, and an in-place drop on such a
# slice is what triggers pandas' SettingWithCopy warning.
print("Before Drop: ")
print(X_Test_df.shape)
X_Test_df = X_Test_df.drop(columns=['SalePrice'])
print("After Drop: ")
X_Test_df.shape

#### Split Train Df into training features and labels

In [None]:
# Target vector first, then the feature matrix (all columns except the target).
y = X_Train_df['SalePrice']
x = X_Train_df.drop(columns=['SalePrice'])

In [None]:
# Preview the first ten rows of the training feature matrix.
x.head(10)

## Evaluation Models

#### XGBRegressor

In [None]:
import xgboost
# Base estimator with default settings; it is handed to the hyper-parameter search below.
regressor=xgboost.XGBRegressor()


In [None]:
from sklearn.model_selection import RandomizedSearchCV

Note that there are 2 learning rates × 5 max depths = 10 different models.

In [None]:
## Candidate values for each hyper-parameter explored by the search
max_depth = [2, 3, 5, 10, 15]
learning_rate = [0.1, 0.2]

# Search space keyed by estimator parameter name.
hyperparameters = dict(max_depth=max_depth, learning_rate=learning_rate)

In [None]:
# 5-fold search over the hyper-parameter lists, scored by negated mean
# absolute error (scikit-learn maximises scores, hence the "neg_" prefix).
# n_iter is set to the number of distinct combinations: the search space only
# holds len(max_depth) * len(learning_rate) candidates, so the previous
# n_iter=100 could not run more than that and merely produced a warning.
random_search_cv = RandomizedSearchCV(
                        estimator=regressor,
                        param_distributions=hyperparameters,
                        cv=5,
                        n_iter=len(max_depth) * len(learning_rate),
                        scoring = 'neg_mean_absolute_error',
                        n_jobs = 4,
                        verbose = 5, 
                        return_train_score = True,
                        random_state=42)

In [None]:
# Run the cross-validated search on the training features and target.
random_search_cv.fit(x, y)

In [None]:
# Show the estimator (with its parameters) that the search selected as best.
random_search_cv.best_estimator_

In [None]:
# Final model built from a hard-coded parameter list.
# NOTE(review): this looks like a pasted repr of random_search_cv.best_estimator_
# (learning_rate=0.1, max_depth=5) — confirm. It will not follow the search if
# the best candidate changes, and several keyword arguments (e.g. gpu_id,
# predictor) may not be accepted by every xgboost release — verify against the
# installed version.
regressor=xgboost.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=np.nan,
             monotone_constraints='()', n_estimators=100, n_jobs=2,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [None]:
# Train the final model on the complete training set.
regressor.fit(x,y)

In [None]:
# Full cross-validation record of the search (scores, timings and parameters per candidate).
random_search_cv.cv_results_

In [None]:
# Score on the same data the model was fitted on, i.e. an in-sample figure
# (presumably R^2, the scikit-learn regressor default — TODO confirm).
regressor.score(x, y)

In [None]:
# Mean validation score per candidate; with the 'neg_mean_absolute_error'
# scoring chosen for the search these values are negated errors.
model_scores = random_search_cv.cv_results_['mean_test_score']

In [None]:
# Display the per-candidate mean validation scores.
model_scores

In [None]:
# Scores of the first five candidates.
# NOTE(review): the plots below treat these as the learning_rate=0.1 runs, but
# a randomized search does not necessarily list candidates grouped by learning
# rate — verify against cv_results_['params'].
model_scores_5 = model_scores[:5]
model_scores_5

In [None]:
# Scores of every candidate after the first five.
# NOTE(review): the plots below treat these as the learning_rate=0.2 runs, but
# a randomized search does not necessarily list candidates grouped by learning
# rate — verify against cv_results_['params'].
model_scores_10 = model_scores[5:]
model_scores_10

In [None]:
# Parameter dict of each candidate, in the same order as the scores in cv_results_.
model_params = random_search_cv.cv_results_['params']

In [None]:
# Display the parameter dict of every candidate.
model_params

In [None]:
# max_depth of each of the first five candidates, in candidate order.
model_params_lr = [model_params[position]['max_depth'] for position in range(5)]

In [None]:
# Display the max_depth values collected above.
model_params_lr

In [None]:
# Build the curve for learning_rate = 0.1 straight from cv_results_.
# The randomized search lists candidates in sampling order, so a positional
# slice of the results cannot be relied on to contain exactly one learning rate.
# Filtering by the candidate's own parameters and sorting by depth yields the
# curve the title promises. Scores are negated because the search used
# 'neg_mean_absolute_error', while the y axis is labelled as an error.
lr_01_runs = sorted(
    (candidate['max_depth'], -score)
    for candidate, score in zip(random_search_cv.cv_results_['params'],
                                random_search_cv.cv_results_['mean_test_score'])
    if candidate['learning_rate'] == 0.1
)
plt.plot([depth for depth, _ in lr_01_runs], [mae for _, mae in lr_01_runs])
plt.title("Learning Rate = 0.1")
plt.xlabel("Max Depth")
plt.ylabel("Mean absolute error")

In [None]:
# Build the curve for learning_rate = 0.2 straight from cv_results_.
# The randomized search lists candidates in sampling order, so a positional
# slice of the results cannot be relied on to contain exactly one learning rate.
# Filtering by the candidate's own parameters and sorting by depth yields the
# curve the title promises. Scores are negated because the search used
# 'neg_mean_absolute_error', while the y axis is labelled as an error.
lr_02_runs = sorted(
    (candidate['max_depth'], -score)
    for candidate, score in zip(random_search_cv.cv_results_['params'],
                                random_search_cv.cv_results_['mean_test_score'])
    if candidate['learning_rate'] == 0.2
)
plt.plot([depth for depth, _ in lr_02_runs], [mae for _, mae in lr_02_runs])
plt.title("Learning Rate = 0.2")
plt.xlabel("Max Depth")
plt.ylabel("Mean absolute error")

In [None]:
# Predict the target for the encoded test rows with the final model.
y_pred = regressor.predict(X_Test_df)

In [None]:
# In-sample score again (same training data the model was fitted on).
regressor.score(x, y)

In [None]:
# Ids come from the competition's sample submission file.
sub_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv')
prediction = pd.DataFrame(y_pred)

# Put Id and prediction side by side and give the two columns their submission names.
datasets = pd.concat([sub_df['Id'], prediction], axis=1)
datasets = datasets.set_axis(['Id', 'SalePrice'], axis=1)

# Write the submission without the dataframe index.
datasets.to_csv('sample_submission.csv', index=False)

![Cat](./screen.png)