# Analysing House Prices in a Modular Fashion

### Import Necessary Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score


## Data Import

In [2]:
# Relative locations of the training and test CSV files read by import_data.
train_data_path = "../input/house-prices-advanced-regression-techniques/train.csv"
test_data_path = "../input/house-prices-advanced-regression-techniques/test.csv"

In [3]:
def import_data(data_path):
    """Read a comma-separated file into a DataFrame.

    The first row is used as the header and the first column as the index.

    Parameters
    ----------
    data_path
        Path or file-like object handed to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    read_options = {"delimiter": ",", "header": 0, "index_col": 0}
    return pd.read_csv(filepath_or_buffer=data_path, **read_options)

In [4]:
# Load both data sets; the first CSV column becomes the index (see import_data).
train_data = import_data(train_data_path)
test_data = import_data(test_data_path)

# Report the shape of each data set.
print(f"Number of Columns in Training Data:{train_data.shape[1]} ,Number of Columns in Test Data:{test_data.shape[1]} ")
print(f"Number of Rows in Training Data:{train_data.shape[0]} ,Number of Rows in Test Data:{test_data.shape[0]} ")

train_col = train_data.columns.values
test_col = test_data.columns.values

# Columns present in the training data but absent from the test data.
print(f"Test Data does not have the Column: {set(train_col)-set(test_col)}")

## Data Cleaning

### Missing Data Identification

In [5]:
def missing_data(data):
    """List the columns of ``data`` that contain null values.

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    list
        One ``[column, null_count]`` pair per column with at least one null,
        in column order.
    """
    null_counts = ((col, data[col].isnull().sum()) for col in data.columns)
    return [[col, no_of_nulls] for col, no_of_nulls in null_counts if no_of_nulls > 0]
            
def print_missing(gen, string):
    """Print the missing-data report for one data set.

    Parameters
    ----------
    gen
        Iterable of ``(column, null_count)`` pairs, e.g. the result of
        ``missing_data``.
    string : str
        Name of the data set, used in the heading line.
    """
    print(f"Missing Data in {string} Set\n")
    # count stays 0 when gen yields nothing; otherwise it ends at the number of pairs.
    count = 0
    for count, (i, j) in enumerate(gen, start=1):
        print(f"{i}---->{j}")
    print(f"Total number of Cols with missing Data : {count}")


# Collect the [column, null count] pairs of both data sets and print them.
train_nan = missing_data(train_data)
test_nan = missing_data(test_data)

print_missing(train_nan, "train")
print("\n")
print_missing(test_nan, "test")

train_nan_cols = [name for name, _ in train_nan]
test_nan_cols = [name for name, _ in test_nan]

# Columns that have gaps in the test set but none in the training set.
extra_cols = list(set(test_nan_cols).difference(train_nan_cols))
print(f"\nAdditional Missing {len(extra_cols)} Data Cols in Test\n")

for (i, j) in test_nan:
    if i not in extra_cols:
        continue
    print(f"{i}---->{j}")


### Missing Data Treatment

In [6]:
def data_clean(data, data_nan):
    """Fill the simple cases of missing data, modifying ``data`` in place.

    For every ``(column, null_count)`` pair in ``data_nan``:

    * object-dtype columns get the string ``"None"`` in place of nulls;
    * other columns with fewer than 2 % nulls get their most frequent value;
    * all remaining columns are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean; its columns are overwritten.
    data_nan
        Iterable of ``(column, null_count)`` pairs.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    total_rows = data.shape[0]
    for col, no_of_nan in data_nan:
        column = data[col]
        if column.dtype == 'object':
            replacement = "None"
        elif (no_of_nan / total_rows * 100) < 2:
            replacement = column.mode()[0]
        else:
            # Too many gaps for a mode fill; handled elsewhere.
            continue
        data[col] = column.fillna(replacement)

    return data

def data_fill(data, col, missing_col, string):
    """Impute the missing values of one column with a random forest.

    A ``RandomForestRegressor`` is fitted on the rows where ``col`` is
    present, using the remaining non-object columns as predictors, and its
    predictions are written into the rows where ``col`` is null.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``col``. It is not modified; a new frame is returned.
    col : str
        Name of the non-object column whose missing values are filled.
    missing_col : str
        A further column to leave out of the predictors (e.g. one that still
        contains nulls), or the literal ``"none"`` to leave nothing extra out.
    string : str
        ``"train"`` additionally leaves the ``"SalePrice"`` column out of the
        predictors; any other value leaves it alone.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the nulls of ``col`` replaced by predictions.
        Row order and index are those of ``data``.

    Raises
    ------
    KeyError
        If ``col`` is not a column of ``data`` or one of the columns to leave
        out is not among the non-object columns.
    """
    missing_mask = data[col].isnull()

    # Nothing to impute: the regressor cannot predict on zero rows, so hand
    # back an untouched copy instead of failing.
    if not missing_mask.any():
        return data.copy()

    ## Columns that must not be used as predictors
    del_cols = [col]
    if missing_col != "none":
        del_cols.append(missing_col)
    if string == "train":
        del_cols.append("SalePrice")

    ## Non-categorical predictors
    non_string_cols = data.columns[data.dtypes != 'object']
    # Encode once on the full frame so that the rows used for fitting and the
    # rows used for prediction are guaranteed to share the same columns.
    features = pd.get_dummies(data[non_string_cols.drop(del_cols)])

    ## Regressor, trained on the rows where the target is known
    reg = RandomForestRegressor(random_state=0, n_estimators=100)
    reg.fit(features[~missing_mask], data.loc[~missing_mask, col])

    ## Prediction, written straight into the missing positions
    filled = data.copy()
    filled.loc[missing_mask, col] = reg.predict(features[missing_mask])

    return filled

In [7]:
# Training Data clean
# NOTE: data_clean overwrites the columns of train_data itself and returns it.
clean_train_data = data_clean(train_data, train_nan).drop(columns=["MiscFeature"])

clean_train_nan = missing_data(clean_train_data)
print_missing(clean_train_nan, "clean_train")
print("\n")

# Test Data clean (same treatment, test_data is overwritten likewise)
clean_test_data = data_clean(test_data, test_nan).drop(columns=["MiscFeature"])

clean_test_nan = missing_data(clean_test_data)
print_missing(clean_test_nan, "clean_test")
print("\n")

# Model-based fill, in this order: GarageYrBlt is predicted without
# LotFrontage as a predictor, then LotFrontage is predicted from the result.
fill_order = (("GarageYrBlt", "LotFrontage"), ("LotFrontage", "none"))

# Training Data Fill
fill_train_data = clean_train_data
for target, skipped in fill_order:
    fill_train_data = data_fill(fill_train_data, target, skipped, "train")

fill_train_nan = missing_data(fill_train_data)
print_missing(fill_train_nan, "fill_train")

# Test Data Fill
fill_test_data = clean_test_data
for target, skipped in fill_order:
    fill_test_data = data_fill(fill_test_data, target, skipped, "test")

fill_test_nan = missing_data(fill_test_data)
print_missing(fill_test_nan, "fill_test")


## Data Exploration

### Finding Most Significant Features ( Correlation )

In [8]:
# One-hot encode the categorical columns of both filled data sets.
dummy_train_data = pd.get_dummies(fill_train_data, drop_first=True)
dummy_test_data = pd.get_dummies(fill_test_data, drop_first=True)

# Absolute pairwise correlations; a feature counts as important when its
# correlation with SalePrice exceeds 0.4 (SalePrice itself is included).
correlation_matrix = dummy_train_data.corr().abs()
high_correlation = correlation_matrix > 0.4
sale_price_corr = correlation_matrix["SalePrice"]
important_features = sale_price_corr[high_correlation["SalePrice"]].index

selected_corr = sale_price_corr[important_features]
keys = selected_corr.keys()
values = selected_corr.values

# SalePrice is among the keys, hence the -1.
print(f"Total Number of features: {len(keys)-1}")
plt.figure(figsize=(25, 6))
plt.grid()
plt.bar(keys, values)
plt.xticks(rotation=90)

plt.show()

### Finding Most Significant Features ( XGBoost )

In [9]:
from xgboost import XGBRegressor

# Every encoded column except the target is a candidate feature.
feature_name = dummy_train_data.columns.drop("SalePrice")

xgb = XGBRegressor(n_estimators=100)
xgb.fit(dummy_train_data[feature_name], dummy_train_data["SalePrice"])

# Importance scores, in the same order as feature_name.
feature_importance = xgb.feature_importances_

def datatype_func(x):
    """Return the dtype of column ``x`` of the module-level ``dummy_train_data``."""
    column = dummy_train_data[x]
    return column.dtype

# One row per encoded feature: name, XGBoost importance and dtype, ordered
# from most to least important.
feature_df = pd.DataFrame({"Feature_name": feature_name, "Feature_importance": feature_importance})
feature_df["dtype"] = feature_df["Feature_name"].apply(datatype_func)
feature_df = feature_df.sort_values(by=["Feature_importance"], ascending=False)

n_features = 10

# get_dummies emits uint8 columns in older pandas and bool columns from
# pandas 2.0 on; treat both as one-hot (categorical) features.
is_dummy = (feature_df["dtype"] == "uint8") | (feature_df["dtype"] == "bool")

# Work on a copy so the assignment below does not write into a slice of feature_df.
category_df = feature_df[is_dummy].copy()
# Dummy columns are named "<column>_<level>"; keep the part before the first
# separator. (A letters-only regex would cut a name at its first digit and
# yield labels that are not real column names.)
category_df["Feature_name"] = category_df["Feature_name"].str.split("_", n=1).str[0]
# Importance of a categorical column = importance of its strongest level.
category_df = (category_df
               .groupby("Feature_name")
               .agg({"Feature_importance": "max"})
               .sort_values(by=["Feature_importance"], ascending=False)
               .iloc[0:n_features, :])
category_df["Feature_name"] = category_df.index

# Top numerical features: everything that is not a dummy column.
feature_df = feature_df[~is_dummy].iloc[0:n_features, :]

#### Most Significant Numerical Features

In [10]:
# Bar chart of the importance scores held in feature_df, one bar per feature name.
plt.figure(figsize=(25,6))
plt.grid()
plt.bar(feature_df["Feature_name"],feature_df["Feature_importance"])
# Rotate the tick labels so long feature names do not overlap.
plt.xticks(rotation = 90)

plt.show()

#### Most Significant Categorical Features

In [11]:
# Bar chart of the importance scores held in category_df, one bar per feature name.
plt.figure(figsize=(25,6))
plt.grid()
plt.bar(category_df["Feature_name"],category_df["Feature_importance"])
# Rotate the tick labels so long feature names do not overlap.
plt.xticks(rotation = 90)

plt.show()

### Scatter Plot for most important numerical features vs SalePrice

In [12]:
# Which ranking supplies the features to plot: "corr" or "xgboost".
criterion = "xgboost"
if criterion == "corr":
    # Numerical candidates: highly correlated columns that are neither
    # one-hot dummies (uint8 in older pandas, bool in newer) nor the target.
    scatter_rows = important_features.copy()
    for row in important_features:
        if dummy_train_data[row].dtype in ("uint8", "bool"):
            scatter_rows = scatter_rows.drop(row)
    scatter_rows = scatter_rows.drop("SalePrice")

    # Dummy columns are named "<column>_<level>"; strip everything from the
    # first separator on. regex=True is explicit because pandas 2.0 changed
    # the default of str.replace to a literal match.
    dummy_names = important_features[important_features.str.match(r'.+_.+')]
    categorical_features = list(set(dummy_names.str.replace(r"_.+", "", regex=True)))
else:
    scatter_rows = feature_df["Feature_name"].values
    categorical_features = category_df["Feature_name"].values

# Two plots per row, rounding the number of rows up.
no_of_plots = len(scatter_rows)
nrows = (no_of_plots + 1) // 2

# squeeze=False keeps ax two-dimensional even when there is a single row.
fig, ax = plt.subplots(nrows=nrows, ncols=2, figsize=(25, 30), squeeze=False)
for k in range(no_of_plots):
    i, j = divmod(k, 2)
    sns.scatterplot(x=scatter_rows[k], y="SalePrice", data=dummy_train_data, ax=ax[i][j])

### Histogram for most important Numerical features

In [13]:
# Same two-column grid as the scatter plots; nrows, no_of_plots and
# scatter_rows are the values left behind by the scatter-plot cell.
fig, ax = plt.subplots(nrows=int(nrows), ncols=2, figsize=(25, 30))
k = 0
while k < no_of_plots:
    # Row-major position of the k-th plot in the grid.
    i, j = divmod(k, 2)
    sns.histplot(x=scatter_rows[k], kde=True, data=dummy_train_data, ax=ax[i][j])
    k += 1

## Bivariate Analysis

In [32]:
# relplot is a figure-level function: it creates its own figure, so a
# preceding plt.figure(figsize=...) would only leave an extra empty figure.
# The intended 8x6 size is requested through height/aspect instead.
sns.relplot(
    data=dummy_train_data,
    x="OverallQual",
    y="SalePrice",
    hue="GrLivArea",
    palette="hot",
    height=6,
    aspect=8 / 6,
)
plt.show()

In [None]:
# 2-D histograms of the first feature in feature_df against SalePrice,
# coloured in turn by each of the first ten features.
feat_name = feature_df["Feature_name"].values
fig, ax = plt.subplots(nrows=22, ncols=2, figsize=(25, 30))

# ctr counts the panels drawn so far and fixes the next free grid cell
# (row ctr // 2, column ctr % 2).
ctr = 0
for i in range(1):
    for j in range(i, 10):
        sns.histplot(
            x=feat_name[i],
            y="SalePrice",
            hue=feat_name[j],
            data=dummy_train_data,
            ax=ax[ctr // 2][ctr % 2],
        )
        ctr += 1


In [None]:
# Display the counter left by the previous cell (one increment per histplot call).
ctr

### Outlier Detection with Z-Score and Modified Z-Score

In [14]:
# Scores beyond +/- z_uppr_cut mark a row as an outlier for that feature.
z_uppr_cut = 7
outlier_id = []

# squeeze=False keeps ax two-dimensional even when there is a single row.
fig, ax = plt.subplots(nrows=int(nrows), ncols=2, figsize=(25, 30), squeeze=False)
k = 0
while k < no_of_plots:
    i, j = divmod(k, 2)
    row = scatter_rows[k]
    column = dummy_train_data[row]

    # Modified z-score: 0.6745 * (x - median) / MAD. When the MAD is zero the
    # ratio is undefined, so fall back to the ordinary z-score.
    row_median = np.median(column)
    MAD = np.median(np.abs(column - row_median))
    if MAD == 0:
        z_score = (column - np.mean(column)) / np.std(column)
    else:
        z_score = 0.6745 * (column - row_median) / MAD

    is_outlier = (z_score > z_uppr_cut) | (z_score < -z_uppr_cut)

    # All points first, then the outliers again in red on the same axes.
    sns.scatterplot(x=column, y=z_score, ax=ax[i][j])
    sns.scatterplot(x=column[is_outlier], y=z_score[is_outlier], color='red', ax=ax[i][j])

    # Collected values are index LABELS of dummy_train_data, not positions.
    outlier_id.extend(column[is_outlier].index.tolist())
    k += 1

# A row can be an outlier for several features; keep each label once.
outlier_id = list(set(outlier_id))

### Outlier removal

In [15]:
## outlier_id holds index labels (collected with Series.keys()), so the rows
## are dropped by label. Using the labels as positions into
## dummy_train_data.index would remove different rows whenever labels and
## positions differ. errors="ignore" makes re-running this cell a no-op
## instead of dropping further rows.
init_len = dummy_train_data.shape[0]
dummy_train_data = dummy_train_data.drop(index=outlier_id, errors="ignore")
print(f"Number of Deleted Rows:{init_len - dummy_train_data.shape[0]}")

#### Log Normal Distribution

In [16]:
#log_normal = ["GrLivArea","TotalBsmtSF","1stFlrSF","BsmtFinSF1"]
log_normal = ["GrLivArea","1stFlrSF"]

# One panel per transformed column. The columns are overwritten with their
# natural logarithm in both frames, so running this cell twice logs twice.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(25, 10))
for panel, feature in zip(ax, log_normal):
    dummy_train_data[feature] = np.log(dummy_train_data[feature])
    dummy_test_data[feature] = np.log(dummy_test_data[feature])
    # Training distribution in the default colour, test distribution in red.
    sns.histplot(x=feature, kde=True, data=dummy_train_data, ax=panel)
    sns.histplot(x=feature, kde=True, data=dummy_test_data, ax=panel, color="r")

### Countplot for important Categorical Features

In [17]:
# Grid size for the categorical plots: two per row, rows rounded up.
# nrows stays a float, as the following cell converts it with int().
no_of_plots = len(categorical_features)
nrows = (no_of_plots + no_of_plots % 2) / 2

fig, ax = plt.subplots(nrows=int(nrows), ncols=2, figsize=(25, 30))
k = 0
while k < no_of_plots:
    # Row-major position of the k-th plot in the grid.
    i, j = divmod(k, 2)
    sns.countplot(y=categorical_features[k], data=train_data, ax=ax[i][j])
    k += 1

### Boxplot for important categorical features

In [18]:
# Same grid as the count plots; nrows and no_of_plots come from that cell.
fig, ax = plt.subplots(nrows=int(nrows), ncols=2, figsize=(25, 30))
k = 0
while k < no_of_plots:
    # Row-major position of the k-th plot in the grid.
    i, j = divmod(k, 2)
    sns.boxplot(x=categorical_features[k], y="SalePrice", data=train_data, ax=ax[i][j])
    k += 1

## Training Model

In [19]:
from sklearn.metrics import r2_score , mean_squared_error
from sklearn.model_selection import GridSearchCV
#from sklearn.svm import SVR
#from sklearn.preprocessing import StandardScaler

# Features present in both encoded frames, kept in training-column order.
# (Iterating a set of strings has no stable order across interpreter runs,
# which would make the feature order fed to the model irreproducible.)
test_columns = set(dummy_test_data.columns)
common_cols = [column for column in dummy_train_data.columns if column in test_columns]

X_data = dummy_train_data[common_cols]
y_data = dummy_train_data["SalePrice"]

#scaler_x = StandardScaler()
#X_data = scaler_x.fit_transform(X_data)

#scaler_y = StandardScaler()
#y_data = scaler_y.fit_transform(np.array(y_data).reshape(-1,1))
# max_depth=6, n_estimators=1000, random_state=0, learning_rate=0.1,min_child_weight=4, subsample=0.7

xgb = XGBRegressor()
# Single-point grid: the search only cross-validates this one configuration.
parameters = {'eta':[0.2],'gamma':[0],'max_depth':[3],'n_estimators':[200]}
clf = GridSearchCV(xgb, parameters)
clf.fit(X_data,y_data)
# R^2 on the data the model was fitted on, i.e. a training score.
print(f"XGB score {r2_score(y_data,clf.best_estimator_.predict(X_data))}")

In [20]:
# Keep the best model found by the grid search for the prediction step.
xgb = clf.best_estimator_
# Last expression of the cell: shows the selected hyper-parameters.
clf.best_params_

#### Prediction

In [21]:
# Predict on the test frame restricted to the columns used for training (common_cols).
pred_xgb = xgb.predict(dummy_test_data[common_cols])

#### Histogram Plot

In [22]:
plt.figure(figsize=(25,10))

# Log-scale distributions of predicted and training prices on shared axes.
# Each histogram carries its own label so the legend entries are bound to the
# right artists rather than relying on the order in which they were drawn.
sns.histplot(np.log(pred_xgb), kde=True, color="b", label="Prediction")
sns.histplot(np.log(y_data), kde=True, color="r", label="Training")

plt.legend()
plt.show()


## Submission

In [23]:
# Submission table: the index of test_data as Id next to the predicted SalePrice.
output = pd.DataFrame({'Id': test_data.index, 'SalePrice': pred_xgb})

In [24]:
# Write the table to submission.csv without the DataFrame's own row index.
output.to_csv('submission.csv', index=False)
