In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
%matplotlib inline

In [None]:
# Read the train and test CSV files from the competition input folder.
Hpriceframe_Train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
Hpriceframe_Test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

### Get information about this data set

In [None]:
# Print the dataset's description file so the column codes can be looked up.
with open('../input/house-prices-advanced-regression-techniques/data_description.txt') as description_file:
    print(description_file.read())

### This list of correlations with `SalePrice` helps us

In [None]:
# Correlation of every numeric column with SalePrice, sorted ascending.
# Numeric columns are selected explicitly: pandas >= 2.0 no longer drops
# non-numeric columns inside DataFrame.corr() and raises on object columns.
(
    Hpriceframe_Train
    .select_dtypes(include='number')
    .corr()['SalePrice']
    .sort_values()
)

# Outliers
### We have to check for outliers and get rid of them

In [None]:
# OverallQual against SalePrice, with two red reference lines at the price
# levels used below (520000 and 200000) to pick out outliers.
quality_ax = sns.scatterplot(x='OverallQual', y='SalePrice', data=Hpriceframe_Train)
quality_ax.axhline(y=520000, color='r')
quality_ax.axhline(y=200000, color='r')

### now we can have a scatter plot of the correlation of our new data

In [None]:
# Rows priced above 520000 whose OverallQual is above 7 (only the two columns of interest are kept).
is_high_price = Hpriceframe_Train['SalePrice'] > 520000
is_high_quality = Hpriceframe_Train['OverallQual'] > 7
HighOutSalePrice = Hpriceframe_Train.loc[is_high_price & is_high_quality, ['SalePrice', 'OverallQual']]

In [None]:
# Rows priced below 200000 although their OverallQual is above 8.
is_low_price = Hpriceframe_Train['SalePrice'] < 200000
is_top_quality = Hpriceframe_Train['OverallQual'] > 8
LowOutSalePrice = Hpriceframe_Train.loc[is_low_price & is_top_quality, ['SalePrice', 'OverallQual']]

#### So we have identified the outliers with respect to `OverallQual`
#### Now we drop them

In [None]:
# Remove the high-price outlier rows from the training frame by index label.
index_drop = HighOutSalePrice.index
Hpriceframe_Train = Hpriceframe_Train.drop(index=index_drop)

In [None]:
# Remove the low-price outlier rows from the training frame by index label.
index_drop = LowOutSalePrice.index
Hpriceframe_Train = Hpriceframe_Train.drop(labels=index_drop, axis='index')

In [None]:
# Same scatter plot as before, redrawn on the frame with the outlier rows removed.
cleaned_ax = sns.scatterplot(x='OverallQual', y='SalePrice', data=Hpriceframe_Train)
cleaned_ax.axhline(y=520000, color='r')
cleaned_ax.axhline(y=200000, color='r')

### We can see that the `OverallQual` outliers were dropped

# Missing Data

#### First of all, we need to count the missing values

In [None]:
# Number of missing values per column; display only the columns that have any, in ascending order.
MissedDataSum = Hpriceframe_Train.isna().sum()
MissedDataSum.sort_values().loc[lambda counts: counts > 0]

#### It is better to show missing data as a percentage

In [None]:
def missing_percent(df):
    """Return the percentage of missing values per column.

    Only columns with at least one missing value are included, and the
    result is sorted in ascending order of percentage.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Indexed by column name; values are percentages of rows that are null.
    """
    missing_share = df.isnull().sum() / len(df)
    as_percent = 100 * missing_share
    has_missing = as_percent > 0
    return as_percent[has_missing].sort_values()

In [None]:
# Percentage of missing values for each training-frame column that has gaps.
nan_percent = missing_percent(Hpriceframe_Train)

#### This plot helps us see the missing data

In [None]:
# Bar chart of the missing-value percentage for every column that has gaps.
plt.figure(figsize=(12,6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation= 90)  # turn the x tick labels vertical

#### Now we zoom in on the columns with little missing data (below 1%)

In [None]:
# Same bar chart, with the y-axis limited to 0-1 so that bars of columns
# with very few missing values become readable.
plt.figure(figsize=(12,6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation= 90)
plt.ylim(0,1)  # show only the 0-1 percent range

In [None]:
# Columns in which less than 1% of the rows are missing.
nan_percent[nan_percent<1]

#### It is better to drop these few rows,
#### because they might hurt our predictions

In [None]:
# Drop every row that has a missing value in any of these three columns
# (presumably the columns listed by the previous cell — confirm).
# NOTE(review): pandas >= 2.0 parses the literal string "None" as NaN in
# read_csv by default; if MasVnrType uses "None" as a category, this would
# drop far more rows than intended — check the pandas version and the data.
Hpriceframe_Train = Hpriceframe_Train.dropna(axis=0,subset=['Electrical','MasVnrType','MasVnrArea'])

#### This plot shows our progress

In [None]:
# Recompute the missing-value percentages after dropping rows and plot them again.
nan_percent = missing_percent(Hpriceframe_Train)
plt.figure(figsize=(12,6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation= 90)

# Filling missing data
### We found that these features have some missing data
### In this cell we fill that missing data with `NA`

In [None]:
# Replace missing values in the basement string columns with the literal 'NA',
# which then acts as its own category (presumably "no basement" — confirm
# against the data description).
bsmt_str_cols = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
bsmt_filled = Hpriceframe_Train[bsmt_str_cols].fillna('NA')
Hpriceframe_Train[bsmt_str_cols] = bsmt_filled

### Now we can see that these features no longer have any missing data

In [None]:
# Refresh the missing-value percentages after filling the basement columns and plot them.
nan_percent = missing_percent(Hpriceframe_Train)
plt.figure(figsize=(12,6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation= 90)

In [None]:
# Fill the garage columns.
# NOTE(review): the two list names look swapped — Gra_num_cols is filled with
# the string 'NA', while Gra_str_cols is filled with the number 0. The names
# are left as they are because other cells may reference them.
Gra_num_cols=['GarageType','GarageFinish','GarageQual','GarageCond']
Hpriceframe_Train[Gra_num_cols] = Hpriceframe_Train[Gra_num_cols].fillna('NA')
Gra_str_cols=['GarageYrBlt']
# Missing GarageYrBlt becomes 0, i.e. a sentinel value rather than a real year.
Hpriceframe_Train[Gra_str_cols] = Hpriceframe_Train[Gra_str_cols].fillna(0)

In [None]:
# Refresh the missing-value percentages after filling the garage columns and plot them.
nan_percent = missing_percent(Hpriceframe_Train)
plt.figure(figsize=(12,6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation= 90)

In [None]:
# Remove these four columns entirely instead of imputing them.
sparse_columns = ['Fence', 'PoolQC', 'Alley', 'MiscFeature']
Hpriceframe_Train = Hpriceframe_Train.drop(columns=sparse_columns)

In [None]:
# Refresh the missing-value percentages after dropping the four columns and plot them.
nan_percent = missing_percent(Hpriceframe_Train)
plt.figure(figsize=(12,6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation= 90)

In [None]:
# Missing FireplaceQu becomes the literal category 'NA'
# (presumably "no fireplace" — confirm against the data description).
Hpriceframe_Train['FireplaceQu'] = Hpriceframe_Train['FireplaceQu'].fillna('NA')

In [None]:
# Refresh the missing-value percentages after filling FireplaceQu and plot them.
nan_percent = missing_percent(Hpriceframe_Train)
plt.figure(figsize=(12,6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation= 90)

In [None]:
# Distribution of LotFrontage within each Neighborhood (one box per neighborhood).
plt.figure(figsize=(8,12))
sns.boxplot(data=Hpriceframe_Train, x='LotFrontage',y='Neighborhood')

In [None]:
# Impute missing LotFrontage with the mean of the row's Neighborhood.
# If a neighborhood has no observed LotFrontage at all, its group mean is NaN,
# so fall back to the overall mean; otherwise NaNs would survive and the
# model fit further down would fail on them.
neighborhood_mean = Hpriceframe_Train.groupby('Neighborhood')['LotFrontage'].transform('mean')
overall_mean = Hpriceframe_Train['LotFrontage'].mean()
Hpriceframe_Train['LotFrontage'] = (
    Hpriceframe_Train['LotFrontage']
    .fillna(neighborhood_mean)
    .fillna(overall_mean)
)

# Finish Fixing Missing Data

In [None]:
# Same per-neighborhood box plot of LotFrontage, redrawn after the imputation.
plt.figure(figsize=(8,12))
sns.boxplot(data=Hpriceframe_Train, x='LotFrontage',y='Neighborhood')

### Now our filling is finished

# Categorical Data

In [None]:
# Split the columns by dtype: object columns go to df_obj, everything else to df_num.
df_obj = Hpriceframe_Train.select_dtypes(include=['object'])
df_num = Hpriceframe_Train.select_dtypes(exclude=['object'])

In [None]:
# Summary (dtypes and non-null counts) of the non-object columns.
df_num.info()

In [None]:
# One-hot encode the object columns; drop_first=True removes the first level
# of each column so the remaining dummies are not perfectly collinear.
df_obj = pd.get_dummies(df_obj,drop_first=True)

In [None]:
# (rows, columns) of the encoded frame.
df_obj.shape

In [None]:
# Put the numeric columns and the dummy columns side by side (aligned on the row index).
Final_df = pd.concat((df_num, df_obj), axis='columns')

# Regularization

## **Determine the features & target variable (label)**

In [None]:
# Target: the sale price. Features: every other column except 'Id', which is
# only a row identifier and carries no information about the price.
# errors='ignore' keeps the cell working if 'Id' has already been removed.
X = Final_df.drop(columns=['SalePrice', 'Id'], errors='ignore')
y = Final_df['SalePrice']

## I can't do polynomial regression
## because of my limited computing resources

In [None]:
# Disabled on purpose: degree-2 polynomial feature expansion of X
# (left commented out because of limited resources).
# from sklearn.preprocessing import PolynomialFeatures
# polynomial_converter= PolynomialFeatures(degree = 2,include_bias=False)
# polyFeatures = polynomial_converter.fit_transform(X)
# polyFeatures.shape

In [None]:
# train_test_split and LinearRegression are already imported in the notebook's
# first cell, so they are not imported again here.

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Baseline: ordinary least squares on the unscaled features.
linearModel = LinearRegression()
linearModel.fit(X_train,y_train)

In [None]:
# Baseline predictions on the hold-out set.
# NOTE(review): y_pred is not used by any of the cells visible here, so no
# metric is reported for the unregularized model.
y_pred = linearModel.predict(X_test)

# Scaling the Data

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same
# transformation to both splits so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Regularization

# L1: Lasso Regression

In [None]:
from sklearn.linear_model import LassoCV

In [None]:
# Lasso with the regularization strength chosen by 5-fold cross-validation
# over 50 candidate alphas; the last line displays the selected alpha.
lasso_cv_model = LassoCV(cv=5, n_alphas=50, eps=0.01).fit(X_train, y_train)
lasso_cv_model.alpha_

In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_error

# Hold-out error of the cross-validated Lasso model.
y_pred_lasso = lasso_cv_model.predict(X_test)
MSE = mean_squared_error(y_test, y_pred_lasso)
MAE = mean_absolute_error(y_test, y_pred_lasso)
RMSE = np.sqrt(MSE)

In [None]:
# These values come from the Lasso predictions, so the column is labelled
# accordingly (it was previously mislabelled 'Ridge Metrics').
pd.DataFrame({'Lasso Metrics':[MAE,MSE,RMSE]},index=['MAE','MSE','RMSE'])

# L2: Ridge Regression

In [None]:
from sklearn.linear_model import RidgeCV

# Ridge with alpha picked by cross-validation from three candidates, scored by
# negative mean absolute error; the last line displays the selected alpha.
ridge_cv_model = RidgeCV(scoring='neg_mean_absolute_error', alphas=(0.1, 1.0, 10)).fit(X_train, y_train)
ridge_cv_model.alpha_

In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_error

# Hold-out error of the cross-validated Ridge model.
y_pred_ridge = ridge_cv_model.predict(X_test)
MSE = mean_squared_error(y_test, y_pred_ridge)
MAE = mean_absolute_error(y_test, y_pred_ridge)
RMSE = np.sqrt(MSE)

In [None]:
# One-column table of the Ridge hold-out metrics.
pd.Series([MAE, MSE, RMSE], index=['MAE', 'MSE', 'RMSE'], name='Ridge Metrics').to_frame()

# Combining L1 and L2: Elastic Net

In [None]:
from sklearn.linear_model import ElasticNetCV

In [None]:
# Elastic Net with both alpha and the L1/L2 mix (l1_ratio) chosen by 5-fold
# cross-validation; max_iter is raised to give the solver room to converge.
elastic_model = ElasticNetCV(
    cv=5,
    max_iter=100000,
    l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
)
elastic_model.fit(X_train, y_train)

In [None]:
# Hold-out error of the cross-validated Elastic Net model.
y_pred_elastic = elastic_model.predict(X_test)
MSE = mean_squared_error(y_test, y_pred_elastic)
MAE = mean_absolute_error(y_test, y_pred_elastic)
RMSE = np.sqrt(MSE)

In [None]:
# These values come from the Elastic Net predictions, so the column is labelled
# accordingly (it was previously mislabelled 'Ridge Metrics').
pd.DataFrame({'Elastic Net Metrics':[MAE,MSE,RMSE]},index=['MAE','MSE','RMSE'])