In [None]:
pip install pytest-warnings

In [None]:
#!pip freeze
#!pip install matplotlib
#!pip install seaborn
#pip install numpy 
#!pip install pandas
#!pip install scikit-learn

In [None]:
import warnings 
# Suppress every warning for the rest of the session. This also hides
# deprecation / future-behaviour notices raised by the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
import numpy as np
import pandas as pd

# Lift pandas' display truncation so wide and long frames are shown in full.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)

In [None]:
#step 1: Import and inspect the dataset

In [None]:
# Load the training data; the path is relative to the notebook's working directory.
housing=pd.read_csv("train.csv")
housing.head()

In [None]:
# (rows, columns) of the raw frame
housing.shape

In [None]:
# Summary statistics (pandas' default describe() output)
housing.describe()

In [None]:
# Column dtypes and non-null counts
housing.info()

In [None]:
# Percentage of missing values per column
housing.isnull().sum()/housing.shape[0]*100

In [None]:
#step 2: data cleaning

In [None]:
# Categorical columns whose missing values are replaced by the literal category "None".
# NOTE(review): presumably NaN here means "feature not present" (no alley, no
# basement, no pool, ...) - confirm against the dataset's data description.
cols=['Alley','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','FireplaceQu',
     'GarageType','GarageFinish','GarageQual','GarageCond','PoolQC','Fence','MiscFeature']
# Assign the filled columns back to the frame. Calling fillna(inplace=True) on
# housing[col] acts on an intermediate selection and is not guaranteed to
# update `housing` (it does not under pandas copy-on-write).
housing[cols]=housing[cols].fillna("None")

In [None]:
# Re-check dtypes and non-null counts after the "None" fill above
housing.info()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
#checking if the target var SalePrice is normally distributed
plt.figure(figsize=[6,6])
# histplot(kde=True, stat="density") is the supported replacement for the
# deprecated sns.distplot: a density histogram with a KDE curve on top.
sns.histplot(housing['SalePrice'],kde=True,stat="density")
plt.show()

In [None]:
#we can see that the target variable SalePrice is right Skewed


In [None]:
# Skewness and kurtosis of the target column, printed one per line
sale_price=housing['SalePrice']
for label,value in (("skewness: ",sale_price.skew()),("Kurtosis: ",sale_price.kurt())):
    print(label,value)

In [None]:
# we can observe that the target variable has skewness greater than 1 and a high density around a SalePrice of 160000, hence we can apply
# a data transformation to this variable

In [None]:
# Replace SalePrice with its natural logarithm to reduce the right skew.
# NOTE: the column is overwritten, so re-running this cell applies the log again.
housing['SalePrice']=np.log(housing['SalePrice'])

In [None]:
# Distribution of the target after the log transform
plt.figure(figsize=[6,6])
# histplot(kde=True, stat="density") is the supported replacement for the
# deprecated sns.distplot: a density histogram with a KDE curve on top.
sns.histplot(housing['SalePrice'],kde=True,stat="density")
plt.show()

In [None]:
# Skewness and kurtosis of SalePrice as it currently stands in the frame
# (log-transformed, when the cells above have been run in order)
print("skewness: ",housing['SalePrice'].skew())
print("Kurtosis: ",housing['SalePrice'].kurt())

In [None]:
# Remove the Id column from the frame in place
housing.drop(columns="Id",inplace=True)

In [None]:
# Treat these coded columns as categorical by casting them to object dtype
coded_cols=['MSSubClass','OverallQual','OverallCond']
housing[coded_cols]=housing[coded_cols].astype('object')

In [None]:
# Coerce both columns to numeric; entries that cannot be parsed become NaN.
# The result must be assigned back with `=`: a bare `column - to_numeric(...)`
# expression only computes a difference and leaves the frame untouched.
housing['LotFrontage']=pd.to_numeric(housing['LotFrontage'],errors='coerce')
housing['MasVnrArea']=pd.to_numeric(housing['MasVnrArea'],errors='coerce')


In [None]:
# Dtypes and non-null counts after the casts above
housing.info()

In [None]:
# Labels of the columns that still contain at least one missing value
null_cols=housing.columns[housing.isnull().any()]
null_cols


In [None]:
# Impute the remaining gaps: column mean for float64/int64 columns, most
# frequent value (first mode) for everything else.
for i in null_cols:
    if housing[i].dtype==np.float64 or housing[i].dtype==np.int64:
        fill_value=housing[i].mean()
    else:
        fill_value=housing[i].mode()[0]
    # Assign back rather than fillna(inplace=True) on housing[i]: the in-place
    # call acts on an intermediate selection and is not guaranteed to update
    # `housing` (it does not under pandas copy-on-write).
    housing[i]=housing[i].fillna(fill_value)

In [None]:
# Missing-value count per column (expected to be all zeros after imputation)
housing.isna().sum()

In [None]:
#step 3: exploratory data analysis


In [None]:
cat_cols=housing.select_dtypes(include='object').columns
cat_cols

In [None]:
num_cols=housing.select_dtypes(include=['int64','float64']).columns
num_cols

In [None]:
# num cols
#plotting box plot to viz the distribution and check for any outliers
for i in num_cols:
    plt.figure(figsize=[8,5])
    print(i)
    # Pass the series explicitly as `x=`: a bare positional argument is read
    # as `x` by older seaborn releases and as `data` by newer ones, which
    # changes the plot's orientation.
    sns.boxplot(x=housing[i])
    plt.show()

In [None]:
# For every categorical column: print the level shares, then draw them as a pie chart
for col in cat_cols:
    level_shares=housing[col].value_counts(normalize=True)
    print(level_shares)
    plt.figure(figsize=[5,5])
    level_shares.plot.pie(labeldistance=None,autopct="%1.2f%%")
    plt.legend()
    plt.show()
    print("----------------------------------------------------------------------")

In [None]:
sns.barplot(x='MSZoning',y='LotFrontage',data=housing)
plt.show()

In [None]:
sns.barplot(x='MSSubClass',y='LotFrontage',data=housing)
plt.show()

In [None]:
sns.barplot(x='HouseStyle',y='SalePrice',hue='Street',data=housing)

In [None]:
sns.barplot(x='BldgType',y='SalePrice',data =housing)
plt.show()

In [None]:
sns.barplot(x='BsmtQual',y='SalePrice',data =housing)
plt.show()

In [None]:
#calculating age of the property
housing['Age']=housing['YrSold']-housing['YearBuilt']
housing['Age'].head()

In [None]:
#drop YrSold , YearBuilt
housing.drop(columns=["YrSold" , "YearBuilt"],axis=1,inplace=True)

In [None]:
housing.head()

In [None]:
# Pairwise correlation between the numeric columns, drawn as an annotated heatmap
corr_matrix=housing.corr(numeric_only=True)
plt.figure(figsize=[25,25])
sns.heatmap(corr_matrix,annot=True,cmap='BuPu')
plt.title("Correlation of num values")
plt.show()

In [None]:
# Heatmap restricted to the k columns with the largest correlation to SalePrice
k=10
plt.figure(figsize=[15,15])
numeric_corr=housing.corr(numeric_only=True)
cols=numeric_corr.nlargest(k,"SalePrice").index
# Correlation matrix recomputed on just those k columns (rows = variables after .T)
cm=np.corrcoef(housing[cols].values.T)
sns.heatmap(
    cm,
    annot=True,
    square=True,
    fmt='.2f',
    cbar=True,
    annot_kws={'size':10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)

In [None]:
# Pairwise scatter plots for the target and a hand-picked set of predictors
cols=['SalePrice',"OverallQual","GrLivArea","GarageArea","TotalBsmtSF","1stFlrSF","FullBath","Age"]
# pairplot is a figure-level function that creates its own figure, so no
# plt.figure(...) call precedes it (that would only leave an empty figure behind).
sns.pairplot(housing[cols])
plt.show()

In [None]:
#step 4: Data preparation

In [None]:
#dummy encoding

In [None]:
housing_num=housing.select_dtypes(include=['int64','float64'])
housing_cat=housing.select_dtypes(include='object')

In [None]:
housing_cat

In [None]:
housing_cat_dm=pd.get_dummies(housing_cat,drop_first=True,dtype=int)

In [None]:
housing_cat_dm

In [None]:
house=pd.concat([housing_num,housing_cat_dm],axis=1)

In [None]:
house.shape

In [None]:
house.head()

In [None]:
x=house.drop(["SalePrice"],axis=1).copy()
y=house["SalePrice"].copy()

In [None]:
x.head()

In [None]:
y.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the split is reproducible.
x_train, x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)


In [None]:
# (rows, columns) of the training features
x_train.shape

In [None]:
# Number of training targets
y_train.shape

In [None]:
# Columns to standardise: every int64/float64 column of the training features.
# This rebinds the earlier `num_cols` (which was computed on `housing`).
# NOTE(review): the dummy columns were created with dtype=int, so they are
# presumably selected here as well - confirm that scaling them is intended.
num_cols=list(x_train.select_dtypes(include=['int64','float64']).columns)

In [None]:
scaler=StandardScaler()
# Learn the mean / standard deviation from the training split only ...
x_train[num_cols]=scaler.fit_transform(x_train[num_cols])
# ... and reuse those same parameters for the test split. Re-fitting on the
# test data would scale it differently from what the model was trained on
# and leak test-set statistics into preprocessing.
x_test[num_cols]=scaler.transform(x_test[num_cols])


In [None]:
def eval_metrics(y_train,y_train_pred,y_test,y_test_pred):
    """Print R2 and RMSE for a model's train and test predictions.

    Parameters
    ----------
    y_train, y_train_pred : array-like
        True and predicted targets for the training split.
    y_test, y_test_pred : array-like
        True and predicted targets for the test split.

    Returns
    -------
    None
        The four metrics are printed, each formatted to two decimals.

    Notes
    -----
    ``r2_score`` and ``mean_squared_error`` must be available in the notebook
    namespace by the time this function is called.
    """
    print("r2 score (train) = ",'%.2f' %r2_score(y_train,y_train_pred))
    # Score the test split with the y_test_pred argument, not a global y_pred,
    # so the function reports on whichever model's predictions were passed in.
    print("r2 score (test) = ", "%.2f"  %r2_score(y_test,y_test_pred))
    mse_train=mean_squared_error(y_train,y_train_pred)
    mse_test=mean_squared_error(y_test,y_test_pred)
    # RMSE is the square root of the mean squared error
    rmse_train=mse_train**0.5
    rmse_test=mse_test**0.5

    print("RMSE(Train)=","%.2f" %rmse_train)
    print("RMSE(test)=","%.2f" %rmse_test)
    

In [None]:
#step 5: Build ML model

In [None]:
import sklearn
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.linear_model import Ridge,Lasso
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate regularisation strengths, from 1e-4 up to 1e3
alphas=[0.0001,0.001,0.01,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,2.0,3.0,4.0,5.0,
        6.0,7.0,8.0,9.0,10,20,50,100,500,1000]
params={'alpha':alphas}
ridge=Ridge()
# 5-fold grid search over alpha, scored by negative mean absolute error;
# train scores are kept so train and validation curves can be compared later.
ridgeCV=GridSearchCV(
    estimator=ridge,
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=5,
    return_train_score=True,
    verbose=1,
    n_jobs=-1,
)
ridgeCV.fit(x_train,y_train)

In [None]:
# Alpha with the best mean cross-validated score
ridgeCV.best_params_

In [None]:
# Raw per-candidate results of the grid search (a dict of arrays)
ridgeCV.cv_results_

In [None]:
ridge=Ridge(alpha=9)

In [None]:
ridge.fit(x_train,y_train)

In [None]:
ridge.coef_

In [None]:
# Ridge predictions on the training and test features
y_train_pred=ridge.predict(x_train)
y_pred=ridge.predict(x_test)

In [None]:
# Print R2 / RMSE for the Ridge model
eval_metrics(y_train,y_train_pred,y_test,y_pred)

In [None]:
# Grid-search results as a frame: one row per alpha candidate
ridgeCV_res=pd.DataFrame(ridgeCV.cv_results_)
ridgeCV_res.head()

In [None]:
# Mean train vs. validation score for every alpha candidate (log-scaled x axis)
plt.plot(ridgeCV_res['param_alpha'],ridgeCV_res['mean_train_score'],label='train')
plt.plot(ridgeCV_res['param_alpha'],ridgeCV_res['mean_test_score'],label='test')
plt.xlabel('alpha')
# The search was scored with neg_mean_absolute_error, so the curves show negative MAE, not R2
plt.ylabel('Negative MAE')
plt.xscale('log')
plt.legend()
plt.show()

In [None]:
# Same alpha grid (`params`) and search settings as the Ridge search, applied to Lasso
lasso=Lasso()
search_options=dict(
    scoring='neg_mean_absolute_error',
    cv=5,
    return_train_score=True,
    verbose=1,
    n_jobs=-1,
)
lassoCV=GridSearchCV(estimator=lasso,param_grid=params,**search_options)
lassoCV.fit(x_train,y_train)

In [None]:
# Alpha with the best mean cross-validated score
lassoCV.best_params_

In [None]:
lasso=Lasso(alpha=0.0001)

In [None]:
lasso.fit(x_train,y_train)

In [None]:
lasso.coef_

In [None]:
y_train_pred1=lasso.predict(x_train)
y_pred1=lasso.predict(x_test)

In [None]:
eval_metrics(y_train,y_train_pred,y_test,y_pred)

In [None]:
# Grid-search results as a frame: one row per alpha candidate
lassoCV_res=pd.DataFrame(lassoCV.cv_results_)
lassoCV_res.head()

In [None]:
# Mean train vs. validation score for every alpha candidate (log-scaled x axis)
plt.plot(lassoCV_res['param_alpha'],lassoCV_res['mean_train_score'],label='train')
plt.plot(lassoCV_res['param_alpha'],lassoCV_res['mean_test_score'],label='test')
plt.xlabel('alpha')
# The search was scored with neg_mean_absolute_error, so the curves show negative MAE, not R2
plt.ylabel('Negative MAE')
plt.xscale('log')
plt.legend()
plt.show()

In [None]:
#feature extraction / elimination

In [None]:
# Coefficient table: one row per feature (labelled by the feature-matrix
# columns), one column per model. The row labels come from `index=`; a
# DataFrame has no `rows` attribute to assign them through.
betas=pd.DataFrame({'Ridge':ridge.coef_,'Lasso':lasso.coef_},index=x.columns)
betas

In [None]:
lasso_cols_removed=list(betas[betas['Lasso']==0].index)
print(lasso_cols_removed)

In [None]:
lasso_cols_selected=list(betas[betas['Lasso']!=0].index)
print(lasso_cols_selected)

In [None]:
print(len(lasso_cols_removed))
print(len(lasso_cols_selected))

In [None]:
#top 10 features significant in predicting the value of a house according to both the ridge and lasso models

In [None]:
betas['Ridge'].sort_values(ascending=False)[:10]

In [None]:
ridge_coeffs=np.exp(betas['Ridge'])
ridge_coeffs.sort_values(ascending=False)[:10]

In [None]:
betas['Lasso'].sort_values(ascending=False)[:10]

In [None]:
lasso_coeffs=np.exp(betas['Lasso'])
lasso_coeffs.sort_values(ascending=False)[:10]

### conclusion

### Optimal value of lambda for Ridge Regression = 9
### Optimal value of lambda for Lasso Regression = 0.001
