In [None]:
# Environment setup, kept commented out; uncomment to install the pinned versions.
# `%pip` is used instead of `!pip` so the packages install into the running kernel's environment.
# %pip freeze
# %pip install pytest-warnings==0.4.6
# %pip install numpy==2.0.1
# %pip install pandas==2.2.2
# %pip install seaborn==0.13.2


In [None]:
# %pip install scikit-learn==1.5.1

In [None]:
# Import the warnings module and silence every warning for the rest of the session.
# NOTE(review): the blanket "ignore" filter also hides deprecation warnings raised by the libraries used below.
import warnings
warnings.filterwarnings("ignore")

In [None]:
import numpy as np
import pandas as pd
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)

## Step 1: Import and Inspect the Dataset

In [None]:
housing = pd.read_csv("train.csv")

In [None]:
housing.head()

In [None]:
housing.shape

In [None]:
housing.describe()


In [None]:
housing.info()

In [None]:
housing.isnull().sum()/housing.shape[0]*100

## Step 2: Data Cleaning

In [None]:
# Columns whose missing values are replaced with the explicit category "None"
# (presumably NaN here means the house lacks that feature - confirm against the data dictionary).
cols = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType2', 'FireplaceQu', 'GarageType',
        'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']
# Assign the filled columns back to the frame. Calling fillna(inplace=True) on a column
# selection is chained assignment: pandas 2.2 deprecates it and, with Copy-on-Write enabled,
# it modifies a temporary object and leaves `housing` unchanged.
housing[cols] = housing[cols].fillna("None")

In [None]:
housing.info()

In [None]:
#import Visualisation libs
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Check whether the target variable SalePrice is normally distributed.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is its replacement.
plt.figure(figsize=[6, 6])
sns.histplot(housing['SalePrice'], kde=True, stat="density")
plt.show()

## We can see that the target variable SalePrice is right-skewed

In [None]:
print("Skewness: ",housing['SalePrice'].skew())
print("Kurtosis: ",housing['SalePrice'].kurt())

## We can observe that the target variable has a skewness greater than 1 and a high density around a SalePrice of 160000

## Hence we apply a data transformation to this variable


In [None]:
# Log-transform the target variable.
# NOTE(review): this overwrites SalePrice in place, so re-running this cell applies the log a second time;
# run it exactly once per kernel session.
housing['SalePrice'] = np.log(housing["SalePrice"])

In [None]:
# Distribution of the target variable SalePrice after the log transformation.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is its replacement.
plt.figure(figsize=(6, 6))
sns.histplot(housing['SalePrice'], kde=True, stat="density")
plt.show()

In [None]:
print("Skewness: ",housing['SalePrice'].skew())
print("Kurtosis: ",housing['SalePrice'].kurt())

## We can now see an approximately normal distribution, and the skewness and kurtosis are reduced

- Drop the `Id` column
- Convert `MSSubClass`, `OverallQual`, `OverallCond` to object datatype
- Convert `LotFrontage`, `MasVnrArea` to numeric datatype

In [None]:
# Drop the Id column. errors="ignore" keeps the cell re-runnable: the original in-place drop
# raised KeyError on a second execution because the column was already gone.
housing = housing.drop(columns="Id", errors="ignore")

In [None]:
# Cast these three columns to object dtype so they are picked up as categorical columns later on.
housing = housing.astype(dict.fromkeys(['MSSubClass', 'OverallQual', 'OverallCond'], 'object'))

In [None]:
# Force both columns to a numeric dtype; values that cannot be parsed become NaN (errors='coerce').
for numeric_col in ('LotFrontage', 'MasVnrArea'):
    housing[numeric_col] = pd.to_numeric(housing[numeric_col], errors='coerce')

In [None]:
null_cols = housing.columns[housing.isnull().any()]

In [None]:
housing.info()

In [None]:
null_cols

In [None]:
# Impute the remaining missing values column by column:
#   - int64/float64 columns get the column mean,
#   - every other column gets its most frequent value (mode).
# The filled column is assigned back to the frame. Series.fillna(inplace=True) on a column
# selection is chained assignment: pandas 2.2 deprecates it and, with Copy-on-Write enabled,
# it leaves `housing` unchanged.
# NOTE(review): the mean/mode are computed on the full dataset, before the train/test split.
for col in null_cols:
    if housing[col].dtype == np.float64 or housing[col].dtype == np.int64:
        housing[col] = housing[col].fillna(housing[col].mean())
    else:
        housing[col] = housing[col].fillna(housing[col].mode()[0])

In [None]:
housing.isna().sum()

## Step 3: Exploratory Data Analysis on the Dataset

In [None]:
# List of categorical (object-dtype) columns
cat_cols = housing.select_dtypes(include = 'object').columns
cat_cols

In [None]:
# List of numerical (int64/float64) columns.
# Stored under its own name so the categorical list in `cat_cols` is not overwritten;
# the categorical pie-chart loop further down iterates over `cat_cols`.
num_cols = housing.select_dtypes(include=['int64', 'float64']).columns
num_cols

## Univariate Analysis

In [None]:
# Numerical columns:
# draw one box plot per numeric feature to inspect its distribution and spot outliers.
num_cols = housing.select_dtypes(include=['number']).columns
for feature in num_cols:
    fig, ax = plt.subplots(figsize=(8, 5))
    print(feature)
    sns.boxplot(y=housing[feature], ax=ax)
    ax.set_title(f'Box plot of {feature}')
    plt.show()

## We can see outliers in LotFrontage, LotArea, YearBuilt, MasVnrArea, BsmtFinSF1, BsmtUnfSF, etc.

In [None]:
# Categorical columns:
# print the relative frequency of every category and show the same shares as a pie chart.
for column in cat_cols:
    shares = housing[column].value_counts(normalize=True)
    print(shares)
    plt.figure(figsize=[5,5])
    shares.plot.pie(labeldistance=None, autopct = '%1.2f%%')
    plt.legend()
    plt.show()
    print("------------------------------------------------------------------------------------------------------------")
    print("------------------------------------------------------------------------------------------------------------")

##### Condition1, Condition2, BldgType, RoofStyle, etc. have more than 70% of their values in a single category

##### We can look at the percentage of values in each category and infer the same for MSZoning, Street, LandContour, Utilities, LotConfig, LandSlope, etc.


## Bivariate / Multivariate Analysis on the Dataset

In [None]:
# Bar plot of LotFrontage for each MSZoning category
sns.barplot(data=housing, x='MSZoning', y='LotFrontage')
plt.show()

In [None]:
# Bar plot of LotFrontage for each MSSubClass category
sns.barplot(data=housing, x='MSSubClass', y='LotFrontage')
plt.show()

In [None]:
# Plot of HouseStyle vs SalePrice, split by Street
sns.barplot(x='HouseStyle', y='SalePrice', hue='Street', data=housing)
# plt.show must be called; the bare name only echoes the function object as the cell output.
plt.show()

In [None]:
# Plot of BldgType vs SalePrice
sns.barplot(x='BldgType', y='SalePrice', data=housing)
# plt.show must be called; the bare name only echoes the function object as the cell output.
plt.show()

In [None]:
# Plot of BsmtQual vs SalePrice
sns.barplot(x='BsmtQual', y='SalePrice', data=housing)
# plt.show must be called; the bare name only echoes the function object as the cell output.
plt.show()

## Conclusion:

We can see that RL (Residential Low Density) has the highest lot frontage and RM(Residential Medium Density) has the least

We can see that 2-STORY 1946 & NEWER has the highest lot frontage and PUD - MULTILEVEL INCL SPLIT LEV/FOYER has the least

The SalePrice is not showing much variance with respect to the Style of dwelling(one story/two story)

The SalePrice is almost same for all the Building Types(Type of dwelling) and the basement quality, so there is no significant pattern

In [None]:
# Age of the property = year sold minus year built
housing["Age"] = housing["YrSold"].sub(housing["YearBuilt"])
housing["Age"].head()

In [None]:
# Remove YrSold and YearBuilt from the frame (Age was derived from them in the previous step)
housing = housing.drop(columns=["YearBuilt", "YrSold"])


In [None]:
housing.head()

## Correlation between Numerical Columns

In [None]:
# Annotated heatmap of the pairwise correlations between all numeric columns
corr_matrix = housing.corr(numeric_only=True)
plt.figure(figsize=[25, 25])
sns.heatmap(corr_matrix, annot=True, cmap='BuPu')
plt.title("Correlation of Numerical Values")


## Get top 10 correlated columns

In [None]:
# Heatmap of the k numeric columns most positively correlated with SalePrice.
k = 10
# Rows of the correlation matrix with the k largest values in the SalePrice column
# (SalePrice itself is included, with correlation 1).
Cols = housing.corr(numeric_only=True).nlargest(k, "SalePrice").index
numeric_cols = housing[Cols].select_dtypes(include=[np.number])
# np.corrcoef expects variables in rows, hence the transpose.
cm = np.corrcoef(numeric_cols.values.T)
# A single figure: the original opened two, leaving an empty one behind.
plt.figure(figsize=[15, 15])
# fmt='.2f' prints two decimals; the original '2f' is a width spec and printed six.
sns.heatmap(cm, annot=True, square=True, fmt='.2f', cbar=True,
            annot_kws={'size': 10},
            yticklabels=numeric_cols.columns, xticklabels=numeric_cols.columns)
plt.show()

## We can see that


##### GarageArea and GarageCars are highly correlated with coeff of 0.88

##### TotalBsmtSF and 1stFlrSF are highly correlated with coeff of 0.82





##### GrLivArea and TotRmsAbvGrd are highly correlated with coeff of 0.83



In [None]:
# Pairwise scatter plots for the target and a few selected features.
cols = ["SalePrice", "OverallQual", "GarageArea", "GarageCars", "TotalBsmtSF", "FullBath", "Age"]
# sns.pairplot builds its own figure, so the plt.figure(figsize=...) call that used to precede it
# had no effect on the plot and only left an empty extra figure in the output.
sns.pairplot(housing[cols])
plt.show()

##### We can see that Age has a negative correlation with the target variable SalePrice, while TotalBsmtSF and GrLivArea have a positive correlation with SalePrice

## Step 4: Data Preparation

##### Dummy Encoding

In [None]:
housing_num = housing.select_dtypes(include=['int64','float64'])
housing_cat = housing.select_dtypes(include= 'object')

In [None]:
housing_cat

In [None]:
housing_cat_dm =pd.get_dummies(housing_cat, drop_first =True ,dtype =int)

In [None]:
housing_cat_dm

In [None]:
 house = pd.concat([housing_num, housing_cat_dm], axis = 1)


In [None]:
house.head()

In [None]:
house.shape

In [None]:
# split into target and feature variables
X =house.drop(["SalePrice"],axis = 1).copy()
y = house["SalePrice"].copy()

In [None]:
X.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
X_train.shape

In [None]:
y_train.shape

## Scaling the dataset with StandardScaler

In [None]:
num_cols = list(X_train.select_dtypes(include=['int64','float64']).columns)

In [None]:
scaler = StandardScaler()
X_train[num_cols]= scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

### Building a function to calculate evaluation metrics

In [None]:
def eval_metrics(y_train, y_train_pred, y_test, y_pred):
    """Print R2 and RMSE for a model's train and test predictions.

    Parameters
    ----------
    y_train, y_test
        True target values of the training / test split.
    y_train_pred, y_pred
        Predicted target values for the training / test split.

    Returns
    -------
    None
        The four metrics are printed, each formatted to two decimals.
    """
    def two_decimals(value):
        # Same "%.2f" formatting for every reported metric.
        return "%.2f" % value

    # R2 for the train split, then the test split
    print("r2 score (train) = ", two_decimals(r2_score(y_train, y_train_pred)))
    print("r2 score (test) = ", two_decimals(r2_score(y_test, y_pred)))

    # RMSE is the square root of the mean squared error of each split
    train_rmse = mean_squared_error(y_train, y_train_pred) ** 0.5
    test_rmse = mean_squared_error(y_test, y_pred) ** 0.5

    print("RMSE(Train) = ", two_decimals(train_rmse))
    print("RMSE(Test) = ", two_decimals(test_rmse))


## Step 5: Build the ML Model

In [None]:
# import Ml Libs
import sklearn
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV



In [None]:
# Ridge regression: grid-search the regularisation strength alpha (the "lambda" hyperparameter)
# with 5-fold cross-validation, scored by negative mean absolute error.
# The grid used to list 0.0001 twice; the second entry is now 0.001, so no candidate is
# fitted twice and the low end of the grid has no gap between 0.0001 and 0.01.
params = {'alpha':
          [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
           2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10, 20, 50, 100, 500, 1000]}
ridge = Ridge()
ridgeCV = GridSearchCV(estimator=ridge, param_grid=params, scoring='neg_mean_absolute_error', cv=5,
                       return_train_score=True, verbose=1, n_jobs=-1)
ridgeCV.fit(X_train, y_train)
        
         

In [None]:
ridgeCV.best_params_

In [None]:
ridgeCV.cv_results_

In [None]:
# Build the final Ridge model with the alpha chosen by the grid search instead of a hard-coded
# copy of it, so the model cannot drift out of sync with `ridgeCV` if the grid or the data changes.
ridge = Ridge(alpha=ridgeCV.best_params_['alpha'])


In [None]:
ridge.fit(X_train, y_train)

In [None]:
ridge.coef_

In [None]:
y_train_pred = ridge.predict(X_train)
y_pred = ridge.predict(X_test)


In [None]:
eval_metrics(y_train, y_train_pred, y_test, y_pred)

In [None]:
ridgeCV_res = pd.DataFrame(ridgeCV.cv_results_)
ridgeCV_res.head()

In [None]:
# Mean cross-validation score against alpha for the training and validation folds.
# The grid search was scored with 'neg_mean_absolute_error', so the y-axis is negative MAE, not R2.
# param_alpha is stored as an object column; cast it to float before plotting on a log axis.
ridge_alphas = ridgeCV_res['param_alpha'].astype(float)
plt.plot(ridge_alphas, ridgeCV_res['mean_train_score'], label='train')
plt.plot(ridge_alphas, ridgeCV_res['mean_test_score'], label='test')
plt.xlabel('alpha')
plt.ylabel('Negative Mean Absolute Error')
plt.xscale('log')
plt.legend()
# plt.show must be called; the bare name only echoes the function object as the cell output.
plt.show()

In [None]:
# Lasso regression: grid-search the regularisation strength alpha ("lambda") over the `params`
# grid defined for the Ridge search, using 5-fold CV scored by negative mean absolute error.
lasso =Lasso()
lassoCV = GridSearchCV(estimator=lasso, param_grid=params, scoring='neg_mean_absolute_error' , cv = 5,
                       return_train_score=True, verbose=1, n_jobs=-1)
lassoCV.fit(X_train, y_train)

In [None]:
lassoCV.best_params_

In [None]:
# Build the final Lasso model with the alpha chosen by the grid search instead of a hard-coded
# copy of it, so the model always matches what `lassoCV.best_params_` reports.
lasso = Lasso(alpha=lassoCV.best_params_['alpha'])


In [None]:
lasso.fit(X_train, y_train)

In [None]:
lasso.coef_

In [None]:
y_train_pred1 = lasso.predict(X_train)
y_pred1 = lasso.predict(X_test)


In [None]:
# Evaluate the Lasso model: pass the Lasso predictions (y_train_pred1 / y_pred1).
# The original call reused y_train_pred / y_pred, which hold the Ridge predictions,
# so it simply re-printed the Ridge metrics.
eval_metrics(y_train, y_train_pred1, y_test, y_pred1)

In [None]:
lassoCV_res = pd.DataFrame(lassoCV.cv_results_)
lassoCV_res.head()

In [None]:
# Mean cross-validation score against alpha for the training and validation folds.
# The grid search was scored with 'neg_mean_absolute_error', so the y-axis is negative MAE, not R2.
# param_alpha is stored as an object column; cast it to float before plotting on a log axis.
lasso_alphas = lassoCV_res['param_alpha'].astype(float)
plt.plot(lasso_alphas, lassoCV_res['mean_train_score'], label='train')
plt.plot(lasso_alphas, lassoCV_res['mean_test_score'], label='test')
plt.xlabel('alpha')
plt.ylabel('Negative Mean Absolute Error')
plt.xscale('log')
plt.legend()
# plt.show must be called; the bare name only echoes the function object as the cell output.
plt.show()

In [None]:
# Coefficient table: one row per feature (indexed by the column names of X),
# one column per model.
# The former `betas.rows = X.columns` line was removed: DataFrame has no `rows` property, so it
# only attached a stray attribute (pandas warns about this) and never changed the table.
betas = pd.DataFrame(index=X.columns)
betas["Ridge"] = ridge.coef_
betas["Lasso"] = lasso.coef_
betas



In [None]:
# View the features removed by lasso, i.e. those whose Lasso coefficient is exactly 0
lasso_cols_removed = list(betas[betas['Lasso']==0].index)
print(lasso_cols_removed)

In [None]:
# View the features selected (kept) by lasso, i.e. those with a non-zero Lasso coefficient
lasso_cols_selected = list(betas[betas['Lasso']!=0].index)
print(lasso_cols_selected)

In [None]:
print(len(lasso_cols_removed))# number of features removed by lasso (recorded by the author as 179)
print(len(lasso_cols_selected))# number of features selected by lasso (recorded by the author as 107)

### Top 10 features most significant in predicting the value of a house, according to both the Ridge model and the Lasso model

In [None]:
# view the top 10 coefficients of Ridge regression in descending order
betas['Ridge'].sort_values(ascending=False)[:10]

In [None]:
# The target was log-transformed, so take the exponential (inverse log) of the Ridge betas
# to read each coefficient as a multiplicative factor on the original SalePrice scale.
# NOTE(review): the features were standardised before fitting, so each factor presumably refers to a
# one-standard-deviation change in the feature rather than a one-unit change - confirm before interpreting.
ridge_coeffs = np.exp(betas['Ridge'])
ridge_coeffs.sort_values(ascending=False)[:10]

In [None]:
# View the top 10 coefficients of Lasso regression in descending order
betas['Lasso'].sort_values(ascending=False)[:10]

In [None]:
# The target was log-transformed, so take the exponential (inverse log) of the Lasso betas
# to read each coefficient as a multiplicative factor on the original SalePrice scale.
# NOTE(review): the features were standardised before fitting, so each factor presumably refers to a
# one-standard-deviation change in the feature rather than a one-unit change - confirm before interpreting.
lasso_coeffs = np.exp(betas['Lasso'])
lasso_coeffs.sort_values(ascending=False)[:10]
                      

In [None]:
%%markdown
## Conclusion

Below are the Top 10 features with corresponding coefficients according to Ridge model

-OverallQual_9           1.133067
-Neighborhood_StoneBr    1.098731
-OverallQual_8           1.088522
-Neighborhood_Crawfor    1.087434
-Exterior1st_BrkFace     1.086508
-Neighborhood_NridgHt    1.084547
-LandContour_HLS         1.074082
-CentralAir_Y            1.072557
-OverallCond_9           1.072201
-BsmtCond_TA             1.067943
Name: Ridge, dtype: float64

Below are the Top 10 features with corresponding coefficients according to the Lasso model
-PoolQC_None             3.396703
-PoolArea                0.254439
-OverallQual_9           0.214585
-OverallQual_10          0.195207
-SaleCondition_Alloca    0.169205
-GarageCond_Po           0.163540
-SaleType_Oth            0.162588
-MSZoning_FV             0.155060
-OverallQual_8           0.147330
-OverallCond_9           0.135807
Name: Lasso, dtype: float64


Therefore, the price of the house increases by a factor of about 1.11 with an increase in GrLivArea.
The price of the house can increase by about 1.08 times if the finish of the house is Very Good.
If the house has central air conditioning, the price can increase by about 1.07 times (CentralAir_Y above).
If the basement condition is typical, the house price may increase by up to about 1.07 times (BsmtCond_TA above).
The price of the house may increase if it is located in the Crawford, Stone Brook or Northridge Heights neighborhoods (physical locations within Ames city limits).

#Optimal value of lambda for Ridge Regression = 9
#Optimal value of lambda for Lasso = 0.001
