In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


: 

## Description of the dataset

In [None]:
# Load the housing dataset from the CSV file in the working directory.
df = pd.read_csv("HousingData.csv")
# Show the loaded frame as the cell output.
df

In [None]:
# Column dtypes and non-null counts. Author's observation: the data has null
# values, and every column containing nulls is float-typed, so those columns
# are imputed with the median.
df.info()

In [None]:
df.describe()  # summary statistics of the columns

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Median-impute the six columns listed below, one column at a time.
# NOTE(review): the medians are taken over the full frame, i.e. before the
# train/test split performed later, so held-out rows contribute to them.
for column in ['CRIM', 'ZN', 'INDUS', 'CHAS', 'AGE', 'LSTAT']:
    df[column] = df[column].fillna(df[column].median())

In [None]:
# Re-count missing values per column to confirm the result of the fill.
df.isnull().sum()

### Exploratory Data Analysis

In [None]:
# Pairwise correlation matrix of the columns.
df.corr()

### Analysing the correlated features and performing Univariate analysis

In [None]:
# CRIM against MEDV, drawn through an explicit Axes object.
fig, ax = plt.subplots()
ax.scatter(df['CRIM'], df['MEDV'])
ax.set_title("Scatter plot")
ax.set_xlabel("Crime Rate")
ax.set_ylabel("Price")
plt.show()

## negatively correlated (does make sense)

In [None]:
# RM vs MEDV with a fitted regression line: positively correlated.
sns.regplot(data=df, x='RM', y='MEDV')

In [None]:
# LSTAT against MEDV. The x-axis label names the plotted column (LSTAT);
# "Crime Rate" belongs to the separate CRIM plot, not to this one.
plt.scatter(df.LSTAT, df.MEDV)
plt.title("Scatter plot")
plt.xlabel("LSTAT")
plt.ylabel("Price")
plt.show()

# The features are negatively related: a high LSTAT value indicates lower socio-economic status, and thus the price of houses decreases.

In [None]:
# TAX appears negatively correlated with MEDV, which seems weird at first.
# However, high taxes in lower-value areas may exist in older or economically
# burdened municipalities: these towns may raise taxes to maintain public
# services despite a shrinking tax base.
sns.regplot(data=df, x='TAX', y='MEDV')

### Multivariate Analysis

In [None]:
# Pairwise relationship grid over the columns of the frame.
sns.pairplot(data=df)

### Let's separate X and y (independent and dependent columns)

In [None]:
# Features: every column except the last. Target: the last column.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [None]:
# Show the feature frame.
X

In [None]:
# Show the target series.
y

### Let's separate the training and testing data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 82}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Show the training features.
X_train

In [None]:
# Show the held-out test features.
X_test

In [None]:
# Show the training targets.
y_train

In [None]:
# Show the held-out test targets.
y_test

### Let's standardize the features

In [None]:
from sklearn.preprocessing import StandardScaler
# Create the scaler with default settings; it is not fitted yet.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features and transform them in one step.
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled

In [None]:
# Transform the test features with the already-fitted scaler (transform only, no refit).
X_test_scaled = scaler.transform(X_test)
X_test_scaled

## Model Training

In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
# Baseline model: linear regression with default settings.
reg = LinearRegression()

In [None]:
# Fit the linear model on the scaled training features.
reg.fit(X_train_scaled,y_train)

In [None]:
# Predict on the scaled test features and show the predictions.
y_pred = reg.predict(X_test_scaled)
y_pred

## Some assumptions 

In [None]:
## plot a scatter plot for the prediction
# Actual vs. predicted target values on the test split. The title and axis
# labels let the figure be read without the surrounding code.
plt.scatter(y_test, y_pred)
plt.title("Actual vs. predicted")
plt.xlabel("Actual (y_test)")
plt.ylabel("Predicted (y_pred)")
plt.show()

## the plot shows that y_test and y_pred data has a positive linear relationship, also there are some outliers present

In [None]:
## Residuals / errors: actual minus predicted, element-wise on the test split

residuals= y_test-y_pred
residuals

In [None]:
## Plot the residuals as a kernel density estimate

sns.displot(residuals,kind="kde")
# Author's observation: the residuals are mostly normally distributed between -10 and 10 (with some outlier values)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Mean squared error of the linear model's predictions on the test split.
mean_squared_error(y_test, y_pred)

In [None]:
r2_score(y_test,y_pred)  # R² on the test split; the author judged the score quite decent

### hyperparameter tuning

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Candidate regularization strengths shared by all three searches.
alpha_values = [0.01, 0.1, 1, 5, 10, 100]

# One grid per estimator; ElasticNet additionally searches over l1_ratio.
ridge_params = dict(alpha=alpha_values)
lasso_params = dict(alpha=alpha_values)
elastic_params = dict(alpha=alpha_values, l1_ratio=[0.1, 0.5, 0.9])

In [None]:
# Search settings common to all three estimators: 5-fold CV, scored by negative MSE.
search_kwargs = {'cv': 5, 'scoring': 'neg_mean_squared_error'}

ridge = Ridge()
lasso = Lasso()
elastic = ElasticNet()

# fit() returns the search object itself, so construction and fitting are chained.
ridge_grid = GridSearchCV(ridge, ridge_params, **search_kwargs).fit(X_train_scaled, y_train)
lasso_grid = GridSearchCV(lasso, lasso_params, **search_kwargs).fit(X_train_scaled, y_train)
elastic_grid = GridSearchCV(elastic, elastic_params, **search_kwargs).fit(X_train_scaled, y_train)

# Keep the fitted ElasticNet search as the cell's displayed value.
elastic_grid


In [None]:
# Best estimator found by each search, then its predictions on the scaled test set.
ridge_best, lasso_best, elastic_best = (
    search.best_estimator_ for search in (ridge_grid, lasso_grid, elastic_grid)
)
ridge_pred, lasso_pred, elastic_pred = (
    model.predict(X_test_scaled) for model in (ridge_best, lasso_best, elastic_best)
)

# Ridge report
print("Ridge - Best alpha:", ridge_grid.best_params_['alpha'])
print("Ridge - Test MSE:", mean_squared_error(y_test, ridge_pred))
print("Ridge - r2 score:", r2_score(y_test, ridge_pred))

# Lasso report
print("Lasso - Best alpha:", lasso_grid.best_params_['alpha'])
print("Lasso - Test MSE:", mean_squared_error(y_test, lasso_pred))
print("Lasso- r2 score:", r2_score(y_test, lasso_pred))

# ElasticNet report (prints the full best-parameter dict)
print("ElasticNet - Best params:", elastic_grid.best_params_)
print("ElasticNet - Test MSE:", mean_squared_error(y_test, elastic_pred))
print("ElasticNet- r2 score:", r2_score(y_test, elastic_pred))



In [None]:
## After hyperparameter tuning these are the results (*there is not much change, but Ridge seems to perform best, with the highest R² score of about 0.778 — note that R² is not an accuracy percentage).
## I will still use the plain linear model for prediction (just for simplicity), because an R² improvement of about 0.001 is almost negligible.
## Feel free to use any model you like.

## Prediction on the new Data

In [None]:
# Draw one row from X (fixed seed, so the same row each run) as a NumPy array.
X.sample(1, random_state=42).values

In [None]:
## transformation of new data
# Pass the sampled row as a DataFrame rather than `.values`: the scaler was
# fitted on a DataFrame (X_train), and a bare array makes scikit-learn warn
# that the input has no valid feature names.
scaler.transform(X.sample(1, random_state=42))

In [None]:
# Scale one sampled row and predict its target with the linear model.
# The row stays a DataFrame (no `.values`) so the scaler, which was fitted on
# a DataFrame, does not warn about missing feature names.
reg.predict(scaler.transform(X.sample(1, random_state=42)))

## Pickling the Model for Deployment

In [None]:
import pickle

In [None]:
# Serialize the fitted linear model. The context manager closes the file
# handle even if dumping fails.
# NOTE(review): predictions also need the fitted `scaler`, which is not saved here.
with open('regmodel.pkl', 'wb') as model_file:
    pickle.dump(reg, model_file)

In [None]:
# Load the model back from disk; the context manager closes the file handle.
# Only unpickle files from a trusted source: pickle.load can execute arbitrary code.
with open('regmodel.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)

In [None]:
# Check the reloaded model on the same sampled row. The row stays a DataFrame
# (no `.values`) so the scaler, which was fitted on a DataFrame, does not warn
# about missing feature names.
pickled_model.predict(scaler.transform(X.sample(1, random_state=42)))