In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so a
# plain import raises ImportError on current installs. Keep the original
# import where it still exists and otherwise define a drop-in replacement so
# the rest of the notebook runs unchanged.
# NOTE: scikit-learn removed this dataset because of ethical concerns about
# the 'B' feature; for new work prefer e.g. `fetch_california_housing`.
try:
    from sklearn.datasets import load_boston
except ImportError:
    def load_boston():
        """Rebuild the Boston housing dataset from the original StatLib file.

        Follows the recipe given in scikit-learn's deprecation notice: each
        record in the source file is split across two physical lines, which
        are stitched back together here. Requires network access and the
        `pd` / `np` aliases imported in the notebook's first cell.

        Returns:
            sklearn.utils.Bunch with `data` (n_samples x 13 array),
            `target` (median home value), `feature_names` and `DESCR`.
        """
        from sklearn.utils import Bunch

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_values = pd.read_csv(
            data_url, sep=r"\s+", skiprows=22, header=None
        ).values
        # Even rows carry the first 11 features; odd rows carry the last two
        # features followed by the target.
        data = np.hstack([raw_values[::2, :], raw_values[1::2, :2]])
        target = raw_values[1::2, 2]
        feature_names = np.array([
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ])
        return Bunch(
            data=data,
            target=target,
            feature_names=feature_names,
            DESCR="Boston house prices dataset (rebuilt from StatLib).",
        )

In [None]:
# Load the Boston housing data; later cells read its .data, .target and
# .feature_names fields.
boston=load_boston()

In [None]:
# List the fields available on the loaded dataset object.
boston.keys()

In [None]:
# Show the feature names; they become the DataFrame column labels below.
print(boston.feature_names)

Preparing the Dataset

In [None]:
# Assemble one table: the feature matrix with named columns, plus the
# regression target appended as a 'Price' column.
df = (
    pd.DataFrame(data=boston.data, columns=boston.feature_names)
    .assign(Price=boston.target)
)
df.head()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

Exploratory Data Analysis

In [None]:
# Pairwise Pearson correlation between all columns (features and Price).
# A negative coefficient means the two columns move in opposite directions:
# as one increases, the other tends to decrease.
df.corr(method="pearson")

In [None]:
# Price against the CRIM column, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(df['CRIM'], df['Price'])
ax.set_xlabel("CRIME RATE")
ax.set_ylabel("PRICE")

In [None]:
# Price against the CHAS column, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(df['CHAS'], df['Price'])
ax.set_xlabel("CHAS")
ax.set_ylabel("PRICE")

In [None]:
# Price against the LSTAT column, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(df['LSTAT'], df['Price'])
ax.set_xlabel("LSTAT")
ax.set_ylabel("PRICE")

In [None]:
# Separate the predictors (every column except 'Price') from the target.
X = df.drop(columns=["Price"])
y = df["Price"]

In [None]:
# Train/test split: hold out 30% of the rows for evaluation. The fixed
# random_state makes the shuffle, and therefore the split, reproducible.
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [None]:
# Standardizing the dataset: StandardScaler rescales each feature to zero
# mean and unit variance. It is only created here; fitting happens in the
# next cell.
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits. The test split is never used for
# fitting: re-fitting on it would leak test-set information into the
# preprocessing and bias the evaluation.
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Sanity check: print the scaled test features.
print(X_test)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the scaled training data. `fit` returns the
# estimator itself, so creation and fitting can be chained.
reg = LinearRegression().fit(X_train, y_train)
reg

In [None]:
# Show the fitted parameters: one coefficient per scaled feature, then the
# intercept on its own line.
print(reg.coef_, reg.intercept_, sep="\n")

In [None]:
# Predict prices for the held-out (scaled) test features.
y_pred=reg.predict(X_test)

In [None]:
# Predicted vs. actual prices on the test set. Points close to the diagonal
# indicate good predictions. Axes are labelled so the figure reads on its own.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set_xlabel("Actual price")
ax.set_ylabel("Predicted price")
ax.set_title("Predicted vs. actual prices (test set)");

In [None]:
# Residuals (errors): actual minus predicted price for each test row.
residuals = y_test.sub(y_pred)
residuals

In [None]:
# Density of the residuals. For a well-behaved model this should look
# roughly like a normal distribution.
sns.displot(data=residuals, kind="kde")

In [None]:
# Residuals against predictions. For a good fit the points should be spread
# uniformly around zero with no visible pattern. The dashed zero line and
# axis labels make that easy to judge.
fig, ax = plt.subplots()
ax.scatter(y_pred, residuals)
ax.axhline(0, color="grey", linestyle="--", linewidth=1)
ax.set_xlabel("Predicted price")
ax.set_ylabel("Residual (actual - predicted)")
ax.set_title("Residuals vs. predictions (test set)");

USING METRICS

In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_error

# Test-set error metrics, printed one per line in the order MAE, MSE, RMSE.
# The MSE is computed once and reused for the RMSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(mae, mse, rmse, sep="\n")

In [None]:
# R-squared and adjusted R-squared on the test set.
from sklearn.metrics import r2_score

r2score = r2_score(y_test, y_pred)
print(r2score)  # closer to 1 is better

# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - k - 1)
n = len(y_test)        # number of observations
k = X_test.shape[1]    # number of predictor variables (features)
unexplained = (1 - r2score) * (n - 1)
ar2score = 1 - unexplained / (n - k - 1)  # expected to be below the plain R^2
print(ar2score)

PICKLING THE MODEL

In [None]:
import pickle

# Persist the fitted regression model. The context manager guarantees the
# file is flushed and closed even if pickling fails.
# NOTE: only the model is saved. New inputs must still be scaled with the
# same fitted scaler (`sc`) before they are passed to predict.
with open('reg.pkl', 'wb') as model_file:
    pickle.dump(reg, model_file)

In [None]:
# Load the pickled model back from disk, closing the file handle afterwards.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open('reg.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)

In [None]:
# Scale the first raw observation with the already-fitted scaler so it matches
# what the model was trained on. The scaler was fitted on a DataFrame, so the
# row is wrapped with the same column names; passing a bare ndarray makes
# scikit-learn warn about missing feature names.
point = sc.transform(
    pd.DataFrame(boston.data[0].reshape(1, -1), columns=boston.feature_names)
)

In [None]:
# Predict the price of the scaled sample with the reloaded model.
pickled_model.predict(point)