### **1. Importing the required packages**

In [None]:
#packages for data reading and manipulation
import pandas as pd
import numpy as np

#machine learning related packages
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

### **2. Reading and Exploring the Data**

In [None]:
# Load the dataset from Boston.csv; the relative path is resolved against the current working directory.
data = pd.read_csv('Boston.csv')

In [None]:
data.head() # preview the first rows for a quick look (head() returns 5 rows by default)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [None]:
data.shape  # (number of rows, number of columns) of the DataFrame

(506, 14)

#### **Steps to be performed during Data Exploration**

1. Null values
2. Duplicate values
3. Data types in each column
4. Outliers present in the data
5. Necessary visualizations

In [None]:
data.isnull().sum() # count of missing values per column (0 means the column is complete)

Unnamed: 0,0
crim,0
zn,0
indus,0
chas,0
nox,0
rm,0
age,0
dis,0
rad,0
tax,0


In [None]:
data.duplicated().sum() # number of rows that exactly repeat an earlier row

np.int64(0)

In [None]:
data[data.duplicated()] # show the duplicate rows themselves (an empty frame when there are none)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv


In [None]:
# Reassign rather than mutate with inplace=True: the cell then reads as a plain
# transformation of `data` and gives the same result however often it is re-run.
data = data.drop_duplicates()  # drop repeated rows, keeping the first occurrence of each

In [None]:
data.dtypes # data type of each column (useful to verify the features are numeric before modelling)

Unnamed: 0,0
crim,float64
zn,float64
indus,float64
chas,int64
nox,float64
rm,float64
age,float64
dis,float64
rad,int64
tax,int64


##### **Check and remove outliers**

In [None]:
data.describe() # per-column summary statistics; a max far above the 75% quartile (e.g. crim) hints at outliers

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


### **Machine Learning Process**

1. Create X and y variables to store the input (feature) columns and the output (target) column.
2. Split the data into training and testing sets.
3. Standardization/Scaling of the data.
4. Apply the suitable machine learning algorithms on the training data to learn the patterns.
5. Test the performance/accuracy of the model using the testing data before we use our model in the real world.

In [None]:
# target column first, then every remaining column as a predictor
y = data['medv']
X = data.drop('medv', axis = 1)

In [None]:
# Split the data into training (80%) and testing (20%) sets.
# random_state pins the shuffle so the split -- and every coefficient and metric
# computed from it -- is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#### **Apply Linear Regression on the data**

In [None]:
# fit() returns the estimator itself, so construction and training can be chained
lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg  # show the fitted estimator as the cell output

In [None]:
lin_reg.intercept_  # c value: the constant term of the fitted equation y = m1*x1 + ... + m13*x13 + c

np.float64(29.771279589488955)

In [None]:
lin_reg.coef_   # m1 to m13 values: one coefficient per column of X, in the same order as X's columns

array([-5.15879683e-02,  2.92045603e-02,  4.04012577e-02,  1.11204432e+00,
       -1.76322078e+01,  4.69543503e+00, -5.84568500e-03, -1.39724448e+00,
        2.38323699e-01, -1.29074979e-02, -9.60852404e-01,  1.12543409e-02,
       -4.32698847e-01])

In [None]:
# predict medv for the held-out test rows; these predictions are what the metrics are computed on
y_pred = lin_reg.predict(X_test)

In [None]:
y_pred[:10]  # preview the first 10 predictions rather than printing the whole array

array([17.69696771, 19.01686297, 12.90028099, 12.00336183, 20.22250831,
       21.15398555, 15.45021394, 21.53636151, 36.50924354, 14.05924098,
       30.46618738, 18.42183135, 25.67004171, 20.06480708, 20.1179504 ,
       30.61983904, 21.46566808, 23.31427289, 29.43853094, 35.19229766,
       40.55930958, 19.92127037, 21.16829586, 23.3948522 , 21.21746192,
       19.65528778, 16.64294476, 32.44777906, 14.28865435, 25.63910056,
       22.7137175 , 25.43420271, 20.21648241, 28.83484843, 21.28359686,
       15.2285237 , 12.49371931, 24.62094294, 22.12675636,  2.76647724,
       17.63115771, 25.29575347, 16.82794815, 26.19875499, 13.04019609,
        9.41756625, 18.79713764, 22.71100311, 20.82890766, 30.68493189,
       20.52734329, 33.16917367, 10.11059746, 17.23056221, 43.0225402 ,
       10.18541828, 20.99508022, 20.38025839, 15.77787434, 24.51051083,
       26.96165012, 12.31143467, 39.42209024, 18.28213913, 25.59100974,
       17.27777233, 12.53438513,  8.6730527 , 22.38671924, 25.38

In [None]:
y_test  # actual medv values for the test rows, to compare against y_pred

Unnamed: 0,medv
21,19.6
380,10.4
354,18.2
30,12.7
315,16.2
...,...
466,19.0
49,19.4
389,11.5
199,34.9


In [None]:
r2_score(y_test, y_pred)  # R2 on the test set; argument order is (actual, predicted) and 1.0 is a perfect fit

0.662855480251003

#### **Performance metrics in Regression Models**

1. R2 Score
2. Adjusted R2 Score
3. Mean squared error (MSE)
4. Root Mean squared Error (RMSE)

In [None]:
# mean of the squared differences between actual and predicted values on the test set
mse = mean_squared_error(y_test, y_pred)

In [None]:
mse  # expressed in squared units of medv, so not directly comparable to the prices

29.02625975558973

In [None]:
# taking the square root brings the error back to the same units as medv
rmse = np.sqrt(mse)

In [None]:
rmse  # typical size of a prediction error, in the units of medv

np.float64(5.387602412538413)

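If we predict the price of any house, the predicted price will typically differ from the actual price by about 5.39 (the RMSE, expressed in the same units as `medv`). Because the errors are squared before averaging, large misses weigh more heavily in this figure than they would in a simple average of absolute errors.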