#### LINEAR REGRESSION

In [2]:
import pandas as pd
import numpy as np 

In [3]:
boston = pd.read_csv("BostonHousing.csv")

In [4]:
boston.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [None]:
# Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
        - LSTAT    % lower status of the population
        - MEDV     Median value of owner-occupied homes in $1000's

In [6]:
boston.shape

(506, 14)

In [7]:
boston.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       501 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  b        506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


In [8]:
boston = boston.dropna()

In [9]:
boston.isna().sum()

crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
b          0
lstat      0
medv       0
dtype: int64

In [10]:
boston.shape

(501, 14)

#### MACHINE LEARNING BEGINS

In [11]:
features = boston.iloc[  :  , 0 : -1] # all the rows and all the columns excpet the final column
features

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48


In [13]:
target = boston.iloc[  :  , -1 ] # all the rows and the final column
target.head()

0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: medv, dtype: float64

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
x_train, x_test, y_train, y_test = train_test_split(features,target)

In [17]:
x_train.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
436,14.4208,0.0,18.1,0,0.74,6.461,93.3,2.0026,24,666,20.2,27.49,18.05
260,0.54011,20.0,3.97,0,0.647,7.203,81.8,2.1121,5,264,13.0,392.8,9.59
253,0.36894,22.0,5.86,0,0.431,8.259,8.4,8.9067,7,330,19.1,396.9,3.54
491,0.10574,0.0,27.74,0,0.609,5.983,98.8,1.8681,4,711,20.1,390.11,18.07
38,0.17505,0.0,5.96,0,0.499,5.966,30.2,3.8473,5,279,19.2,393.43,10.13


In [18]:
x_test.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
393,8.64476,0.0,18.1,0,0.693,6.193,92.6,1.7912,24,666,20.2,396.9,15.17
453,8.24809,0.0,18.1,0,0.713,7.393,99.3,2.4527,24,666,20.2,375.87,16.74
360,4.54192,0.0,18.1,0,0.77,6.398,88.0,2.5182,24,666,20.2,374.56,7.79
214,0.28955,0.0,10.59,0,0.489,5.412,9.8,3.5875,4,277,18.6,348.93,29.55
27,0.95577,0.0,8.14,0,0.538,6.047,88.8,4.4534,4,307,21.0,306.38,17.28


In [19]:
y_train.head()

436     9.6
260    33.8
253    42.8
491    13.6
38     24.7
Name: medv, dtype: float64

In [20]:
y_test.head()

393    13.8
453    17.8
360    25.0
214    23.7
27     14.8
Name: medv, dtype: float64

In [21]:
print(boston.shape)
print(features.shape)
print(target.shape) 
print(x_train.shape) # TRAINING DATASET FROM FEATURES
print(x_test.shape)  # TESTING DATASET FROM FEATURES
print(y_train.shape) # TRAINING DATASET FROM TARGET
print(y_test.shape)  # TESTING DATASET FROM TARGET

(501, 14)
(501, 13)
(501,)
(375, 13)
(126, 13)
(375,)
(126,)


In [22]:
from sklearn.linear_model import LinearRegression

In [23]:
my_lr_model = LinearRegression()

In [24]:
my_lr_model.fit(x_train,y_train)

In [25]:
my_lr_model.coef_

array([-1.26612475e-01,  4.88303682e-02,  1.18388468e-02,  1.81904711e+00,
       -1.58678027e+01,  4.05611568e+00,  2.74981007e-03, -1.58888002e+00,
        3.07369258e-01, -1.23716837e-02, -9.25301586e-01,  8.67500310e-03,
       -5.51432371e-01])

In [26]:
my_lr_model.intercept_

34.46060876493335

In [27]:
Sales_generated = my_lr_model.coef_ * 50 + my_lr_model.intercept_
Sales_generated

array([  28.12998501,   36.90212717,   35.0525511 ,  125.41296412,
       -758.92952419,  237.2663929 ,   34.59809927,  -44.98339212,
         49.82907166,   33.84202458,  -11.80447054,   34.89435892,
          6.88899021])

In [28]:
y_preds = my_lr_model.predict(x_test)
y_preds

array([20.63623364, 23.15563523, 23.4733937 , 10.39255858, 14.72381606,
       14.26304134, 14.14739681, 23.6496808 , 20.93369728, 20.04607748,
       20.2075719 , 38.42305953, 10.09902404, 28.57310267, 27.11132507,
       27.38031553, 14.42438857, 15.1520288 ,  6.61813501, 19.87571442,
       22.70243798,  8.14475523, 31.10940931, 28.6909701 , 19.12348915,
       25.06084671, 13.42638369, 27.92777194, 21.95675466, 26.10828731,
       17.67205212, 16.75774842, 24.5088748 , 25.22955914, 23.76152524,
       24.28204809, 19.41698711, 11.43615563, 13.77255516, 20.19943488,
       31.46316233, -0.06282161, 12.6211195 , 16.84841012, 25.95349567,
       13.20522182, 13.62337052, 25.65749568,  2.74056577, 16.27211411,
       15.42515762, 21.24451399, 10.74824129, 34.6838357 , 18.68879922,
       14.43431482, 18.93695684, 33.71490968, 24.95599195, 18.49926335,
       13.80810894, 24.12005816, 23.83711407, 23.02578096, 20.17011829,
        5.38186364, 20.84343622, 20.5610884 , 24.32993919, 15.73

In [29]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [30]:
print(mean_squared_error(y_test,y_preds)) # LINEAR REGRESSION

20.249424039313656


In [31]:
print(mean_absolute_error(y_test,y_preds)) # LINEAR REGRESSION

3.352024284441651
