<h1><center>Linear Regression</center></h1>

In [None]:
from sklearn.datasets import load_boston

from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

from pprint import pprint

import pandas as pd

import seaborn as sns

import numpy as np

import warnings
warnings.filterwarnings('ignore')

# Load the Boston housing dataset object and print the description text it carries.
# NOTE(review): load_boston appears to be deprecated in scikit-learn 1.0 and
# removed in 1.2 — confirm the pinned scikit-learn version before re-running.
regre_data = load_boston()
print(regre_data.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [None]:
# Build one frame: the feature columns (named after regre_data.feature_names)
# plus the regression target appended as a 'target' column.
data = pd.DataFrame(
    regre_data.data,
    columns=regre_data.feature_names,
).assign(target=regre_data.target)

pprint(data)

        CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0    0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1    0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2    0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3    0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4    0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   
..       ...   ...    ...   ...    ...    ...   ...     ...  ...    ...   
501  0.06263   0.0  11.93   0.0  0.573  6.593  69.1  2.4786  1.0  273.0   
502  0.04527   0.0  11.93   0.0  0.573  6.120  76.7  2.2875  1.0  273.0   
503  0.06076   0.0  11.93   0.0  0.573  6.976  91.0  2.1675  1.0  273.0   
504  0.10959   0.0  11.93   0.0  0.573  6.794  89.3  2.3889  1.0  273.0   
505  0.04741   0.0  11.93   0.0  0.573  6.030  80.8  2.5050  1.0  273.0   

     PTRATIO       B  LSTAT  target  
0       15.3  396.90   4.98    24.0  
1       17.8  396.90   

In [None]:
# Pairwise correlation matrix over every column of `data`, including 'target'.
corr = data.corr()

In [None]:
# Correlation of every column with the target, sorted once from most positive
# to most negative (the 'target' row itself is always first with 1.0).
target_corr = corr['target'].sort_values(ascending=False)

print(target_corr.head(5), '\n')  # 5 most positive correlations
print('----------------------')
print(target_corr.tail(5))  # 5 most negative correlations

target    1.000000
RM        0.695360
ZN        0.360445
B         0.333461
DIS       0.249929
Name: target, dtype: float64 

----------------------
NOX       -0.427321
TAX       -0.468536
INDUS     -0.483725
PTRATIO   -0.507787
LSTAT     -0.737663
Name: target, dtype: float64


In [None]:
# Features selected from the correlation ranking above; 'target' is the label.
X = data.loc[:, ['RM', 'ZN', 'B', 'DIS', 'NOX', 'TAX', 'INDUS', 'PTRATIO', 'LSTAT']]
y = data['target']

# Hold out 20% of the rows for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)

# Creating the model
# NOTE(review): the `normalize` argument appears to be deprecated in
# scikit-learn 1.0 and removed in 1.2 — confirm against the pinned version.
model = LinearRegression(normalize=True)

# Training the model on the training split only. Fitting on the full (X, y)
# would let the model see the held-out rows and inflate the test metrics.
model.fit(X_train, y_train)

print("Coef:", model.coef_)
print("Intercept:", model.intercept_)

Coef: [ 4.16568567e+00  3.81914214e-02  9.67218515e-03 -1.44176599e+00
 -1.44794612e+01 -8.75222778e-04 -2.61859652e-02 -8.83957403e-01
 -5.47483026e-01]
Intercept: 29.862601969146652


In [None]:
# Predict on the held-out rows, then compute each metric before reporting it.
y_predict = model.predict(X_test)

mae = mean_absolute_error(y_test, y_predict)
rmse = np.sqrt(mean_squared_error(y_test, y_predict))
r2_percent = r2_score(y_test, y_predict) * 100  # R^2 scaled to a percentage

print("Mean Absolute Error is ", mae)
print("\nRoot Mean Squared Error is", rmse)
print("\nr2 score is {}".format(r2_percent))

Mean Absolute Error is  3.9495076839120387

Root Mean Squared Error is 5.722660135109558

r2 score is 68.68559345849258


**Cross-Validation**

In [None]:
from sklearn.model_selection import KFold

# 10-fold cross-validation of the same linear model, scored with R^2 per fold.
# NOTE(review): KFold does not shuffle by default, so each fold is a contiguous
# slice of rows; if the rows are ordered, this may explain widely varying
# (even negative) per-fold scores — consider shuffle=True with a random_state.
kf = KFold(n_splits=10)

k_fold_score = []
for train_index, test_index in kf.split(X, y):
    # kf.split yields positional indices, so index both X and y with .iloc;
    # label-based y[...] is only correct while the index is the default 0..n-1.
    X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
    y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]

    # Fold-local names keep the earlier train/test split and fitted `model`
    # intact, so re-running the evaluation cell still uses them.
    fold_model = LinearRegression(normalize=True)
    fold_model.fit(X_fold_train, y_fold_train)
    k_fold_score.append(r2_score(y_fold_test, fold_model.predict(X_fold_test)))

k_fold_score

[0.7268081599511311,
 0.45095362694915664,
 -0.6928472861903945,
 0.6553660579059806,
 0.6137655533968605,
 0.735708488532449,
 0.2978303007145643,
 -0.04574742308878621,
 -0.566149925654913,
 0.38873968830209626]

In [None]:
# Average of the per-fold R^2 scores collected in `k_fold_score`.
np.mean(k_fold_score)

0.2564427240818145

The mean R² score across the folds is 0.256, which is not a good value. For any type of problem, regression or classification, we cannot tell in advance which algorithm will give better results; we need to try the candidate algorithms and then choose the one that gives the best results.