<a href="https://colab.research.google.com/github/SuhailAhmad1/Machine_learning/blob/main/Linear_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Linear** **Regression**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

from sklearn.metrics import mean_absolute_error

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Seed NumPy's global RNG so any NumPy-based randomness below is repeatable.
np.random.seed(306)

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and
# Matplotlib 3.8 removed the old names, so plt.style.use('seaborn') raises
# OSError on current installs. Use the new name when it exists and fall back
# to the legacy one on older Matplotlib versions.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [None]:
# Cross-validation strategy reused below: 10 random splits, each holding out
# 20% of the rows for testing, with a fixed seed for repeatability.
shuffle_split_cv = ShuffleSplit(
    n_splits=10,
    test_size=0.2,
    random_state=0,
)

# Step 1: Load the dataset

In [None]:
# Fetch the California housing data as pandas objects, already split into
# the feature table and the target column.
features, label = fetch_california_housing(
    return_X_y=True,
    as_frame=True,
)

In [None]:
# Report the dimensions of the feature matrix and of the label vector.
for caption, data in (("Shape of Features : ", features),
                      ("Shape of Label : ", label)):
    print(caption, data.shape)

Shape of Features :  (20640, 8)
Shape of Label :  (20640,)


# Step 3: Preprocessing and Model Building

In [None]:
from sklearn.model_selection import train_test_split

# No test_size is given, so scikit-learn's default split proportion applies;
# the fixed random_state makes the shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    random_state=42,
)

In [None]:
# Row counts of the two partitions produced by the split.
n_train_rows = len(X_train)
n_test_rows = len(X_test)
print("Trainings Samples : ", n_train_rows)
print("Test Samples : ", n_test_rows)

Trainings Samples :  15480
Test Samples :  5160


In [None]:
# Preview the first rows of the training features (index shows the original row ids).
X_train.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
8158,4.2143,37.0,5.288235,0.973529,860.0,2.529412,33.81,-118.12
18368,5.3468,42.0,6.364322,1.08794,957.0,2.404523,37.16,-121.98
19197,3.9191,36.0,6.110063,1.059748,711.0,2.235849,38.45,-122.69
3746,6.3703,32.0,6.0,0.990196,1159.0,2.272549,34.16,-118.41
13073,2.3684,17.0,4.795858,1.035503,706.0,2.088757,38.57,-121.33


In [None]:
# Preview the first training labels; their index matches the rows of X_train.
y_train.head()

8158     2.285
18368    2.799
19197    1.830
3746     4.658
13073    1.500
Name: MedHouseVal, dtype: float64

# 3.2: Pipeline and Preprocessing + Model

In [None]:
# Two-stage pipeline: standardise every feature, then fit a linear regression
# on the scaled values. The scaler is fitted on the training data only.
pipeline_steps = [
    ("feature_scaling", StandardScaler()),
    ("lin_reg", LinearRegression()),
]
lin_reg_pipeline = Pipeline(pipeline_steps)
lin_reg_pipeline.fit(X_train, y_train)


Pipeline(steps=[('feature_scaling', StandardScaler()),
                ('lin_reg', LinearRegression())])

In [None]:
# The last pipeline step is the fitted LinearRegression; read its parameters.
fitted_lin_reg = lin_reg_pipeline[-1]
print("Intercept (w_0) : ", fitted_lin_reg.intercept_)
print("Weight Vector : ", fitted_lin_reg.coef_)

Intercept (w_0) :  2.0703489205426377
Weight Vector :  [ 0.85210815  0.12065533 -0.30210555  0.34860575 -0.00164465 -0.04116356
 -0.89314697 -0.86784046]


# 4. Model Evaluation

4.1 **score**

In [None]:
# Score the fitted pipeline on the held-out rows and on the rows it was
# trained on, so the two numbers can be compared side by side.
test_score = lin_reg_pipeline.score(X_test, y_test)
train_score = lin_reg_pipeline.score(X_train, y_train)

print("Model performence on Test set : ",test_score)
print("Model performence on Training set : ",train_score)

Model performence on Test set :  0.5910509795491352
Model performence on Training set :  0.609873031052925


4.2 **Cross Validation**

In [None]:
# Evaluate the pipeline on every split of shuffle_split_cv, using the training
# data only. The scorer returns negated MSE, hence the negative values printed.
cv_options = dict(scoring='neg_mean_squared_error', cv=shuffle_split_cv)
lin_reg_score = cross_val_score(lin_reg_pipeline, X_train, y_train, **cv_options)

print(lin_reg_score)

[-0.50009976 -0.52183352 -0.55931218 -0.52110499 -0.56059203 -0.50510767
 -0.52386194 -0.54775518 -0.5007161  -0.54713448]


In [None]:
# Undo the sign flip of the 'neg_mean_squared_error' scorer, then summarise
# the per-split errors by their mean and standard deviation.
lin_reg_mse = np.negative(lin_reg_score)
mse_mean = lin_reg_mse.mean()
mse_std = lin_reg_mse.std()
print("Mean squared error : ", mse_mean, '+/-', mse_std)

Mean squared error :  0.5287517875396764 +/- 0.022232904569798696


# Practice Assignment Solutions

In [None]:
from sklearn.datasets import fetch_california_housing
import numpy as np
import pandas as pd

In [None]:
# Load the dataset again for the assignment: X is the feature DataFrame and
# y the target Series.
X, y = fetch_california_housing(
    as_frame=True,
    return_X_y=True,
)

In [None]:
#Q1: Shape of feature matrix, as (rows, columns)
X.shape

(20640, 8)

In [None]:
#Q2: Missing values -- total number of null entries, summed over all columns
X.isnull().sum().sum()

0

In [None]:
#Q3: split into train/test sets and read one value from the test features
from sklearn.model_selection import train_test_split

# shuffle=False keeps the original row order, so the test set is the final 20%
# of the rows. random_state only controls shuffling and therefore has no
# effect here; it is kept so the call matches the assignment statement.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False, random_state=0)

# Third test row, second column, selected by position. Reading it with .iloc
# leaves X_test as a DataFrame; overwriting it with np.array(X_test) would drop
# the column names and make this cell behave differently when re-run.
X_test.iloc[2, 1]

28.0

In [None]:
#Q4: "Population" column of the training features
from sklearn.model_selection import train_test_split

# Ordered (unshuffled) 80/20 split, repeated here so the cell is self-contained.
split_options = dict(test_size=0.2, shuffle=False, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_train["Population"]

0         322.0
1        2401.0
2         496.0
3         558.0
4         565.0
          ...  
16507    1605.0
16508     958.0
16509    1252.0
16510    3006.0
16511    1292.0
Name: Population, Length: 16512, dtype: float64

In [None]:
#Q4: Mean of training output label
y_train.mean()

2.020670313105598

In [None]:
#Q5: Median of the test output label
y_test.median()

2.1475

In [None]:
#Q5: coefficients of our model
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Scale the features, then fit a linear regression on the ordered training split.
scaler_step = ('standard_scaalr', StandardScaler())
regressor_step = ('linear_regression', LinearRegression())
lin_pipe = Pipeline([scaler_step, regressor_step])
lin_pipe.fit(X_train,y_train)

# The regressor is the last step; its coef_ holds one weight per feature.
print("Coeficients are : ", lin_pipe[-1].coef_)

Coeficients are :  [ 0.83817967  0.12032096 -0.32135834  0.36456599 -0.00170674 -0.04524631
 -0.87994217 -0.82634126]


In [None]:
#Q6: bias term in the model
# The intercept lives on the final (regression) step of the pipeline.
bias_term = lin_pipe[-1].intercept_
print("Bias Term : ", bias_term)

Bias Term :  2.020670313105614


# Graded Questions Solution

In [None]:
#Q1: r2 value of the fitted pipeline on the test split

lin_pipe.score(X_test,y_test)

0.6605140591531992

In [None]:
#Q2: rmse error
from sklearn.metrics import mean_squared_error

# RMSE is the square root of the mean squared error on the test split.
test_mse = mean_squared_error(y_test, lin_pipe.predict(X_test))
print("root_mean_square_error : ", test_mse**(1/2))

root_mean_square_error :  0.7033383507521878


In [None]:
#Q3:6 on metrics

from sklearn.metrics import explained_variance_score

# Explained variance of the pipeline's predictions on the test split.
explained_variance_score(y_true=y_test, y_pred=lin_pipe.predict(X_test))

0.6605500501742703

In [None]:
from sklearn.metrics import max_error

# Largest absolute residual between the test labels and the predictions.
max_error(y_true=y_test, y_pred=lin_pipe.predict(X_test))

7.260453292958372

In [None]:
from sklearn.metrics import mean_absolute_error

# Average absolute residual on the test split.
mean_absolute_error(y_true=y_test, y_pred=lin_pipe.predict(X_test))

0.5168526993787042

In [None]:
#Q7:8 SGD regressor
from sklearn.linear_model import SGDRegressor

# Same scale-then-regress layout as the linear pipeline, but the regressor is
# trained by stochastic gradient descent; random_state=0 fixes its randomness.
sgd_steps = [
    ("Standard_Scaler", StandardScaler()),
    ("sgd", SGDRegressor(random_state=0)),
]
sgd_pipe = Pipeline(sgd_steps)
sgd_pipe.fit(X_train,y_train)

Pipeline(steps=[('Standard_Scaler', StandardScaler()),
                ('sgd', SGDRegressor(random_state=0))])

In [None]:
# Learned weights of the SGD regressor (the last pipeline step).
sgd_pipe[-1].coef_

array([ 0.84046697,  0.112331  , -0.41213039,  0.21595971, -0.01781887,
       -0.01480892, -0.87394103, -0.83913104])

In [None]:
# Bias term of the SGD regressor; indexing with [0] displays it as a scalar.
sgd_pipe[-1].intercept_[0]

2.011239208754174