# Multiple Linear Regression

## Importing the libraries

In [18]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [19]:
# dataset = pd.read_csv("crop_yield_prediction.csv")
# dataset = pd.read_csv("module_2_data.csv")
dataset = pd.read_csv("module_2_more_than_mean_data.csv")
dataset.head()

Unnamed: 0,State_Name,District_Name,Season,Crop,Area,Production
0,Maharashtra,AHMEDNAGAR,Kharif,Chick Peas,40800.0,18600.0
1,Maharashtra,AHMEDNAGAR,Kharif,Sugarcane,45900.0,38940.0
2,Maharashtra,AHMEDNAGAR,Rabi,Wheat,79700.0,87100.0
3,Maharashtra,AHMEDNAGAR,Kharif,Sugarcane,59600.0,5231800.0
4,Maharashtra,AHMEDNAGAR,Rabi,Chick Peas,59600.0,40900.0


In [20]:
dataset.dropna(inplace=True)

In [21]:
dataset.dtypes

State_Name        object
District_Name     object
Season            object
Crop              object
Area             float64
Production       float64
dtype: object

## Feature Scaling

In [22]:
from sklearn import preprocessing


std_scale = preprocessing.StandardScaler().fit(dataset[['Production']])
dataset.Production = std_scale.transform(dataset[['Production']])
dataset.Area = std_scale.transform(dataset[['Area']])

In [23]:
from sklearn.preprocessing import LabelEncoder
  
le = LabelEncoder()
  
dataset['Crop']= le.fit_transform(dataset['Crop'])
dataset['District_Name']= le.fit_transform(dataset['District_Name'])
dataset['Season']= le.fit_transform(dataset['Season'])


In [24]:
dataset.head()

Unnamed: 0,State_Name,District_Name,Season,Crop,Area,Production
0,Maharashtra,0,0,2,-0.25148,-0.264222
1,Maharashtra,0,0,12,-0.248553,-0.252548
2,Maharashtra,0,1,13,-0.229153,-0.224906
3,Maharashtra,0,0,12,-0.24069,2.727949
4,Maharashtra,0,1,2,-0.24069,-0.251423


In [25]:
X = dataset.iloc[:,1:-1].values
y = dataset.iloc[:,-1].values

## Splitting the dataset into the Training set and test set

In [26]:
from sklearn.model_selection import  train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Training the Multiple Linear Regression model on the Training set

In [27]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

## Predicting the Test set results

In [28]:
y_pred = regressor.predict(X_test)

np.set_printoptions(precision=2)

print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)), 1))

[[ 4.99e-01 -2.49e-01]
 [-1.06e-02 -1.69e-01]
 [-1.33e-01 -2.66e-01]
 [-2.92e-01 -2.70e-01]
 [-4.22e-01 -2.45e-01]
 [-1.62e-01 -2.66e-01]
 [ 6.53e-01 -1.51e-01]
 [-1.32e-01 -2.35e-01]
 [-4.77e-01 -2.02e-01]
 [ 8.68e-02 -2.48e-01]
 [-6.21e-01 -2.64e-01]
 [-3.89e-01 -2.39e-01]
 [-4.58e-01 -2.51e-01]
 [-1.15e-01 -2.38e-01]
 [-5.73e-01 -2.62e-01]
 [ 4.77e-02  4.22e-03]
 [ 1.86e-01 -1.77e-01]
 [-3.32e-01 -2.67e-01]
 [ 4.82e-01 -2.53e-01]
 [ 6.54e-02 -2.13e-01]
 [-8.73e-02 -1.20e-01]
 [ 1.00e+00  1.51e+00]
 [ 4.37e-01 -2.59e-01]
 [-7.36e-02 -1.06e-01]
 [-1.49e-01 -2.66e-01]
 [-1.80e-01 -2.51e-01]
 [ 4.24e-01 -2.30e-01]
 [ 5.47e-01 -2.46e-01]
 [-1.84e-02 -2.63e-01]
 [ 1.16e+00  4.56e+00]
 [-4.55e-01 -2.48e-01]
 [-8.77e-02 -2.50e-01]
 [ 6.46e-01  1.66e-01]
 [-1.82e-01 -2.43e-01]
 [ 4.18e-01 -2.44e-01]
 [-2.05e-01 -1.83e-01]
 [-3.72e-02 -2.62e-01]
 [ 1.97e-01 -1.85e-01]
 [-1.60e-01 -2.59e-01]
 [-2.34e-01  8.85e-02]
 [-4.33e-01 -2.45e-01]
 [ 4.82e-03 -1.13e-01]
 [-4.42e-01 -2.24e-01]
 [-4.25e-01

## Model Evaluation using R squared

In [29]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.235112842946809

## Mean Square Error and Root MSE

In [30]:
from sklearn.metrics import mean_squared_error
import math
print(mean_squared_error(y_test, y_pred))
print(math.sqrt(mean_squared_error(y_test, y_pred)))

0.9418427939920496
0.9704858546068817


## Mean Absolute Error(MAE)

In [31]:
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_test, y_pred))

0.43595378951447294


## Intercept and coefficient

In [32]:
print(regressor.intercept_)

0.4509840082064409


In [33]:
print(regressor.coef_)

[0.01 0.56 0.05 5.05]


## Stochastic Gradient Descent

In [34]:
from sklearn.linear_model import SGDRegressor

sgdr = SGDRegressor()
#print(sgdr)
sgdr.fit(X_train, y_train)


SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=1000,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [35]:
y_pred = sgdr.predict(X_test)

# from sklearn.metrics import r2_score
# r2_score(y_test, y_pred)

score = sgdr.score(X_train, y_train)
print("R-squared:", score)


R-squared: -0.3803084507333192


In [36]:
sgdr.coef_

array([0.05, 0.51, 0.04, 0.43])

In [37]:
sgdr.intercept_

array([-0.56])

In [38]:
np.set_printoptions(precision=2)

print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)), 1))

[[ 1.22e+00 -2.49e-01]
 [ 9.14e-01 -1.69e-01]
 [ 5.15e-01 -2.66e-01]
 [ 4.66e-01 -2.70e-01]
 [-2.75e-02 -2.45e-01]
 [ 1.20e+00 -2.66e-01]
 [ 3.75e-01 -1.51e-01]
 [ 6.40e-01 -2.35e-01]
 [-2.13e-01 -2.02e-01]
 [ 1.39e+00 -2.48e-01]
 [ 1.37e-01 -2.64e-01]
 [ 9.27e-01 -2.39e-01]
 [ 4.02e-01 -2.51e-01]
 [ 1.00e+00 -2.38e-01]
 [ 8.41e-01 -2.62e-01]
 [-3.50e-01  4.22e-03]
 [ 1.17e+00 -1.77e-01]
 [ 3.72e-01 -2.67e-01]
 [ 1.13e+00 -2.53e-01]
 [ 2.04e-01 -2.13e-01]
 [ 9.19e-02 -1.20e-01]
 [ 1.51e+00  1.51e+00]
 [ 1.04e+00 -2.59e-01]
 [-2.24e-01 -1.06e-01]
 [ 5.69e-02 -2.66e-01]
 [ 1.21e-02 -2.51e-01]
 [ 4.92e-01 -2.30e-01]
 [ 1.82e+00 -2.46e-01]
 [ 1.34e+00 -2.63e-01]
 [ 1.48e+00  4.56e+00]
 [ 3.92e-01 -2.48e-01]
 [ 6.55e-01 -2.50e-01]
 [ 1.10e-01  1.66e-01]
 [ 5.11e-01 -2.43e-01]
 [ 8.99e-01 -2.44e-01]
 [ 8.98e-01 -1.83e-01]
 [ 1.11e+00 -2.62e-01]
 [ 2.61e-01 -1.85e-01]
 [ 7.39e-01 -2.59e-01]
 [-4.19e-01  8.85e-02]
 [ 7.22e-01 -2.45e-01]
 [ 4.26e-01 -1.13e-01]
 [ 9.23e-01 -2.24e-01]
 [ 7.68e-01

In [39]:
from sklearn.metrics import mean_squared_error
import math
print(mean_squared_error(y_test, y_pred))
print(math.sqrt(mean_squared_error(y_test, y_pred)))

1.3787614371421009
1.1742067267487872
