## Day20 - Implementation of Multivariate Regression using LSS and scikit-learn

##### Self Learning Resource
 - Machine Learning - Multiple Linear Regression by Dr. Parteek Bhatia
 <a href="https://www.youtube.com/watch?v=GKHkbyo9S3k&t=43s"> Video</a>
 - Implementation of Multiple Linear Regression by Dr. Parteek Bhatia 
<a href="https://www.youtube.com/watch?v=5uW-hSxi9pU"> Video1</a>
<a href="https://www.youtube.com/watch?v=0zMTEUB_4qU"> Video2</a>
<a href="https://www.youtube.com/watch?v=d5RIuw9MA-U"> Video3</a>
<a href="https://www.youtube.com/watch?v=YlOmQ6T1guU"> Video4</a>
<a href="https://www.youtube.com/watch?v=tE90_VCNTgI"> Video5</a>


#  <span style='color:Red'>1.  Generate random set of weights (Chromosome)</span>

### 1.1 Generate random Chromosome

In [None]:
# Generate 10 chromosomes, each holding 5 random weights

import random
import pandas as pd

# Print a handful of random chromosomes. Every gene is a weight drawn
# uniformly from [-1, 1] and rounded to two decimal places.
n_chromosomes, n_genes = 10, 5

for i in range(n_chromosomes):
    chromosome = [round(random.uniform(-1, 1), 2) for _gene in range(n_genes)]
    print(chromosome)

### 1.2 Dot product

In [None]:
# Two equal-length Series; Series.dot multiplies them element-wise and sums.
x = pd.Series([1, 2, 3, 4])

y = pd.Series([5, 4, 3, 2])
# A plain list is accepted as the argument too: y = [5, 4, 3, 2]

# 1*5 + 2*4 + 3*3 + 4*2 => 5 + 8 + 9 + 8 = 30
x.dot(y)


#  <span style='color:Red'>2. Multivariate regression using Least Sum of Square (LSS)</span>

### 2.1 Import library

In [None]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### 2.2 Read the dataset

In [None]:
# Load the dataset into a DataFrame (path is relative to the working directory).
df = pd.read_csv('data20.csv')
# Preview the first rows.
df.head()

### 2.3 Get the dimension of the dataset

In [None]:
# (number of rows, number of columns)
df.shape

### 2.4 Divide dataset into train and test

In [None]:
# Ordered 70/30 split: the first 70% of the rows become the training set and
# the remaining rows the test set. Rows are not shuffled.
n_rows = df.shape[0]
p = int(n_rows * 0.7)

# Positional slicing excludes the stop position, so the two parts never overlap.
# NOTE: df.loc[:p] is label-based and *includes* label p; with the default
# integer index it would put row p in both sets, so it is not an exact
# substitute for df.iloc[:p].
df_train, df_test = df.iloc[:p], df.iloc[p:]

print("Train Data\n\n", df_train)
print("\n\nTest Data\n\n", df_test)

### 2.5 Parameter Setting

In [None]:
# Settings for the random search over weight vectors.
iterations      = 500                  # Number of random chromosomes to evaluate
bestFitness     = 99999999             # Best (lowest) fitness found so far; starts at a large sentinel
bestChromosome  = []                   # Weights (chromosome) that produced bestFitness
resultFileName  ="result-Random-weights-lss.csv"   # CSV file that receives the best fitness during the search
total           = df_train.shape[0]          # Number of training records (rows)
D               = df_train.shape[1] - 1      # Problem dimension: all columns except the first (the target)
LB              = -1                  # Lower bound of each random weight
UB              = 1     # Upper bound of each random weight

### 2.6 Fitness Function (LSS)

In [None]:
def FitnessFunction(x):       # x = chromosome (set of weights)
    """Return the total absolute prediction error of weights ``x`` on ``df_train``.

    Every training row is predicted as the dot product of its feature columns
    (all columns after the first) with ``x``. The fitness is the sum over all
    rows of ``|target - prediction|``, where the target is the first column.
    Lower values mean a better fit.

    Note: the error is accumulated with absolute values, not squares, even
    though the surrounding section is titled "Least Sum of Square".

    Parameters
    ----------
    x : sequence of numbers
        One weight per feature column, i.e. ``df_train.shape[1] - 1`` values.

    Returns
    -------
    The summed absolute error over every row of ``df_train``, rounded to
    two decimal places.
    """
    target = df_train.iloc[:, 0].to_numpy()
    # One matrix-vector product replaces the per-row Python loop, and reading
    # the rows straight from df_train avoids depending on a separately
    # maintained row count.
    predicted = df_train.iloc[:, 1:].dot(x).to_numpy()
    return round(abs(target - predicted).sum(), 2)

##### Example

In [None]:
# Worked example: apply one hand-picked weight vector to a single training row.
i = 0
# Six weights — assumes the dataset has six feature columns (TODO confirm against data20.csv).
w = [0.42, 0.0, 0.7, 0.98, 0.98, -0.27]

# Whole row, then its first column (C), then the remaining columns (F).
print("Row =", list(df_train.iloc[i].values))
print("\nC = ", df_train.iloc[i,0])
print("\nF = ", list(df_train.iloc[i,1:].values))
print("\nx = ", w)

# Dot product of the row's feature columns with w (the value compared against C).
df_train.iloc[i,1:].dot(w)


### 2.7 Start main program

In [None]:
# Random search: evaluate `iterations` random weight vectors and keep the one
# with the lowest fitness. The best fitness and weights are logged to
# `resultFileName` as CSV on every 10th iteration and once more at the end.


def _result_row(iteration, fitness, weights):
    """Format one CSV line: iteration, fitness, then one field per weight."""
    return ",".join(str(v) for v in [iteration, fitness] + list(weights)) + "\n"


# `with` guarantees the result file is closed even if an iteration raises.
with open(resultFileName, "w") as fp:
    # Build the header from D so it always matches the number of weights per row.
    fp.write(",".join(["Iteration", "Fitness"] + ["w" + str(k + 1) for k in range(D)]) + "\n")

    lastLogged = None          # iteration number of the most recent log line

    # Running till number of iterations
    for i in range(iterations):

        # Generate Chromosome (random weights)
        chromosome = [round(random.uniform(LB, UB), 2) for j in range(D)]

        # Calculate the fitness of the chromosome
        fitness = FitnessFunction(chromosome)

        # Lower fitness is better: remember the best candidate seen so far
        if fitness < bestFitness:
            bestFitness = fitness
            bestChromosome = chromosome

        if i % 10 == 0:
            print("I:", i, "\t Fitness:", bestFitness)
            # Each weight is written as its own field (not as a "[...]" list repr)
            fp.write(_result_row(i, bestFitness, bestChromosome))
            lastLogged = i

    # Log the final state once; skip it when the last iteration was already
    # logged or when no iteration ran at all.
    if iterations > 0 and lastLogged != i:
        print("I:", i, "\t Fitness:", bestFitness)
        fp.write(_result_row(i, bestFitness, bestChromosome))

print("Done")
print("\nBestFitness:", bestFitness)
print("Best chromosome:", bestChromosome)

### 2.8 Plot and save convergence graph

In [None]:
# Load the logged results back as a DataFrame.
# NOTE: `fp` was the output file handle earlier; from here on it names this DataFrame.
fp = pd.read_csv(resultFileName)
# Show the last logged rows.
fp.tail()

In [None]:
# Convergence curve: logged iteration (column 0) against best fitness (column 1).
x, y = fp.iloc[:, 0], fp.iloc[:, 1]

plt.plot(x, y)
plt.title('Convergence Graph')
plt.xlabel('Iteration')
plt.ylabel('Fitness Value')

# Write the figure to disk, then display it.
plt.savefig("result-convergence-plot-lss.jpg", dpi=300)
plt.show()

### 2.9 Make prediction on Test dataset

##### Prediction

In [None]:
# Predict every test row as the dot product of its feature columns
# (all columns after the first) with the best weights found; no intercept term is added.
predicted = df_test.iloc[:,1:].dot(bestChromosome)
print(predicted.shape)
predicted.head()

##### Extract actual

In [None]:
# Ground-truth values: the first column of the test set.
actual = df_test.iloc[:,0]
print(actual.shape)
actual.head()

### 2.10 Performance Metrics

In [None]:
from sklearn import metrics

# Compute each score once, then report them together.
mae = metrics.mean_absolute_error(actual, predicted)
mse = metrics.mean_squared_error(actual, predicted)
rmse = np.sqrt(mse)                            # RMSE is the square root of the MSE
corr = np.corrcoef(actual, predicted)[1, 0]    # off-diagonal entry of the 2x2 correlation matrix

print("Mean Absolute Error     -->", mae)
print("Mean Squared Error      -->", mse)
print("Root Mean Squared Error -->", rmse)
print("Correlation             -->", corr)

### 2.11 Save actual/predicted to file

In [None]:
# Put the actual and predicted Series side by side as two columns.
df_actual_predicted = pd.concat([actual, predicted], axis=1)

# Write the comparison table to CSV.
df_actual_predicted.to_csv("result-actual-predicted-lss.csv")

### 2.12 Plot and save scatter plot

In [None]:
# Scatter of the actual test values (x-axis) against the predictions (y-axis).
plt.scatter(actual, predicted)
# Axis labels follow the argument order of plt.scatter(x, y):
# `actual` is plotted on x and `predicted` on y.
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual Vs Predicted')
plt.savefig("result-scatter-plot-lss.jpg", dpi=300)
plt.show()

#  <span style='color:Red'>3. Multivariate regression using scikit-learn</span>

### 3.1 Import library

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### 3.2 Read the dataset

In [None]:
# Load the dataset into a DataFrame (path is relative to the working directory).
df = pd.read_csv('data20.csv')
# Preview the first rows.
df.head()

### 3.3 Get the dimension of the dataset

In [None]:
# (number of rows, number of columns)
df.shape

##### To Install package in notebook

In [None]:
# !pip install -U scikit-learn

In [None]:
# !conda install scikit-learn

### 3.4 Divide dataset into train and test

##### Split dataset into dependent (y) and independent (X) variables

In [None]:
# Independent variables: every column after the first, as a NumPy array.
X = df.iloc[:,1:].values
# Dependent variable: the first column, as a NumPy array.
y = df.iloc[:,0].values

print("X.shape ->", X.shape)
print("y.shape ->", y.shape)

##### Split dataset into train and test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# Report the size of each part.
print("X_train.shape ->", X_train.shape)
print("X_test.shape  ->", X_test.shape)
print("y_train.shape ->", y_train.shape)
print("y_test.shape  ->", y_test.shape)


### 3.5 Train the linear model

In [None]:
from sklearn.linear_model import LinearRegression

# Create the estimator first, then fit it on the training split.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Display the fitted estimator as the cell output.
regressor

##### Get the coefficient (weights) of the regressor

In [None]:
# Fitted weights of the linear model.
# NOTE(review): the intercept is stored separately in regressor.intercept_ and is not shown here.
regressor.coef_

### 3.6 Test the model

In [None]:
# Predict the target for the held-out test rows.
y_predict = regressor.predict(X_test)

### 3.7 Performance Metrics

In [None]:
from sklearn import metrics

# Compute each score once, then report them together.
mae = metrics.mean_absolute_error(y_test, y_predict)
mse = metrics.mean_squared_error(y_test, y_predict)
rmse = np.sqrt(mse)                            # RMSE is the square root of the MSE
corr = np.corrcoef(y_test, y_predict)[1, 0]    # off-diagonal entry of the 2x2 correlation matrix

print("Mean Absolute Error     -->", mae)
print("Mean Squared Error      -->", mse)
print("Root Mean Squared Error -->", rmse)
print("Correlation             -->", corr)

### 3.8 Plot and save scatter plot

In [None]:
# Scatter of the actual test values (x-axis) against the predictions (y-axis).
plt.scatter(y_test, y_predict)
# Axis labels follow the argument order of plt.scatter(x, y):
# `y_test` is plotted on x and `y_predict` on y.
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual Vs Predicted')
plt.savefig("result-scatter-plot.jpg", dpi=300)
plt.show()

### 3.9 Save actual/predicted to file

In [None]:
# Wrap the two NumPy arrays as Series and put them side by side:
# first column = actual test values, second column = predictions.
df_actual_predicted = pd.concat([pd.Series(y_test), pd.Series(y_predict)], axis=1)
print(df_actual_predicted.head())
# Write the comparison table to CSV.
df_actual_predicted.to_csv("result-actual-predicted.csv")