# Multivariate Linear Regression

* Linear Regression with more than one feature 

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import datasets

# Dataset reference:
# https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt

# Load scikit-learn's bundled copy of the diabetes dataset
diabetes = datasets.load_diabetes()

# One DataFrame with every feature column plus the target, which is stored
# under the column name 'progression'
diabetes_df = pd.DataFrame(
    diabetes.data, columns=diabetes.feature_names
).assign(progression=diabetes.target)

# Input variables (X): the three columns age, bp and bmi
X = diabetes_df[['age', 'bp', 'bmi']].copy()

# Output variable (Y): the progression column
Y = diabetes_df['progression']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=5
)

# Create the linear regression model and fit it on the training rows
# (fit returns the fitted estimator itself)
lr_model = linear_model.LinearRegression().fit(X_train, Y_train)

# Predict the target for the held-out test rows
Y_pred = lr_model.predict(X_test)


In [None]:
# Fitted coefficient for each input feature
print('Coefficients: \n', lr_model.coef_)


# Fitted intercept term
print('Intercept', lr_model.intercept_)
# Root mean squared error between the actual and predicted test targets
print("Root Mean squared error: {:.2f}".format(
    np.sqrt(mean_squared_error(Y_test, Y_pred))))
# R^2 (coefficient of determination): 1 is perfect prediction
print('Variance score: {:.2f}'.format(r2_score(Y_test, Y_pred)))

# The model's own score() evaluated on the test split
print('Score : ', lr_model.score(X_test, Y_test))

In [None]:
# Table comparing actual vs. predicted values for the test rows.
# reset_index moves the original row labels into their own column and
# renumbers the rows from 0, so they line up positionally with Y_pred.
compare_df = Y_test.reset_index(name='Actual Value')

# Y_pred is in the same row order as Y_test, so it can be attached directly
compare_df['Predicted Value'] = Y_pred

print(compare_df)

# Logistic Regression

* Regression technique to predict categorical variables ( Classification )
* Example : Predict if an email is spam or not
* Logistic regression outputs a value between 0 to 1
* This is done using a sigmoid function
* Sigmoid function is applied to the linear function
* Linear Regression : Y = B0 + B1X
* P = sigmoid(Y), the predicted probability of the positive class

* $S(z) = \dfrac{1}{1 + e^{-z}}$

* s(z) = output between 0 and 1 (probability estimate)
* z = input to the function (your algorithm’s prediction e.g. mx + b)
* e = base of natural log

### Sigmoid Function
![Sigmoid Function](https://raw.githubusercontent.com/soulzcore/iacc_python_ML_2019/master/week5/img/sigmoid.png)


------------------

### Linear Vs Logistic Regression

![Linear vs Logistic Regression](https://raw.githubusercontent.com/soulzcore/iacc_python_ML_2019/master/week5/img/linvslogreg.png)

* Cost Function : Cross Entropy or Log Loss is the cost function generally applied
* Minimize Cost Function : Gradient Descent is used to minimize the cost function

## Types of Logistic Regression
1. Binary Logistic Regression
The categorical response has only two possible outcomes. Example: Spam or Not Spam
2. Multinomial Logistic Regression
Three or more categories without ordering. Example: Predicting which food is preferred more (Veg, Non-Veg, Vegan)
3. Ordinal Logistic Regression
Three or more categories with ordering. Example: Movie rating from 1 to 5


## Evaluating the performance Logistic Regression model


### True Positives (TP): Cases in which we predicted the positive class and the actual class is positive (predicted spam for a spam email).
### True Negatives (TN): Cases in which we predicted the negative class and the actual class is negative (predicted not spam for a non-spam email).
### False Positives (FP): Predicted spam for an email that is not spam
### False Negatives (FN): Predicted not spam for an email that is spam

## Confusion Matrix

* A Confusion Matrix is a table used to visualize the performance of a classification algorithm
* In a confusion matrix, TP, TN, FP, FN are displayed in a grid
* Confusion matrix is used to calculate other metrics such as Accuracy, Precision, recall, F1-Score etc..

![Confusion Matrix](https://raw.githubusercontent.com/soulzcore/iacc_python_ML_2019/master/week5/img/cm.jpg)



## Precision

* Of all the cases predicted as positive, the fraction that are actually positive: Precision = TP / (TP + FP)

# Recall/Sensitivity

* Of all the actual positive cases, the fraction that were correctly predicted as positive: Recall = TP / (TP + FN)

# F1 Score

* Harmonic mean of precision and recall: F1 = 2 * (Precision * Recall) / (Precision + Recall)


# Example: Predict gender of a person based on their diabetes chart using Logistic Regression

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report,roc_curve,auc,log_loss
from sklearn import datasets

# Dataset reference:
# https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt

# Load the diabetes dataset bundled with scikit-learn
diabetes = datasets.load_diabetes()

# Create a pandas DataFrame from the feature matrix
diabetes_df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
print(diabetes_df.head())

# Add the target variable under the column name 'progression'
diabetes_df['progression'] = diabetes.target
print(diabetes_df.head())

# Input variables (X): age, bp, bmi and progression
X = pd.DataFrame(diabetes_df[['age','bp','bmi','progression']])

# Encode the 'sex' column as a binary label: 1 where the value is positive,
# otherwise 0.
# NOTE(review): which of 0/1 corresponds to male or female is not shown
# here -- confirm against the dataset description.
diabetes_df['sex'] = diabetes_df['sex'].apply(lambda x: 1 if x > 0 else 0)

# Y is the output variable (the binary sex label)
Y = diabetes_df['sex']

# Split the data into train and test sets (20% held out, fixed seed)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=5)

# Create the Logistic Regression object
lr = LogisticRegression()

# Train (fit) the model on the training rows
lr.fit(X_train,Y_train)

# Generate predictions for the test set. The original cell never assigned
# `pred`, so the confusion matrix below either raised NameError on a fresh
# kernel or silently reported predictions left over from another model's cell.
pred = lr.predict(X_test)

# Mean accuracy on the test set
print('Score :', lr.score(X_test,Y_test))

# Confusion matrix and per-class precision/recall/F1 for this model
print('Confusion Matrix :', confusion_matrix(Y_test,pred))
print(classification_report(Y_test,pred))


# Algorithms Chart(Scikit-Learn)

![Available Algorithms](https://raw.githubusercontent.com/soulzcore/iacc_python_ML_2019/master/week5/img/sklearnalg.png)


------------------

# Support Vector Machines

* Can be used for classification and regression
* Mainly used for Classification
* Goal is to find the optimal plane or hyperplane that separates distinct classes
* If the number of input features is 2, then the hyperplane is just a line. If the number of input features is 3, then the hyperplane becomes a two-dimensional plane


![SVM](https://raw.githubusercontent.com/soulzcore/iacc_python_ML_2019/master/week5/img/svm1.png)
 
* Data points on each side of the hyperplane belong to a specific class
 
* Support vectors are data points that are closest to the Hyperplane
 
* Hyperplanes are built based on the positioning of the support vectors
 
* In the SVM algorithm, we are looking to maximize the margin between the data points and the hyperplane. The loss function that helps maximize the margin is hinge loss.
 
 
 ![Non Linear](https://raw.githubusercontent.com/soulzcore/iacc_python_ML_2019/master/week5/img/svmhighdim.png)

* For non-linear data, transformations are used to cast the dataset into a higher-dimensional space. These transformations are called kernels.
 
* Cost Function : Hinge loss
 
* Loss Minimization : Gradient Descent or Stochastic Gradient Descent
 

# Example: Predict gender of a person based on their diabetes chart using SVM
 

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,roc_curve,auc,log_loss
from sklearn.svm import SVC
from sklearn import datasets

# Dataset reference:
# https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt

# Load the diabetes dataset bundled with scikit-learn
diabetes = datasets.load_diabetes()

# Build the working DataFrame in one step:
#   - 'progression' holds the dataset's target values
#   - 'sex' is re-encoded as a binary label (1 where positive, otherwise 0)
diabetes_df = pd.DataFrame(
    diabetes.data, columns=diabetes.feature_names
).assign(
    progression=diabetes.target,
    sex=lambda df: (df['sex'] > 0).astype('int64'),
)

# Input variables (X): age, bp, bmi and progression
X = diabetes_df[['age', 'bp', 'bmi', 'progression']].copy()

print(diabetes_df.head())

# Output variable (Y): the binary sex label
Y = diabetes_df['sex']

print(diabetes_df.head())

# Hold out 20% of the rows for testing, with a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=5
)

# Create the SVM classifier with default settings and fit it on the
# training rows (fit returns the fitted estimator)
svm_model = SVC().fit(X_train, Y_train)

# Predictions for the test rows
pred = svm_model.predict(X_test)

# Accuracy, confusion matrix and per-class report on the test set
print('Score :', svm_model.score(X_test,Y_test))
print('Confusion Matrix :', confusion_matrix(Y_test,pred))
print(classification_report(Y_test,pred))


# Decision Trees

* Used for both Classification and Regression
* Supervised learning Algorithm
* Goal is to algorithmically split data based on different conditions 
* Widely used
* Easy to interpret and explain

![Decision Trees](https://raw.githubusercontent.com/soulzcore/iacc_python_ML_2019/master/week5/img/dt1.png)
* Learns rules based on input features
* The deeper the tree the more complex the rules become
* Consists of nodes, edges and leaves 
* Node - Represents a test or condition
* Edge - Represents the outcome of the test
* Leaf - Represents the class or decision
* Can be applied to non-linear data 
* A general algorithm for a decision tree can be described as follows:

* Step 1 : Pick the best attribute/feature. The best attribute is one which best splits or separates the data.
* Generate the split condition
* Apply Step 1 to the resulting dataset
* Continue until the decision is reached

## How do decision trees decide on the splits?

* Decision Tree algorithms decide how to split the data based on a certain criterion
* The two common criteria are : Information Gain (Entropy) or Gini index

![Gini Index](https://raw.githubusercontent.com/soulzcore/iacc_python_ML_2019/master/week5/img/dtgini.png)

### Pruning

* Pruning is used to avoid overfitting
* Pruning is the technique of reducing the depth of the tree so that it doesn't become too complex, by eliminating nodes which add little value
* Requires manually working with different depths 


In [None]:
# Upgrade scikit-learn to the newest release pip can find (-U is --upgrade).
# NOTE(review): presumably needed because the decision-tree cell imports
# `plot_tree` from sklearn.tree -- confirm the minimum version required.
!pip install -U scikit-learn

# Example: Predict gender of a person based on their diabetes chart using Decision Tree

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import tree
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import export_graphviz, plot_tree


# Dataset reference:
# https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt

# Load the diabetes dataset bundled with scikit-learn
diabetes = datasets.load_diabetes()

# Feature columns plus the target, stored under the name 'progression'
diabetes_df = pd.DataFrame(
    diabetes.data, columns=diabetes.feature_names
).assign(progression=diabetes.target)

# Re-encode 'sex' as a binary label: 1 where the value is positive, else 0
diabetes_df['sex'] = (diabetes_df['sex'] > 0).astype('int64')

# Input variables (X): age, bp, bmi and progression
X = diabetes_df[['age', 'bp', 'bmi', 'progression']].copy()

# Output variable (Y): the binary sex label
Y = diabetes_df['sex']

# Hold out 20% of the rows for testing, with a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=5
)

# Create the decision tree classifier with default settings and fit it on
# the training rows (fit returns the fitted estimator)
dt_model = tree.DecisionTreeClassifier().fit(X_train, Y_train)

# Predictions for the test rows
pred = dt_model.predict(X_test)

# Mean accuracy on the test set
print(dt_model.score(X_test,Y_test))


# Draw the fitted tree with filled (coloured) nodes
plt.figure()
tree.plot_tree(dt_model, filled=True)
plt.show()



# Random Forests

* Random Forests is an algorithm which considers multiple decision trees before making the final decision
* The collection of trees used for prediction is called a forest
* Each decision tree in the forest is built using a random subset of features, that's why they are called random forests
* For Regression problems, when it comes time to make a prediction, the random forest takes an average of all the individual decision tree estimates
* For Classification, the random forest takes a majority vote for the predicted class


![Random Forests](https://raw.githubusercontent.com/soulzcore/iacc_python_ML_2019/master/week5/img/randomforests.jpeg)

# Example: Predict gender of a person based on their diabetes chart using Random Forests

In [None]:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import tree
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


# Dataset reference:
# https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt

# Load the diabetes dataset bundled with scikit-learn
diabetes = datasets.load_diabetes()

# Feature columns plus the target ('progression'), with 'sex' re-encoded as
# a binary label (1 where the value is positive, otherwise 0)
diabetes_df = pd.DataFrame(
    diabetes.data, columns=diabetes.feature_names
).assign(
    progression=diabetes.target,
    sex=lambda df: (df['sex'] > 0).astype('int64'),
)

# Input variables (X): age, bp, bmi and progression
X = diabetes_df[['age', 'bp', 'bmi', 'progression']].copy()

# Output variable (Y): the binary sex label
Y = diabetes_df['sex']

# Hold out 20% of the rows for testing, with a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=5
)

# Random forest of 100 trees, each limited to a depth of 5, fitted on the
# training rows (fit returns the fitted estimator)
rf_model = RandomForestClassifier(
    n_estimators=100, max_depth=5
).fit(X_train, Y_train)

# Predictions for the test rows
pred = rf_model.predict(X_test)

# Mean accuracy on the test set
print(rf_model.score(X_test,Y_test))


# K Nearest Neighbors

* Can be used for Classification and Regression
* Mostly used for Classification
* The algorithm makes the assumption that data points close to each other are similar
* In simple terms, the algorithm calculates the k nearest neighbours and gets the most common class for classification or the average for regression
* High Level steps followed by the algorithm:
   *  Iterate from 1 to total number of training data points
   * Calculate the distance between test data and each row of training data. Euclidean distance is the most popular method. The other metrics that can be used are Chebyshev, cosine, etc.
   * Sort the calculated distances in ascending order based on distance values
   * Get top k rows from the sorted array
   * Get the most frequent class of these rows
   * Return the predicted class


![KNN](https://raw.githubusercontent.com/soulzcore/iacc_python_ML_2019/master/week5/img/knn.webp)



# Example: Predict gender of a person based on their diabetes chart using KNN

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


# Dataset reference:
# https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt

# Load the diabetes dataset bundled with scikit-learn
diabetes = datasets.load_diabetes()

# Feature columns plus the target, stored under the name 'progression'
diabetes_df = pd.DataFrame(
    diabetes.data, columns=diabetes.feature_names
).assign(progression=diabetes.target)

# Input variables (X): age, bp, bmi and progression
X = diabetes_df[['age', 'bp', 'bmi', 'progression']].copy()

# Re-encode 'sex' as a binary label: 1 where the value is positive, else 0
diabetes_df['sex'] = (diabetes_df['sex'] > 0).astype('int64')

# Output variable (Y): the binary sex label
Y = diabetes_df['sex']

# Hold out 20% of the rows for testing, with a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=5
)

# KNN classifier with k = 3 neighbours, fitted on the training rows
# (fit returns the fitted estimator)
knn_model = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)

# Predictions for the test rows
pred = knn_model.predict(X_test)

# Mean accuracy on the test set
print(knn_model.score(X_test,Y_test))

# K - Means

* Un-Supervised Learning technique
* Used for clustering data into different groups or clusters
* Example : Recommendation Engines
* The center of the cluster is called centroid
* K means divides the data into k clusters
* The goal of K-Means algorithm is to minimize the sum of distances between the points and their respective cluster centroid.

* Clustering process
* Step 1 : Select the number of clusters to find in the data (k)
* Step 2 : Select k random data points as the initial centroids
* Step 3 : Assign every point to the cluster closest to it
* Step 4 : Calculate the centroid of the clusters that are now created
* Repeat Step 3 and 4 until one of the following conditions is met 
    * The centroids do not change
    * The points stay in the same clusters
    * The max number of iterations is reached



![KMeans](https://raw.githubusercontent.com/soulzcore/iacc_python_ML_2019/master/week5/img/kmeans.gif)



# Example: Find 2 Clusters in the diabetes data


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

# Dataset reference:
# https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt

# Load the diabetes dataset bundled with scikit-learn
diabetes = datasets.load_diabetes()

# Create a pandas DataFrame from the feature matrix
diabetes_df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)

# Add the target variable under the column name 'progression'
diabetes_df['progression'] = diabetes.target


# Cluster on two columns only: age and progression
X = pd.DataFrame(diabetes_df[['age','progression']])

# Create the KMeans object with k = 2; random_state fixes the initialisation
km_model = KMeans(n_clusters=2, random_state=0)

# Fit the model (unsupervised: no target is passed)
km_model.fit(X)

# Print the cluster centroids
print(km_model.cluster_centers_)

# Plot the points coloured by assigned cluster. The figure label is derived
# from the model so it always matches the fitted k (it previously said
# "3 clusters" although the model is fitted with 2).
plt.figure('K-means with %d clusters' % km_model.n_clusters)
plt.scatter(X.values[:, 0], X.values[:, 1], c=km_model.labels_)
plt.show()


# Other Algorithms