# Logistic Regression

Logistic regression is a statistical method for predicting binary outcomes from data.

Examples of this are "yes" vs. "no" or "young" vs. "old". 

These categories are encoded as 0 or 1, and the model estimates the probability that an observation belongs to class 1.

We can calculate the logistic regression by applying an activation function as the final step to our linear model.
It is closely related to linear regression: the log-odds (logit) of the outcome are modeled as a linear function of the features. The sigmoid activation function maps that linear output to a probability between 0 and 1, which is then thresholded (typically at 0.5) to obtain a categorical value.

This converts the linear regression output to a probability.

In [None]:
# Add dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

import seaborn as sns
sns.set()  # apply seaborn's default theme to the matplotlib plots drawn below
# Silence every warning for the rest of the session to keep cell output clean.
# Comment out the following 2 lines to see warnings again (e.g. deprecations).
import warnings
warnings.filterwarnings('ignore')

# Data Understanding EDA

In [None]:
# Load the diabetes dataset and preview its last rows.
csv_path = 'Resources/diabetes.csv'
df = pd.read_csv(csv_path)
df.tail()


In [None]:
# Show the Python type of the loaded object.
type(df)

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Column labels of the frame.
df.columns

In [None]:
# Summary statistics, transposed so that each column of df becomes a row.
df.describe().T

In [None]:
# Distinct BloodPressure values; the preprocessing section later treats a 0
# in this column as a missing reading.
df['BloodPressure'].unique()

In [None]:
# How often each Pregnancies value occurs.
df['Pregnancies'].value_counts()

In [None]:
# Number of missing (NaN) entries per column in the raw data.
df.isna().sum()

In [None]:
# Class balance of the target: absolute count per Outcome value.
df['Outcome'].value_counts()

In [None]:
# Same class balance expressed as fractions that sum to 1.
df['Outcome'].value_counts(normalize=True)


# Data Visualization

In [None]:
# Histogram of the Outcome column (the target used by the models below).
df['Outcome'].hist()

In [None]:
# One histogram per column, drawn together in a single large figure.
df.hist(figsize=(20,15))

In [None]:
# Histogram plus density curve for each feature of the raw data,
# laid out row by row on a 4x2 grid.
fig, ax = plt.subplots(4, 2, figsize=(16, 16))
feature_order = [
    "Age", "Pregnancies",
    "Glucose", "BloodPressure",
    "SkinThickness", "Insulin",
    "DiabetesPedigreeFunction", "BMI",
]
# ax.flat walks the grid in row-major order, matching the list above.
for axis, feature in zip(ax.flat, feature_order):
    sns.distplot(df[feature], bins=20, ax=axis)

In [None]:
# One box per column on shared axes, to compare spread and spot outliers.
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(data=df, ax=ax, width=0.5, fliersize=3)
plt.show()

In [None]:
# The correlation matrix of the data set is computed below to examine what kind of relationship exists between the variables.
# If the correlation value is > 0, there is a positive correlation: as the value of one variable increases, the value of the other variable also increases.
# Correlation = 0 means no (linear) correlation.
# If the correlation is < 0, there is a negative correlation: as one variable increases, the other variable decreases.
# When the correlations are examined, 2 variables stand out as positively correlated with the Outcome dependent variable.
# Glucose is one of them (the second is not named here; check the heatmap below). As these increase, the Outcome variable increases.

In [None]:
# Pairwise correlation between all columns (pandas defaults to Pearson).
df.corr()

In [None]:
# Annotated heatmap of the correlation matrix on a red-yellow-green scale.
correlations = df.corr()
plt.figure(figsize=(12, 10))
p = sns.heatmap(correlations, annot=True, cmap='RdYlGn')

# Data Preprocessing

## Missing Values

Impute missing values in numerical columns with the column "mean",
and in categorical columns with the column "mode".

In [None]:
# Replace 0 with NaN in the columns where this notebook treats a zero as a
# missing reading, so those entries can be counted and imputed below.
import numpy as np

zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df_copy = df.copy()  # work on a copy so the raw frame stays untouched
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
df_copy[zero_as_missing] = df_copy[zero_as_missing].replace(0, np.nan)
df_copy.head()

In [None]:
# Count the NaNs per column now that zeros have been marked as missing.
print(df_copy.isna().sum())

In [None]:
#pip install missingno

In [None]:
# missingno draws quick visual summaries of missing data.
import missingno as msno
# Bar chart of per-column completeness for df_copy; the trailing semicolon
# keeps the notebook from echoing the returned object.
msno.bar(df_copy);

In [None]:
# Fill the NaNs in the zero-as-missing columns with each column's mean.
# The result is assigned back instead of calling
# `df_copy[col].fillna(..., inplace=True)`: that form modifies a temporary
# column selection, which pandas 2.x warns about (the warning is hidden by the
# global filter in the setup cell) and which no longer updates df_copy once
# Copy-on-Write is enabled.
imputed_columns = ['Insulin', 'Glucose', 'BloodPressure', 'SkinThickness', 'BMI']
column_means = df_copy[imputed_columns].mean()
# fillna with a Series fills each column with the value under its own label.
df_copy[imputed_columns] = df_copy[imputed_columns].fillna(column_means)

In [None]:
# Preview the first rows after imputation.
df_copy.head()

In [None]:
# Redraw the completeness chart to compare against the one drawn before imputation.
msno.bar(df_copy);

In [None]:
# Remaining NaN count per column after imputation.
df_copy.isna().sum()

In [None]:
# Histogram plus density curve for each feature of the imputed data,
# laid out row by row on a 4x2 grid (same layout as the raw-data plots).
fig, ax = plt.subplots(4, 2, figsize=(16, 16))
feature_order = [
    "Age", "Pregnancies",
    "Glucose", "BloodPressure",
    "SkinThickness", "Insulin",
    "DiabetesPedigreeFunction", "BMI",
]
# ax.flat walks the grid in row-major order, matching the list above.
for axis, feature in zip(ax.flat, feature_order):
    sns.distplot(df_copy[feature], bins=20, ax=axis)

## Outliers

In [None]:
# fig, ax = plt.subplots(figsize = (15, 10))
# sns.boxplot(data = df_copy, width = 0.5, ax = ax, fliersize = 3)
# plt.show()

In [None]:
# outLier_df= df_copy.copy()
# Q1 = outLier_df.Insulin.quantile(0.25)
# Q3 = outLier_df.Insulin.quantile(0.75)
# IQR = Q3-Q1
# print(round(IQR,2))


In [None]:
# lower = Q1 - 1.5*IQR
# upper = Q3 + 1.5*IQR
# print(round(lower,2))
# print(round(upper,2))

In [None]:

# outLier_df= outLier_df[~((outLier_df['Insulin']< lower) |(outLier_df['Insulin']> upper))]

In [None]:
# import seaborn as sns
# sns.boxplot(x = outLier_df["Insulin"]);

# Machine Learning Models

<!-- 1 - Need to identify X and y
2 - Split our data using the Train-Test-Split Method (80% vs 20%)
3 - Import the model (library)
4 - Initiate the model engine - by simply calling the model/function
5 - Train the model on the 80% (X and y) using fit() method
6 - Test the model on the 20% (X only) using the predict() method
7 - We compare the Predicted outcome with the Actual outcome
8 - We make the decision -->

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Import a Random Forests classifier
from sklearn.ensemble import RandomForestClassifier
# Import an Extremely Random Trees classifier
from sklearn.ensemble import ExtraTreesClassifier

# X, y 

In [None]:
# Switch to the imputed frame and build the feature matrix X from every
# column except the "Outcome" target.
df = df_copy.copy()
X = df.drop(columns="Outcome")
X.head()

In [None]:
# Set the y variable to the "Outcome" column (the label the models predict).
y = df["Outcome"]

In [None]:
# Display names passed to classification_report below.
# Presumably Outcome 0 -> "negative" and 1 -> "positive"; verify the coding.
target_names = ["negative", "positive"]

## Split our data into training and testing data

In [None]:

# Hold out 20% of the rows for testing, matching the 80/20 split described in
# the workflow notes of this section. Without test_size, train_test_split
# falls back to a 25% test share. random_state pins the shuffle so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Dummy Classifier

## Feature scaling

In [None]:
# Imported here because no earlier cell brings StandardScaler into scope.
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same transform
# to both splits so no test-set statistics leak into the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Logistic Regression

In [None]:
# Logistic regression on the scaled features: train, predict on the test
# split, then report mean accuracy on both splits.
classifier = LogisticRegression(max_iter=10000).fit(X_train_scaled, y_train)
y_pred = classifier.predict(X_test_scaled)
train_score = classifier.score(X_train_scaled, y_train)
test_score = classifier.score(X_test_scaled, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

## Random Forests

In [None]:
# Random forest: train on the scaled features, print the per-class report for
# the test split, then the mean accuracy on both splits.
clf = RandomForestClassifier(random_state=1).fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
report = classification_report(y_test, y_pred, target_names=target_names)
train_score = clf.score(X_train_scaled, y_train)
test_score = clf.score(X_test_scaled, y_test)
print(report)
print(f'Training Score: {train_score}')
print(f'Testing Score: {test_score}')

## ExtraTreesClassifier

In [None]:
# Extremely randomized trees: same train / report / score sequence as the
# other models, on the scaled features.
clf = ExtraTreesClassifier(random_state=1).fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
report = classification_report(y_test, y_pred, target_names=target_names)
train_score = clf.score(X_train_scaled, y_train)
test_score = clf.score(X_test_scaled, y_test)
print(report)
print(f'Training Score: {train_score}')
print(f'Testing Score: {test_score}')

## Adaptive Boosting (AdaBoost) classifier

In [None]:
# Imported here because no earlier cell brings AdaBoostClassifier into scope.
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost: train on the scaled features.
clf = AdaBoostClassifier(random_state=1).fit(X_train_scaled, y_train)
# Predict with this model before reporting; otherwise the report would be
# built from whatever y_pred an earlier cell left behind.
y_pred = clf.predict(X_test_scaled)
print(classification_report(y_test, y_pred, target_names=target_names))
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

## K Neighbors Classifier (KNN)

In [None]:
# NOTE(review): this cell sits under the KNN heading but fits a
# LogisticRegression on the *unscaled* features. It also rebinds `classifier`,
# which the confusion-matrix cells further down evaluate on X_test, so confirm
# the intent before swapping in a different model.
classifier = LogisticRegression(max_iter=10000).fit(X_train, y_train)
train_score = classifier.score(X_train, y_train)
test_score = classifier.score(X_test, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

## Support Vector Classifier

## Decision Tree Classifier

# confusion_matrix

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix of `classifier` on the unscaled test features:
# rows are the true labels, columns the predicted labels.
y_true, y_pred = y_test, classifier.predict(X_test)
confusion_matrix(y_true, y_pred)

In [None]:
# Recompute accuracy by hand from the four cells of the binary confusion
# matrix: correct predictions divided by all predictions.
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
correct = tp + tn
total = tp + fp + tn + fn
accuracy = correct / total
print(f"Accuracy: {accuracy}")

In [None]:
# Per-class precision, recall and F1 for the same predictions. No target_names
# are passed here, so the classes appear under their raw label values.
print(classification_report(y_true, y_pred))