<a href="https://colab.research.google.com/github/SankaraSubramanian94/SankaraSubramanian94/blob/main/Logistic_Regression_Lab2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 1. Import Pandas, Pyplot and Read data

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import seaborn as sns

%matplotlib inline

pd.set_option('display.max_colwidth', None)

sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

# CSV is first read in from a github raw file another option is to import the notebook to your session storage by click on the file icon on left toolbar then importing csv
! wget https://raw.githubusercontent.com/DLPY/Classification_Session_2/main/Student2020.csv

In [None]:
# Load the downloaded CSV from the working directory into a pandas DataFrame
df = pd.read_csv('Student2020.csv')

### 2. Investigate and explore the data

In [None]:
# Preview the first five rows (five is head()'s default row count)
df.head()

In [None]:
# Check column dtypes, non-null counts and the overall row count
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) to gauge the
# variability of the numeric columns
df.describe()

In [None]:
# Class balance of the target column.
# The counts are printed explicitly: a bare expression that is not the last
# statement of a cell is evaluated but never displayed.
print(df.Pass.value_counts())

# Bar chart of the same counts
sns.countplot(x='Pass', data=df, palette='hls')
plt.show()

In [None]:
# Share of each outcome among the rows labelled 'Fail' or 'Pass'
Fail = int((df['Pass'] == 'Fail').sum())
Pass = int((df['Pass'] == 'Pass').sum())
labelled_total = Fail + Pass

pct_of_fail = Fail / labelled_total
pct_of_pass = Pass / labelled_total

print("percentage of fail is", pct_of_fail*100)
print("percentage of pass", pct_of_pass*100)

In [None]:
# Visualise missing values: the boolean null-mask of the frame is drawn as a
# heatmap (colour bar hidden), so any missing entries stand out
null_mask = df.isnull()
sns.heatmap(null_mask, cbar=False)

### 3. Split the data to prepare training and testing sets

In [None]:
# Independent variables: every column except the target
X = df.drop(columns=['Pass']).values

# Dependent variable: the Pass column
y = df['Pass'].values

# Hold out 20% of the observations for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21
)

# Report the shape of each split
for split_name, split_X, split_y in (
    ('Training Data:', X_train, y_train),
    ('Testing Data:', X_test, y_test),
):
    print(split_name, split_X.shape, split_y.shape)

### 4. Train the model

In [None]:
# Instantiate the classifier. The solver is named explicitly and max_iter is
# set to 300, so this is not a purely default configuration
# (scikit-learn documents max_iter=100 as the default).
logreg = LogisticRegression(solver='lbfgs', max_iter=300)

# Fit the model on the training split
logreg.fit(X_train,y_train)

In [None]:
# Print the fitted coefficients first, then the intercept
for fitted_param in (logreg.coef_, logreg.intercept_):
    print(fitted_param)

In [None]:
# Show the fitted coefficients as a DataFrame with one labelled column per feature.
# The labels come from the same column selection that was used to build X
# (every column of df except 'Pass'), so they cannot drift out of order or
# length relative to the matrix the model was trained on.
column_names = list(df.drop(['Pass'], axis=1).columns)
coefficient_df = pd.DataFrame(logreg.coef_, columns=column_names)
coefficient_df

### 5. Predict the test values

In [None]:
# Predict class labels for the held-out test set with the fitted model
y_pred = logreg.predict(X_test)

### 6. Evaluate the model using Confusion Matrix, Accuracy, Precision, Recall, F1-Score and ROC Curve

#### i) Confusion Matrix



In [None]:
# Confusion matrix comparing the actual test labels with the predicted labels
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)

In [None]:
# Draw the confusion matrix as an annotated heatmap on an explicit Axes
fig, ax = plt.subplots()
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g', ax=ax)

# Put the x-axis label above the plot
ax.xaxis.set_label_position("top")
fig.tight_layout()

ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')

# Replace the default integer tick text with the class names on both axes
class_names = ['Fail', 'Pass']
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names);

#### ii) Classification report – Accuracy, Precision, Recall, F1-Score

In [None]:
# Build the per-class metrics as a dict, then lay them out with one row per
# class / summary entry and one column per metric
report_dict = classification_report(y_test, y_pred, output_dict=True)
classificationReport = pd.DataFrame(report_dict).transpose()
classificationReport

#### iii) ROC Curve

In [None]:
def _encode_pass_fail(labels):
    """Return a 0/1 integer Series named 'Pass_Fail': 0 for 'Fail', 1 for anything else."""
    return pd.Series(labels, name='Pass_Fail').ne('Fail').astype('int64')


# Convert the string labels to integers for the ROC computation
y_test_int = _encode_pass_fail(y_test)
y_pred_int = _encode_pass_fail(y_pred)

In [None]:
# ROC analysis needs a continuous score per sample so the curve can sweep over
# decision thresholds. Hard 0/1 predictions collapse the curve to a single
# operating point and give a misleading AUC, so use the predicted probability
# of the positive class instead ('Pass', which is encoded as 1 in y_test_int).
pass_column = list(logreg.classes_).index('Pass')
y_score = logreg.predict_proba(X_test)[:, pass_column]

logit_roc_auc = metrics.roc_auc_score(y_test_int, y_score)
fpr, tpr, thresholds = metrics.roc_curve(y_test_int, y_score)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line: the ROC of a classifier that guesses at random
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Save the figure to the working directory before showing it
plt.savefig('Log_ROC')
plt.show()