### Import libraries:

In [1]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score
from sklearn.metrics import plot_confusion_matrix

### Load data:

In [2]:
data = pd.read_csv('../../data/clean/data_cleaned.csv')

data.head()

Unnamed: 0,gender,ethnicity,parental_education,lunch,test_preparation_course,pass/fail_math,math_score,pass/fail_reading,reading_score,pass/fail_writing,writing_score,failed_courses,midterm_results,average_grade
0,M,A,high school,standard,yes,passed,67,passed,67,passed,63,good standing,good standing,65.67
1,F,D,no high school,reduced,no,failed,40,failed,59,failed,55,math/reading,summer school,51.33
2,M,E,high school,reduced,no,failed,59,passed,60,failed,50,math/writing,summer school,56.33
3,M,B,high school,standard,no,passed,77,passed,78,passed,68,good standing,good standing,74.33
4,M,E,associate,standard,yes,passed,78,passed,73,passed,68,good standing,good standing,73.0


### X/y and numerical/categorical split:

In [3]:
X = data.drop(['pass/fail_math', 'pass/fail_reading', 'pass/fail_writing', 'math_score', 
               'reading_score', 'writing_score', 'average_grade', 'midterm_results', 'failed_courses'], axis=1)
y = data['midterm_results']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#X_train_numerical = X_train.select_dtypes(np.number)
#X_test_numerical = X_test.select_dtypes(np.number)

X_train_categorical = X_train.select_dtypes(object)
X_test_categorical = X_test.select_dtypes(object)

### Encoding categoricals:

In [4]:
encoder = OneHotEncoder()
encoder.fit(X_train_categorical)

X_train_cat_enc = encoder.transform(X_train_categorical).toarray()
X_train_final = pd.DataFrame(X_train_cat_enc, columns=encoder.get_feature_names_out())

X_test_cat_enc = encoder.transform(X_test_categorical).toarray()
X_test_final = pd.DataFrame(X_test_cat_enc, columns=encoder.get_feature_names_out())

In [12]:
pickle.dump(encoder, open('One_Hot_Encoder.pkl', 'wb'))

### Logistic regression:

In [5]:
classification = LogisticRegression(random_state=42, solver='lbfgs',
                  multi_class='multinomial').fit(X_train_final, y_train)

### Predictions:

In [6]:
y_train_pred = classification.predict(X_train_final)
y_test_pred = classification.predict(X_test_final)

### Validating the model:

In [7]:
def model_performance_classification(y_train, y_train_pred, y_test, y_test_pred):

    accuracy_train = accuracy_score(y_train,y_train_pred)
    accuracy_test  = accuracy_score(y_test,y_test_pred)

    precision_train = precision_score(y_train,y_train_pred, pos_label='good standing')
    precision_test  = precision_score(y_test,y_test_pred, pos_label='good standing')

    recall_train = recall_score(y_train,y_train_pred, pos_label='good standing')
    recall_test  = recall_score(y_train,y_train_pred, pos_label='good standing')

    f1_train = f1_score(y_train,y_train_pred, pos_label='good standing')
    f1_test  = f1_score(y_train,y_train_pred, pos_label='good standing')
    
    kappa_train = cohen_kappa_score(y_train, y_train_pred)
    kappa_test = cohen_kappa_score(y_test, y_test_pred)

    performance = pd.DataFrame({'Error_metric': ['Accuracy score','Precision score','Recall score',
                                                 'F1 score', 'Kappa score'],
                            'Train': [accuracy_train, precision_train, recall_train, f1_train, kappa_train],
                            'Test' : [accuracy_test, precision_test, recall_test, f1_test, kappa_test]})

    pd.options.display.float_format = '{:.2f}'.format

    df_train = pd.DataFrame({'Real': y_train.tolist(), 'Predicted': y_train_pred.tolist()})
    df_test  = pd.DataFrame({'Real': y_test.tolist(),  'Predicted': y_test_pred.tolist()})

    return performance, df_train, df_test

In [8]:
model_performance_classification(y_train, y_train_pred, y_test, y_test_pred)

(      Error_metric  Train  Test
 0   Accuracy score   0.69  0.68
 1  Precision score   0.69  0.74
 2     Recall score   0.82  0.82
 3         F1 score   0.75  0.75
 4      Kappa score   0.35  0.31,
               Real      Predicted
 0    good standing  good standing
 1    summer school  good standing
 2    good standing  good standing
 3    summer school  summer school
 4    summer school  summer school
 ..             ...            ...
 695  good standing  good standing
 696  good standing  summer school
 697  good standing  good standing
 698  good standing  good standing
 699  good standing  good standing
 
 [700 rows x 2 columns],
               Real      Predicted
 0    good standing  good standing
 1    good standing  good standing
 2    summer school  good standing
 3    good standing  good standing
 4    good standing  summer school
 ..             ...            ...
 295  good standing  summer school
 296  good standing  good standing
 297  summer school  summer school
 298

### Storing predictions in the dataframe:

In [9]:
y_test_pred_df = pd.DataFrame(y_test_pred)

y_test_pred_df.head()

Unnamed: 0,0
0,good standing
1,good standing
2,good standing
3,good standing
4,summer school


In [10]:
df = data.copy()

df['midterm_results_pred_logistic'] = y_test_pred_df

df.dropna(inplace=True)

df.to_csv('logistic_regression_pred.csv', index_label = False)

### Saving the model:

In [11]:
pickle.dump(classification, open('logistic_regression.pkl', 'wb'))