In [128]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

df = pd.read_csv('/Users/p.devine/Documents/GitHub/Academic-Success-Predictor/data/df_vif_cleaned.csv')

X = df.drop(columns=['result_pass'])

y = df['result_pass'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

logreg = LogisticRegression(max_iter=1000)

logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, y_pred):.4f}')
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))
print('Classification Report:')
print(classification_report(y_test, y_pred))

Accuracy: 0.6769
Confusion Matrix:
[[30 27]
 [15 58]]
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.53      0.59        57
           1       0.68      0.79      0.73        73

    accuracy                           0.68       130
   macro avg       0.67      0.66      0.66       130
weighted avg       0.68      0.68      0.67       130



In [129]:
# Train a logistic regression model with L1 regularization
model_l1 = LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000)
model_l1.fit(X_train, y_train)

# Print feature importance
importance = np.abs(model_l1.coef_).flatten()
feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': importance})
print(feature_importance.sort_values(by='Importance', ascending=False))

#L1 (lasso) Regularisation to see which features matters most

              Feature  Importance
2            failures    1.733863
3           schoolsup    1.181316
22       Mjob_teacher    1.080087
19        Mjob_health    1.019083
14          school_MS    0.884254
20         Mjob_other    0.861034
21      Mjob_services    0.843064
6          activities    0.495580
1           studytime    0.449650
30    guardian_mother    0.419408
15              sex_M    0.408617
10               Dalc    0.319021
16          address_U    0.310482
27        reason_home    0.253401
28       reason_other    0.249062
25      Fjob_services    0.190222
8            internet    0.164368
5                paid    0.148761
17        famsize_LE3    0.131896
26       Fjob_teacher    0.125307
0          traveltime    0.099197
12             health    0.082294
13           absences    0.076128
11               Walc    0.038006
9            romantic    0.030481
29  reason_reputation    0.028234
7             nursery    0.021545
18          Pstatus_T    0.000000
23        Fjob

In [130]:
df2 = df.drop(columns = ['guardian_other', 'famsup', 'Fjob_other', 'Fjob_health', 'Pstatus_T', 'nursery', 'reason_reputation', 'romantic', 'Walc'])

X = df2.drop(columns=['result_pass'])

y = df2['result_pass'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

logreg = LogisticRegression(max_iter=1000)

logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, y_pred):.4f}')
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))
print('Classification Report:')
print(classification_report(y_test, y_pred))


Accuracy: 0.6923
Confusion Matrix:
[[30 27]
 [13 60]]
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.53      0.60        57
           1       0.69      0.82      0.75        73

    accuracy                           0.69       130
   macro avg       0.69      0.67      0.68       130
weighted avg       0.69      0.69      0.68       130



In [131]:
print(df["result_pass"].value_counts())

result_pass
1    348
0    301
Name: count, dtype: int64


In [132]:
from imblearn.over_sampling import SMOTE

X = df2.drop(columns=['result_pass'])
y = df2['result_pass']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

model = LogisticRegression(max_iter=1000)

model.fit(X_train_smote, y_train_smote)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7153846153846154
Confusion Matrix:
 [[34 23]
 [14 59]]
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.60      0.65        57
           1       0.72      0.81      0.76        73

    accuracy                           0.72       130
   macro avg       0.71      0.70      0.70       130
weighted avg       0.71      0.72      0.71       130



In [135]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Define hyperparameter grid
param_grid = {"C": [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    "solver": ["liblinear", "lbfgs", "saga"],  # Different solvers for optimization
}

# Initialize logistic regression model
log_reg = LogisticRegression(max_iter=1000)

# Perform Grid Search with Cross-Validation (CV=5)
grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train_smote, y_train_smote)

# Best parameters and best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

y_pred_best = best_model.predict(X_test)

print(f"Best Parameters: {best_params}")
print("Accuracy:", accuracy_score(y_test, y_pred_best))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_best))
print("Classification Report:\n", classification_report(y_test, y_pred_best))


Best Parameters: {'C': 1, 'solver': 'lbfgs'}
Accuracy: 0.7153846153846154
Confusion Matrix:
 [[34 23]
 [14 59]]
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.60      0.65        57
           1       0.72      0.81      0.76        73

    accuracy                           0.72       130
   macro avg       0.71      0.70      0.70       130
weighted avg       0.71      0.72      0.71       130



In [136]:

misclassified = X_test[y_test != y_pred_best]
misclassified['true_label'] = y_test[y_test != y_pred_best]
misclassified['predicted_label'] = y_pred_best[y_test != y_pred_best]

# Analyze common features in misclassified instances
print(misclassified.describe())
print(misclassified.groupby('true_label').mean())

       traveltime  studytime   failures  schoolsup       paid  activities  \
count   37.000000  37.000000  37.000000  37.000000  37.000000   37.000000   
mean     1.540541   2.081081   0.054054   0.162162   0.054054    0.351351   
std      0.730091   0.924313   0.229243   0.373684   0.229243    0.483978   
min      1.000000   1.000000   0.000000   0.000000   0.000000    0.000000   
25%      1.000000   1.000000   0.000000   0.000000   0.000000    0.000000   
50%      1.000000   2.000000   0.000000   0.000000   0.000000    0.000000   
75%      2.000000   3.000000   0.000000   0.000000   0.000000    1.000000   
max      4.000000   4.000000   1.000000   1.000000   1.000000    1.000000   

        internet       Dalc     health   absences  ...  Mjob_other  \
count  37.000000  37.000000  37.000000  37.000000  ...   37.000000   
mean    0.756757   1.459459   3.702703   3.621622  ...    0.405405   
std     0.434959   0.730091   1.488192   5.386977  ...    0.497743   
min     0.000000   1.00000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  misclassified['true_label'] = y_test[y_test != y_pred_best]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  misclassified['predicted_label'] = y_pred_best[y_test != y_pred_best]


In [137]:
df2['studytime_failures'] = df2['studytime'] * df2['failures']
df2['paid_schoolsup'] = df2['paid'] * df2['schoolsup']
df2['health_absences'] = df2['health'] * df2['absences']
df2['absences_studytime'] = df2['absences'] * df2['studytime']
df2['Dalc_health'] = df2['Dalc'] * df2['health']
df2['traveltime_schoolsup'] = df2['traveltime'] * df2['schoolsup']

# Step 2: Define features (X) and target variable (y)
X = df2.drop(columns=['result_pass'])
y = df2['result_pass']

# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Step 5: Train a Logistic Regression model on the SMOTE-resampled data
model = LogisticRegression(max_iter=1000)
model.fit(X_train_smote, y_train_smote)

# Step 6: Predict on the test set
y_pred = model.predict(X_test)

# Step 7: Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7
Confusion Matrix:
 [[34 23]
 [16 57]]
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.60      0.64        57
           1       0.71      0.78      0.75        73

    accuracy                           0.70       130
   macro avg       0.70      0.69      0.69       130
weighted avg       0.70      0.70      0.70       130



In [138]:
# Train a logistic regression model with L1 regularization
model_l1 = LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000)
model_l1.fit(X_train_smote, y_train_smote)

# Print feature importance
importance = np.abs(model_l1.coef_).flatten()
feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': importance})
print(feature_importance.sort_values(by='Importance', ascending=False))

#L1 (lasso) Regularisation to see which features matters most

                 Feature  Importance
3              schoolsup    1.474435
17          Mjob_teacher    1.248444
14           Mjob_health    1.213139
16         Mjob_services    1.084370
15            Mjob_other    0.982913
2               failures    0.980470
10             school_MS    0.815256
1              studytime    0.635010
5             activities    0.579877
23    studytime_failures    0.437514
11                 sex_M    0.380247
12             address_U    0.341059
22       guardian_mother    0.334945
20           reason_home    0.301534
28  traveltime_schoolsup    0.284330
4                   paid    0.237781
19          Fjob_teacher    0.202535
7                   Dalc    0.199556
21          reason_other    0.173775
0             traveltime    0.154850
18         Fjob_services    0.146586
13           famsize_LE3    0.080895
8                 health    0.073853
26    absences_studytime    0.050723
27           Dalc_health    0.050595
24        paid_schoolsup    0.021382
2

In [139]:
# Step 2: Define features (X) and target variable (y)
X = df2.drop(columns=['result_pass', 'absences', 'internet'])
y = df2['result_pass']

# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Step 5: Train a Logistic Regression model on the SMOTE-resampled data
model = LogisticRegression(max_iter=1000)
model.fit(X_train_smote, y_train_smote)

# Step 6: Predict on the test set
y_pred = model.predict(X_test)

# Step 7: Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7
Confusion Matrix:
 [[32 25]
 [14 59]]
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.56      0.62        57
           1       0.70      0.81      0.75        73

    accuracy                           0.70       130
   macro avg       0.70      0.68      0.69       130
weighted avg       0.70      0.70      0.69       130



In [140]:
from sklearn.model_selection import KFold, cross_val_score

classifier = LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000)

kfold = KFold(10, shuffle = True, random_state =42)
for i, (train_index, test_index) in enumerate(kfold.split(X,y)):
    print(f"Fold {i}:")
    print(f" Train: index={train_index}")
    print(f" Test: index={test_index}")

scores = cross_val_score(classifier, X ,y, cv=kfold, scoring = 'accuracy')

average_acc = np.mean(scores)

print(f"Accuracy Score for each fold: {[round(score,4) for score in scores]}")
print(f"Average accuracy across 10 folds: {average_acc: .2f}")


Fold 0:
 Train: index=[  0   1   3   4   5   6   7   8   9  11  12  13  14  15  16  17  18  19
  20  21  22  23  24  25  26  27  28  29  32  33  34  35  36  37  38  39
  40  41  42  43  45  46  47  48  49  50  51  52  53  56  57  58  59  60
  61  62  64  65  66  67  68  70  71  73  74  75  77  79  80  82  83  84
  85  86  87  88  89  91  92  93  94  95  96  97  98  99 100 102 103 104
 105 106 107 108 109 111 112 113 114 115 116 117 118 119 120 121 122 123
 124 125 126 127 128 129 130 132 133 134 135 136 137 138 139 140 141 142
 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 159 160 161
 162 163 164 166 167 168 169 170 171 172 173 175 178 179 180 182 183 184
 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202
 203 204 205 206 207 208 211 212 213 214 215 216 217 218 219 221 222 223
 224 225 226 227 228 229 230 232 233 234 235 236 237 238 239 240 241 242
 243 244 245 246 248 250 251 252 253 254 255 256 257 258 259 260 261 262
 263 265 266 267 268 269 270 

In [141]:

from sklearn.model_selection import StratifiedKFold

# Step 1: Define features (X) and target variable (y)
X = df2.drop(columns=['result_pass', 'absences', 'internet'])
y = df2['result_pass']

# Step 2: Initialize the SMOTE object
smote = SMOTE(random_state=42)

# Step 3: Initialize the Stratified K-Fold Cross-Validator (ensures each fold has the same distribution of the target variable)
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Step 4: Initialize variables to store performance metrics
accuracies = []
conf_matrices = []
class_reports = []

# Step 5: Perform K-Fold Cross-Validation
for train_index, val_index in kf.split(X, y):
    # Split the data into training and validation sets for this fold
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    # Apply SMOTE to the training set to handle class imbalance
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    
    # Step 6: Train the logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_smote, y_train_smote)
    
    # Step 7: Predict on the validation set
    y_pred = model.predict(X_val)
    
    # Step 8: Evaluate the model for this fold
    accuracies.append(accuracy_score(y_val, y_pred))
    conf_matrices.append(confusion_matrix(y_val, y_pred))
    class_reports.append(classification_report(y_val, y_pred, output_dict=True))

# Step 9: Calculate average performance metrics across all folds
avg_accuracy = np.mean(accuracies)
avg_conf_matrix = np.mean(conf_matrices, axis=0)
avg_class_report = np.mean([report['accuracy'] for report in class_reports])

# Step 10: Print the results
print("Average Accuracy: {:.4f}".format(avg_accuracy))
print("Average Confusion Matrix:\n", avg_conf_matrix)
print("Average Classification Report (accuracy): {:.4f}".format(avg_class_report))




Average Accuracy: 0.7319
Average Confusion Matrix:
 [[20.5  9.6]
 [ 7.8 27. ]]
Average Classification Report (accuracy): 0.7319


In [151]:
from sklearn.model_selection import GridSearchCV

# Step 1: Define features (X) and target variable (y)
X = df2.drop(columns=['result_pass', 'absences', 'internet'])
y = df2['result_pass']

# Step 2: Initialize the Stratified K-Fold Cross-Validator
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Step 3: Initialize the SMOTE object
smote = SMOTE(random_state=42)

# Step 4: Define hyperparameter grid for tuning
param_grid = {
    'penalty': ['l1', 'l2'],  # Regularization type
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'solver': ['liblinear', 'saga'],  # Solver to use
    'max_iter': [100, 500, 1000]  # Maximum number of iterations
}

# Step 5: Initialize Logistic Regression model
model = LogisticRegression()

# Step 6: Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=model, 
    param_grid=param_grid, 
    cv=kf,  # Use StratifiedKFold cross-validation
    scoring='accuracy',  # Evaluate based on accuracy
    n_jobs=-1  # Use all available CPU cores
)

# Step 7: Apply SMOTE and fit the GridSearchCV
best_params = None
best_score = 0

for train_index, val_index in kf.split(X, y):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    # Apply SMOTE to the training set
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    
    # Step 8: Fit GridSearchCV on the resampled data
    grid_search.fit(X_train_smote, y_train_smote)
    
    # Step 9: Track the best parameters and score
    if grid_search.best_score_ > best_score:
        best_score = grid_search.best_score_
        best_params = grid_search.best_params_

# Step 10: Print the best hyperparameters and best score
print(f"Best Hyperparameters: {best_params}")
print(f"Best Accuracy: {best_score:.4f}")




Best Hyperparameters: {'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Accuracy: 0.7663


