In [38]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import log_loss,accuracy_score,precision_score, recall_score, f1_score

# Load the dataset
df = pd.read_csv(r'C:\Users\sanja\Downloads\archive\dataset.csv')
df = df.dropna()

# Create a copy to leave the original dataset unchanged
students_df = df.copy() 

# Change the Target to Numerical Value 0 or 1. Predicting whether a student will graduate or dropout
students_df.replace("Dropout",1,inplace=True) 
students_df.replace("Graduate",0,inplace=True)
#students_df.replace("Enrolled",-1,inplace=True)
students_df.drop(students_df[students_df["Target"]=="Enrolled"].index,inplace=True)

# Split the data into training and test sets
y = students_df["Target"]
y = y.astype('int')
X = students_df.iloc[:,:-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Create the models
log_reg = LogisticRegression()
svm = SVC()
dtc = DecisionTreeClassifier()

# Fit the models on the training data
log_reg.fit(X_train, y_train)
svm.fit(X_train, y_train)
dtc.fit(X_train, y_train)

# Make predictions on the testing data
log_reg_y_pred = log_reg.predict(X_test)
svm_y_pred = svm.predict(X_test)
dtc_y_pred = dtc.predict(X_test)

# Calculate evaluation metrics
log_reg_accuracy = accuracy_score(y_test, log_reg_y_pred)
log_reg_precision = precision_score(y_test, log_reg_y_pred)
log_reg_recall = recall_score(y_test, log_reg_y_pred)
log_reg_f1 = f1_score(y_test, log_reg_y_pred)

svm_accuracy = accuracy_score(y_test, svm_y_pred)
svm_precision = precision_score(y_test, svm_y_pred)
svm_recall = recall_score(y_test, svm_y_pred)
svm_f1 = f1_score(y_test, svm_y_pred)

dtc_accuracy = accuracy_score(y_test, dtc_y_pred)
dtc_precision = precision_score(y_test, dtc_y_pred)
dtc_recall = recall_score(y_test, dtc_y_pred)
dtc_f1 = f1_score(y_test, dtc_y_pred)

# Print evaluation metrics
print("Logistic Regression:")
print("Accuracy Score: ", log_reg_accuracy)
print("Precision Score: ", log_reg_precision)
print("Recall Score: ", log_reg_recall)
print("F1 Score: ", log_reg_f1)

print("\nSVM:")
print("Accuracy Score: ", svm_accuracy)
print("Precision Score: ", svm_precision)
print("Recall Score: ", svm_recall)
print("F1 Score: ", svm_f1)

print("\nDecision Tree:")
print("Accuracy Score: ", dtc_accuracy)
print("Precision Score: ", dtc_precision)
print("Recall Score: ", dtc_recall)
print("F1 Score: ", dtc_f1)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression:
Accuracy Score:  0.9164370982552801
Precision Score:  0.9314720812182741
Recall Score:  0.851508120649652
F1 Score:  0.8896969696969697

SVM:
Accuracy Score:  0.8953168044077136
Precision Score:  0.9464788732394366
Recall Score:  0.7795823665893271
F1 Score:  0.8549618320610687

Decision Tree:
Accuracy Score:  0.8668503213957759
Precision Score:  0.8404761904761905
Recall Score:  0.8190255220417634
F1 Score:  0.8296122209165687


In [42]:
# Define the parameter grid for SVM
parameter_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': [0.1, 1, 10]}

# Create a GridSearchCV object
svm_grid = GridSearchCV(SVC(), parameter_grid, scoring='accuracy', cv=5, n_jobs=-1)

# Fit the GridSearchCV object on the training data
svm_grid.fit(X_train, y_train)

# Make predictions on the testing data using the best model
svm_y_pred = svm_grid.predict(X_test)

# Calculate evaluation metrics
svm_grid_accuracy = accuracy_score(y_test, svm_y_pred)
svm_grid_precision = precision_score(y_test, svm_y_pred)
svm_grid_recall = recall_score(y_test, svm_y_pred)
svm_grid_f1 = f1_score(y_test, svm_y_pred)

# Print evaluation metrics
print("\nSVM with GridSearch:")
print("Accuracy Score: ", svm_grid_accuracy)
print("Precision Score: ", svm_grid_precision)
print("Recall Score: ", svm_grid_recall)
print("F1 Score: ", svm_grid_f1)

# Print the best score and parameters
print("Best Score: ", svm_grid.best_score_)
print("Best Parameters: ", svm_grid.best_params_)

# Define the parameter grid for Decision Tree
parameter_grid = {'max_depth': [1, 5, 10, 15], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}

# Create a GridSearchCV object
dtree_grid = GridSearchCV(DecisionTreeClassifier(), parameter_grid, scoring='accuracy', cv=5, n_jobs=-1)

# Fit the GridSearchCV object on the training data
dtree_grid.fit(X_train, y_train)

# Make predictions on the testing data using the best model
dtree_y_pred = dtree_grid.predict(X_test)

# Calculate evaluation metrics
dtree_grid_accuracy = accuracy_score(y_test, dtree_y_pred)
dtree_grid_precision = precision_score(y_test, dtree_y_pred)
dtree_grid_recall = recall_score(y_test, dtree_y_pred)
dtree_grid_f1 = f1_score(y_test, dtree_y_pred)

# Print evaluation metrics
print("\nDecision Tree with GridSearch:")
print("Accuracy Score: ", dtree_grid_accuracy)
print("Precision Score: ", dtree_grid_precision)
print("Recall Score: ", dtree_grid_recall)
print("F1 Score: ", dtree_grid_f1)

# Print the best score and parameters
print("Best Score: ", dtree_grid.best_score_)
print("Best Parameters: ", dtree_grid.best_params_)

# Define the parameter grid for logistic regression
parameter_grid = {'C': [0.1, 1, 5], 'penalty': ['l1', 'l2']}

# Create a GridSearchCV object
lr_grid = GridSearchCV(LogisticRegression(), parameter_grid, scoring='accuracy', cv=5, n_jobs=-1)

# Fit the GridSearchCV object on the training data
lr_grid.fit(X_train, y_train)

# Make predictions on the testing data using the best model
lr_y_pred = lr_grid.predict(X_test)

# Calculate evaluation metrics
lr_accuracy = accuracy_score(y_test, lr_y_pred)
lr_precision = precision_score(y_test, lr_y_pred)
lr_recall = recall_score(y_test, lr_y_pred)
lr_f1 = f1_score(y_test, lr_y_pred)

# Print evaluation metrics
print("\nLogistic Regression with GridSearch:")
print("Accuracy Score: ", lr_accuracy)
print("Precision Score: ", lr_precision)
print("Recall Score: ", lr_recall)
print("F1 Score: ", lr_f1)

# Print the best score and parameters
print("Best Score: ", lr_grid.best_score_)
print("Best Parameters: ", lr_grid.best_params_)



SVM with GridSearch:
Accuracy Score:  0.9173553719008265
Precision Score:  0.9428571428571428
Recall Score:  0.8422273781902552
F1 Score:  0.8897058823529411
Best Score:  0.9082963352567177
Best Parameters:  {'C': 0.1, 'gamma': 0.1, 'kernel': 'linear'}

Decision Tree with GridSearch:
Accuracy Score:  0.8888888888888888
Precision Score:  0.9122340425531915
Recall Score:  0.7958236658932715
F1 Score:  0.8500619578686494
Best Score:  0.8843014711569698
Best Parameters:  {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 5}

Logistic Regression with GridSearch:
Accuracy Score:  0.9164370982552801
Precision Score:  0.9314720812182741
Recall Score:  0.851508120649652
F1 Score:  0.8896969696969697
Best Score:  0.9118396423433317
Best Parameters:  {'C': 1, 'penalty': 'l2'}


15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\sanja\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sanja\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\sanja\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

STOP: TOTAL NO. of ITERATION

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Create a decision tree classifier object
tree = DecisionTreeClassifier()

# Train the model on the training data
tree.fit(X_train, y_train)

# Plot the decision tree
plt.figure(figsize=(215,210))
plot_tree(tree, filled=True, feature_names=X.columns, class_names=['Graduate', 'Dropout'])
plt.show()

In [None]:
# Define the metrics to plot
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

# Define the scores for each model and metric
log_reg_scores = [accuracy_score(y_test, log_reg_y_pred), 
                  precision_score(y_test, log_reg_y_pred), 
                  recall_score(y_test, log_reg_y_pred), 
                  f1_score(y_test, log_reg_y_pred)]

svm_scores = [accuracy_score(y_test, svm_y_pred), 
              precision_score(y_test, svm_y_pred), 
              recall_score(y_test, svm_y_pred), 
              f1_score(y_test, svm_y_pred)]

tree_scores = [accuracy_score(y_test, dtc_y_pred), 
               precision_score(y_test, dtc_y_pred), 
               recall_score(y_test, dtc_y_pred), 
               f1_score(y_test, dtc_y_pred)]

# Plot the bar chart
x = np.arange(len(metrics))
width = 0.25
fig, ax = plt.subplots(figsize=(15, 9))
ax.bar(x - width, log_reg_scores, width, label='Logistic Regression')
ax.bar(x, svm_scores, width, label='SVM')
ax.bar(x + width, tree_scores, width, label='Decision Tree')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()
ax.set_ylabel('Score')
ax.set_title('Comparison of Model Performance')
plt.show()


In [None]:
import seaborn as sns

# Plot histogram of features
sns.set(style="ticks", color_codes=True)
sns.histplot(data=df, x="Age at enrollment", hue="Target", kde=True)
plt.title("Age Distribution")
plt.show()


In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
pca = PCA(n_components=2)
pca.fit(X_scaled)
X_pca = pca.transform(X_scaled)

pca_df = pd.DataFrame(data = X_pca, columns = ['PC1', 'PC2'])
pca_df['Target'] = y_train.values

plt.figure(figsize=(8,6))
sns.scatterplot(x=pca_df['PC1'], y=pca_df['PC2'], hue=pca_df['Target'])
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA Scatter Plot')
plt.show()

In [None]:
# Get the principal components and their weights
pcs = pca.components_
weights = np.abs(pcs)

# Find the component with the highest contribution
highest_contrib_comp = np.argmax(weights.sum(axis=1))

# Print the column names and weights for the highest contributing component

print("Weights: ", pcs[highest_contrib_comp])

In [None]:
# Finding highest contributor to the PCA
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
pca = PCA(n_components=2)
pca.fit(X_train_scaled)
loadings = pd.DataFrame(pca.components_.T, columns=['PC1', 'PC2'], index=X_train.columns)
contributions = pd.DataFrame(np.abs(loadings) * pca.explained_variance_ratio_, columns=['PC1', 'PC2'], index=X_train.columns)
total_contributions = contributions.sum(axis=1)
highest_contributor = total_contributions.idxmax()
print(f'The column with the highest contribution to PCA is {highest_contributor}, with a total contribution of {total_contributions[highest_contributor]:.3f}')

In [None]:
labels = ['Regular SVM', 'GridSearchCV SVM']
accuracy_scores = [svm_accuracy, svm_grid_accuracy]
plt.bar(labels, accuracy_scores, width=0.4, color='green')
for i, v in enumerate(accuracy_scores):
    plt.text(i - 0.12, v + 0.02, str(round(v, 3)), fontweight='bold')
plt.ylim([0, 1])
plt.title('Accuracy Scores Comparison')
plt.show()
print(accuracy_scores)