In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from imblearn.over_sampling import SMOTE
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Seed NumPy's legacy global RNG so the synthetic dataset is reproducible.
np.random.seed(0)
num_entries = 10000


def generate_time_percentages(num_entries):
    """Draw how each test's time was split across Hard/Medium/Easy questions.

    Parameters
    ----------
    num_entries : int
        Number of rows (tests) to generate.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_entries, 3)``; each row is a sample from a flat
        Dirichlet distribution scaled by 100, so every row sums to 100.
    """
    return np.random.dirichlet(np.ones(3), size=num_entries) * 100


time_percentages = generate_time_percentages(num_entries)

# The order of the random draws below (Hard, Medium, Easy, then hardness
# level) determines the generated values for a given seed; keep it stable.
dataset = pd.DataFrame({
    'Hard Problem Solved': np.random.rand(num_entries),
    'Medium Problem Solved': np.random.rand(num_entries),
    'Easy Problem Solved': np.random.rand(num_entries),
    'Time of the test spent on solving Hard questions': time_percentages[:, 0],
    'Time of the test spent on solving Medium questions': time_percentages[:, 1],
    'Time of the test spent on solving Easy questions': time_percentages[:, 2],
    'Hardness level of each test': np.random.randint(1, 6, num_entries)
})

# Exclusive lower bounds on the solved fraction needed to pass, keyed by
# hardness level: level -> (easy, medium, hard).
passing_thresholds = {
    1: (0.8, 0.7, 0.6),
    2: (0.75, 0.65, 0.55),
    3: (0.65, 0.55, 0.5),
    4: (0.6, 0.5, 0.45),
    5: (0.55, 0.45, 0.4),
}

# One boolean mask per hardness level, evaluated over every row.
passing_criteria = {
    level: (dataset['Easy Problem Solved'] > easy_min)
    & (dataset['Medium Problem Solved'] > medium_min)
    & (dataset['Hard Problem Solved'] > hard_min)
    for level, (easy_min, medium_min, hard_min) in passing_thresholds.items()
}

# A row passes when it satisfies the criteria belonging to its own hardness
# level. Building an explicit boolean mask avoids relying on index alignment
# and a NaN -> 0 fill to produce the failing rows.
hardness_level = dataset['Hardness level of each test']
passed = pd.Series(False, index=dataset.index)
for level, criteria in passing_criteria.items():
    passed |= (hardness_level == level) & criteria
dataset['Result'] = passed.astype(int)

# Shuffle the rows with a fixed seed and renumber the index from 0.
shuffled_df = dataset.sample(frac=1, random_state=42).reset_index(drop=True)
dataset = shuffled_df

In [None]:
# Balance the two 'Result' classes by oversampling the minority class.
# A fixed random_state makes the synthetic samples (and every metric computed
# from them downstream) repeatable across kernel restarts.
# NOTE(review): oversampling happens here on the full dataset, before any
# train/validation/test split, so held-out splits taken from `resampled_data`
# will contain synthetic rows — consider resampling the training split only.
# NOTE(review): 'Hardness level of each test' is an integer level; presumably
# SMOTE treats it as continuous when synthesising rows — confirm this is
# acceptable.
smote = SMOTE(random_state=42)
X = dataset.drop('Result', axis=1)
y = dataset['Result']
X_res, y_res = smote.fit_resample(X, y)

print('Original dataset shape %s' % Counter(y))
print('Resampled dataset shape %s' % Counter(y_res))

# Reassemble features and target into a single frame for the modelling cells.
resampled_data = pd.DataFrame(X_res, columns=X.columns)
resampled_data['Result'] = y_res
resampled_data.shape

Original dataset shape Counter({0: 9218, 1: 782})
Resampled dataset shape Counter({1: 9218, 0: 9218})


(18436, 8)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
# Separate the target column from the features of the resampled frame.
y = resampled_data['Result']
X = resampled_data.drop(columns='Result')

# 60% train; the remaining 40% is halved into validation ("cv") and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_cv, X_test, y_cv, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Logistic-regression baseline; fit() returns the fitted estimator itself.
logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

y_cv_pred = logreg.predict(X_cv)
y_test_pred = logreg.predict(X_test)

# Report the validation split first, then the test split.
for heading, truth, predicted in (
    ('Classification Report for Cross-Validation Set:', y_cv, y_cv_pred),
    ('Classification Report for Test Set:', y_test, y_test_pred),
):
    print(heading)
    print(classification_report(truth, predicted))

conf_matrix = confusion_matrix(y_test, y_test_pred)
print('Confusion Matrix for Test Set:')
print(conf_matrix)

Classification Report for Cross-Validation Set:
              precision    recall  f1-score   support

           0       0.96      0.92      0.94      1829
           1       0.92      0.96      0.94      1858

    accuracy                           0.94      3687
   macro avg       0.94      0.94      0.94      3687
weighted avg       0.94      0.94      0.94      3687

Classification Report for Test Set:
              precision    recall  f1-score   support

           0       0.97      0.89      0.92      1873
           1       0.89      0.97      0.93      1815

    accuracy                           0.93      3688
   macro avg       0.93      0.93      0.93      3688
weighted avg       0.93      0.93      0.93      3688

Confusion Matrix for Test Set:
[[1662  211]
 [  59 1756]]


In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# Fit both tree ensembles on the training split with fixed seeds.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
gb_model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Score each model on the validation ("cv") split.
y_cv_rf_pred = rf_model.predict(X_cv)
y_cv_gb_pred = gb_model.predict(X_cv)

rf_report = classification_report(y_cv, y_cv_rf_pred)
gb_report = classification_report(y_cv, y_cv_gb_pred)

for heading, report in (
    ('Random Forest Classification Report for Cross-Validation Set:', rf_report),
    ('Gradient Boosting Classification Report for Cross-Validation Set:', gb_report),
):
    print(heading)
    print(report)

Random Forest Classification Report for Cross-Validation Set:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1829
           1       0.99      1.00      0.99      1858

    accuracy                           0.99      3687
   macro avg       0.99      0.99      0.99      3687
weighted avg       0.99      0.99      0.99      3687

Gradient Boosting Classification Report for Cross-Validation Set:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      1829
           1       0.98      1.00      0.99      1858

    accuracy                           0.99      3687
   macro avg       0.99      0.99      0.99      3687
weighted avg       0.99      0.99      0.99      3687



In [None]:
from sklearn.ensemble import VotingClassifier
# Soft-voting ensemble over the three models defined in the earlier cells.
ensemble_members = [('lr', logreg), ('rf', rf_model), ('gb', gb_model)]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft').fit(X_train, y_train)

# Evaluate the ensemble on the validation ("cv") split.
y_cv_ensemble_pred = voting_clf.predict(X_cv)
ensemble_report = classification_report(y_cv, y_cv_ensemble_pred)

print('Ensemble Method Classification Report for Cross-Validation Set:')
print(ensemble_report)

Ensemble Method Classification Report for Cross-Validation Set:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      1829
           1       0.98      1.00      0.99      1858

    accuracy                           0.99      3687
   macro avg       0.99      0.99      0.99      3687
weighted avg       0.99      0.99      0.99      3687



In [None]:
# from sklearn.model_selection import train_test_split
# from sklearn.neural_network import MLPClassifier
# from sklearn.metrics import accuracy_score
# file_path = '/content/Clustered_Student_Performance.csv'
# data = pd.read_csv(file_path)
# X = data[['Hardness of Skill Tested', 'Percentage of Hard Questions Correct', 'Percentage of Medium Questions Correct', 'Percentage of Easy Questions Correct', 'Accuracy on Hard Questions', 'Accuracy on Medium Questions', 'Accuracy on Easy Questions', 'Percentage of Time Left on Clock']]
# y = data['Passed']

# # Split the data into training and test sets (70% train, 30% test)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# # Initialize the MLPClassifier
# mlp = MLPClassifier(random_state=42)

# mlp.fit(X_train, y_train)

# y_pred = mlp.predict(X_test)

# accuracy = accuracy_score(y_test, y_pred)

# print('Neural Network Model Accuracy: {:.2f}%'.format(accuracy * 100))
# Feed-forward neural-network classifier (TensorFlow/Keras) trained on the
# student-performance CSV.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm

tqdm.pandas()

file_path = 'Clustered_Student_Performance.csv'
data = pd.read_csv(file_path)

# Columns used as model inputs; 'Passed' is the target.
feature_columns = [
    'Hardness of Skill Tested',
    'Percentage of Hard Questions Correct',
    'Percentage of Medium Questions Correct',
    'Percentage of Easy Questions Correct',
    'Accuracy on Hard Questions',
    'Accuracy on Medium Questions',
    'Accuracy on Easy Questions',
    'Percentage of Time Left on Clock',
]
X = data[feature_columns].values
y = data['Passed'].values

# Two-column categorical encoding of the target, matching the 2-unit softmax
# output layer and the categorical cross-entropy loss below.
y = to_categorical(y, num_classes=2)

# 70% train / 30% test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the scaler on the training split only, then apply it to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Two hidden ReLU layers (64 and 32 units) followed by a 2-way softmax.
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=100, batch_size=10, verbose=0)

# evaluate() returns [loss, accuracy] given the metrics compiled above.
scores = model.evaluate(X_test, y_test, verbose=0)
accuracy = scores[1] * 100

print('Neural Network Model Accuracy with TensorFlow/Keras: {:.2f}%'.format(accuracy))

In [None]:
from sklearn.metrics import confusion_matrix
# Turn the network's per-class probabilities into hard class labels (the
# index of the most probable class for each test row).
y_pred = np.argmax(model.predict(X_test, verbose=0), axis=1)

# confusion_matrix needs 1-D class labels; collapse `y_test` when it is
# one-hot encoded (2-D), otherwise use it as-is.
y_true = np.argmax(y_test, axis=1) if np.ndim(y_test) > 1 else np.asarray(y_test)

# Passing labels=[0, 1] fixes the matrix at 2x2 even if one class is absent.
conf_matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])

# Rows are true classes, columns are predicted classes, so the flattened
# order is TN, FP, FN, TP.
true_negative, false_positive, false_negative, true_positive = conf_matrix.ravel()

# Print the values
print('True Positives:', true_positive)
print('True Negatives:', true_negative)
print('False Positives:', false_positive)
print('False Negatives:', false_negative)

True Positives: 610
True Negatives: 2228
False Positives: 31
False Negatives: 131
