In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import numpy as np

import pandas as pd

# Load the file
file_path = 'C:/Users/Prachi/Documents/Data Science Masters/Thesis/Code/Experiment 3/student_score_per_topic_with_total.xlsx'
data = pd.read_excel(file_path)

# Display the first few rows to understand the structure
data.head()

# Define features and target
X = data.drop(columns=['Total', 'Total_percentage', 'Pass'])
y = data['Pass']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply Logistic Regression with L1 regularization (Lasso)
logreg_l1 = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
logreg_l1.fit(X_train_scaled, y_train)

# Get the feature importance (coefficients)
feature_importance = np.abs(logreg_l1.coef_[0])

# Map the importance back to the feature names
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

print(feature_importance_df)

                                  Feature  Importance
1                                  Search    2.290782
5                Knowledge_Representation    2.037486
4                                   Logic    1.975297
6                                Planning    1.858909
3  Constraint_Satisfaction_Or_Propagation    1.795093
2                      Adversarial_Search    1.144585
0                                  Prolog    0.634842


In [5]:
from sklearn.feature_selection import RFE

# Perform Recursive Feature Elimination with Logistic Regression
rfe_selector = RFE(estimator=LogisticRegression(solver='liblinear', random_state=42), n_features_to_select=1)
rfe_selector.fit(X_train_scaled, y_train)

# Get RFE rankings
rfe_ranking = pd.DataFrame({
    'Feature': X.columns,
    'RFE_Ranking': rfe_selector.ranking_
}).sort_values(by='RFE_Ranking')


# Display the RFE ranking
print(rfe_ranking)


                                  Feature  RFE_Ranking
1                                  Search            1
5                Knowledge_Representation            2
3  Constraint_Satisfaction_Or_Propagation            3
4                                   Logic            4
6                                Planning            5
2                      Adversarial_Search            6
0                                  Prolog            7


In [6]:
# Step 1: Select top features from L1 regularization and RFE
top_features_l1 = ['Search', 'Knowledge_Representation', 'Logic']  # Replace with your actual top features from L1
top_features_rfe = ['Search', 'Logic', 'Planning']  # Replace with your actual top features from RFE

# Combine these features (you can decide whether to take a union or intersection of the features)
selected_features = list(set(top_features_l1).union(set(top_features_rfe)))  # Union of both sets

# Step 2: Create a reduced feature set based on the selected features
X_selected = X[selected_features]

# Step 3: Train a Logistic Regression model using the reduced feature set
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 4: Evaluate the model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred = model.predict(X_test)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))


Accuracy: 0.9555555555555556
Precision: 0.9574468085106383
Recall: 0.9183673469387755
F1 Score: 0.9375


In [24]:
# Import necessary libraries
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier



# Define features and target (using only top 2 features)
top_2_features = ['Search', 'Knowledge_Representation', 'Logic', 'Planning']
#top_2_features = ['Search', 'Knowledge_Representation']
X = data[top_2_features]
y = data['Pass']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Normal Logistic Regression (no regularization)
logreg_normal = LogisticRegression( random_state=42)
logreg_normal.fit(X_train_scaled, y_train)
y_pred_normal = logreg_normal.predict(X_test_scaled)

# Logistic Regression with L1 (Lasso)
logreg_l1 = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
logreg_l1.fit(X_train_scaled, y_train)
y_pred_l1 = logreg_l1.predict(X_test_scaled)

# Logistic Regression with L2 (Ridge)
logreg_l2 = LogisticRegression(penalty='l2', solver='lbfgs', random_state=42)
logreg_l2.fit(X_train_scaled, y_train)
y_pred_l2 = logreg_l2.predict(X_test_scaled)

# Decision Tree model
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)

# Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Calculate metrics for Normal Logistic Regression
precision_normal = precision_score(y_test, y_pred_normal)
recall_normal = recall_score(y_test, y_pred_normal)
accuracy_normal = accuracy_score(y_test, y_pred_normal)
f1_normal = f1_score(y_test, y_pred_normal)

# Calculate metrics for L1 Logistic Regression
precision_l1 = precision_score(y_test, y_pred_l1)
recall_l1 = recall_score(y_test, y_pred_l1)
accuracy_l1 = accuracy_score(y_test, y_pred_l1)
f1_l1 = f1_score(y_test, y_pred_l1)

# Calculate metrics for L2 Logistic Regression
precision_l2 = precision_score(y_test, y_pred_l2)
recall_l2 = recall_score(y_test, y_pred_l2)
accuracy_l2 = accuracy_score(y_test, y_pred_l2)
f1_l2 = f1_score(y_test, y_pred_l2)

# Calculate metrics for Decision Tree
precision_tree = precision_score(y_test, y_pred_tree)
recall_tree = recall_score(y_test, y_pred_tree)
accuracy_tree = accuracy_score(y_test, y_pred_tree)
f1_tree = f1_score(y_test, y_pred_tree)

# Calculate metrics for Random Forest
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

# Collect results in a DataFrame to display
results_df = pd.DataFrame({
    'Model': ['Normal Logistic Regression', 'L1 Logistic Regression ', 'L2 Logistic Regression', 'Decision Tree', 'Random Forest'],
    'Accuracy': [accuracy_normal, accuracy_l1, accuracy_l2, accuracy_tree, accuracy_rf],
    'Precision': [precision_normal, precision_l1, precision_l2, precision_tree, precision_rf],
    'Recall': [recall_normal, recall_l1, recall_l2, recall_tree, recall_rf],
    'F1 Score': [f1_normal, f1_l1, f1_l2, f1_tree, f1_rf]
})

# Display the results to the user

# Display the DataFrame for reference
print(results_df)


                        Model  Accuracy  Precision    Recall  F1 Score
0  Normal Logistic Regression  0.955556   0.957447  0.918367  0.937500
1     L1 Logistic Regression   0.955556   0.938776  0.938776  0.938776
2      L2 Logistic Regression  0.955556   0.957447  0.918367  0.937500
3               Decision Tree  0.903704   0.891304  0.836735  0.863158
4               Random Forest  0.925926   0.914894  0.877551  0.895833


In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

# Hyperparameter tuning for Logistic Regression with GridSearchCV
param_grid_lr = {'C': [0.01, 0.1, 1, 10, 100]}
logreg_l2_tuned = GridSearchCV(LogisticRegression(penalty='l2', solver='lbfgs', random_state=42), param_grid_lr, cv=5)
logreg_l2_tuned.fit(X_train_scaled, y_train)
y_pred_l2_tuned = logreg_l2_tuned.predict(X_test_scaled)

# Hyperparameter tuning for Decision Tree with GridSearchCV
param_grid_tree = {'max_depth': [3, 5, 7, 10], 'min_samples_split': [2, 5, 10]}
tree_tuned = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_tree, cv=5)
tree_tuned.fit(X_train, y_train)
y_pred_tree_tuned = tree_tuned.predict(X_test)

# Random Forest hyperparameter tuning
param_grid_rf = {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7], 'min_samples_split': [2, 5, 10]}
rf_tuned = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5)
rf_tuned.fit(X_train, y_train)
y_pred_rf_tuned = rf_tuned.predict(X_test)

# Gradient Boosting model
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

# Evaluate the tuned models and Gradient Boosting
models = {
    'Tuned L2 Logistic Regression': y_pred_l2_tuned,
    'Tuned Decision Tree': y_pred_tree_tuned,
    'Tuned Random Forest': y_pred_rf_tuned,
    'Gradient Boosting': y_pred_gb
}

results_improved_df = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

for model_name, y_pred in models.items():
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results_improved_df = results_improved_df.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }, ignore_index=True)

# Display the improved results
print(results_improved_df)
print(results_df)

                          Model  Accuracy  Precision    Recall  F1 Score
0  Tuned L2 Logistic Regression  0.955556   0.957447  0.918367  0.937500
1           Tuned Decision Tree  0.903704   0.891304  0.836735  0.863158
2           Tuned Random Forest  0.933333   0.934783  0.877551  0.905263
3             Gradient Boosting  0.911111   0.893617  0.857143  0.875000
                        Model  Accuracy  Precision    Recall  F1 Score
0  Normal Logistic Regression  0.955556   0.957447  0.918367  0.937500
1     L1 Logistic Regression   0.955556   0.938776  0.938776  0.938776
2      L2 Logistic Regression  0.955556   0.957447  0.918367  0.937500
3               Decision Tree  0.903704   0.891304  0.836735  0.863158
4               Random Forest  0.925926   0.914894  0.877551  0.895833


  results_improved_df = results_improved_df.append({
  results_improved_df = results_improved_df.append({
  results_improved_df = results_improved_df.append({
  results_improved_df = results_improved_df.append({
