In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score

In [None]:
# Baseline: score a 100-tree random forest with 5-fold cross-validated accuracy.

# Read the dataset
# NOTE(review): absolute, machine-specific path — the cell only runs where this
# file exists; consider pointing it at a repo-relative data directory.
df = pd.read_csv('/Users/Frankie_C/Documents/GitHub/Academic-Success-Predictor/data/df_vif_cleaned.csv')

# Separate features (x) and target (y).
# The original `axis = 1s` was a syntax error; `columns=` matches the later cells.
x = df.drop(columns=['result_pass'])
y = df['result_pass']

# Define K-fold cross-validation (shuffled, seeded so the folds are reproducible)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Create a RandomForest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
# Fit on the full dataset so `rf_model` is usable after this cell.
# Per the scikit-learn docs, cross_val_score fits its own clone of the estimator
# on each fold, so this fit does not influence the scores computed below.
rf_model.fit(x, y)

# Perform Cross-Validation: one accuracy score per fold
cv_scores = cross_val_score(rf_model, x, y, cv=kf, scoring="accuracy")

# Compute mean accuracy
mean_accuracy = np.mean(cv_scores)
print(f"🗝️ Cross-validation accuracy scores:{cv_scores}")
print(f"🔑 Mean accuracy:{mean_accuracy}")


🗝️ Cross-validation accuracy scores:[0.72307692 0.76153846 0.71538462 0.74615385 0.6744186 ]
🔑 Mean accuracy:0.7241144901610018


In [None]:
# Hyperparameter tuning: exhaustive grid search over random-forest settings,
# scored by 5-fold cross-validated accuracy.
from sklearn.model_selection import GridSearchCV

# Read the dataset
# NOTE(review): absolute, machine-specific path — the cell only runs where this
# file exists; consider pointing it at a repo-relative data directory.
df = pd.read_csv('/Users/Frankie_C/Documents/GitHub/Academic-Success-Predictor/data/df_vif_cleaned.csv')

# Separate features (x) and target (y)
x = df.drop(columns=['result_pass'])
y = df['result_pass']

# Define K-fold cross-validation (shuffled, seeded so the folds are reproducible)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the parameter grid for tuning
# 3 * 3 * 3 * 3 * 2 = 162 combinations, i.e. 810 forest fits over 5 folds.
param_grid = {
    "n_estimators": [100, 200, 300],          # Number of trees
    "max_depth": [10, 20, None],              # Tree depth limit
    "min_samples_split": [2, 5, 10],          # Minimum samples to split a node
    "min_samples_leaf": [1, 2, 4],            # Minimum samples per leaf
    "max_features": ["sqrt", "log2"]          # Features considered per split
}

# Initialize Random Forest model (seeded so every candidate is reproducible)
rf_model = RandomForestClassifier(random_state=42)

# Perform Grid Search with 5-fold cross-validation.
# n_jobs=-1 spreads the candidate fits across all CPU cores; the selected
# parameters and scores are unchanged, only the wall-clock time drops.
grid_search = GridSearchCV(rf_model, param_grid, cv=kf, scoring="accuracy", n_jobs=-1)
grid_search.fit(x, y)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Hyperparameters:", best_params)
print("Best Cross-Validation Accuracy:", best_score)

Best Hyperparameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}
Best Cross-Validation Accuracy: 0.7457364341085271


In [10]:
# Hyperparameter tuning, second pass: a wider grid (more trees, deeper limits,
# larger split/leaf minimums), scored by 5-fold cross-validated accuracy.
from sklearn.model_selection import GridSearchCV

# Read the dataset
# NOTE(review): absolute, machine-specific path — the cell only runs where this
# file exists; consider pointing it at a repo-relative data directory.
df = pd.read_csv('/Users/Frankie_C/Documents/GitHub/Academic-Success-Predictor/data/df_vif_cleaned.csv')

# Separate features (x) and target (y)
x = df.drop(columns=['result_pass'])
y = df['result_pass']

# Define K-fold cross-validation (shuffled, seeded so the folds are reproducible)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the parameter grid for tuning
# 2 * 3 * 4 * 4 * 2 = 192 combinations, i.e. 960 forest fits over 5 folds,
# each with 300 or 500 trees — expensive when run on a single core.
param_grid = {
    "n_estimators": [300, 500],               # Number of trees
    "max_depth": [10, 20, 30],                # Tree depth limit
    "min_samples_split": [2, 5, 10, 15],      # Minimum samples to split a node
    "min_samples_leaf": [1, 2, 4, 6],         # Minimum samples per leaf
    "max_features": ["sqrt", "log2"]          # Features considered per split
}

# Initialize Random Forest model (seeded so every candidate is reproducible)
rf_model = RandomForestClassifier(random_state=42)

# Perform Grid Search with 5-fold cross-validation.
# n_jobs=-1 spreads the candidate fits across all CPU cores; the serial search
# ran long enough to be interrupted by hand. Selected parameters and scores
# are unaffected by the parallelism.
grid_search = GridSearchCV(rf_model, param_grid, cv=kf, scoring="accuracy", n_jobs=-1)
grid_search.fit(x, y)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Hyperparameters:", best_params)
print("Best Cross-Validation Accuracy:", best_score)

KeyboardInterrupt: 