In [2]:
# Let's start by loading the uploaded file and inspecting its contents to understand the structure of the data.
import pandas as pd

# Load the uploaded file
data = pd.read_excel('C:/Users/Prachi/Documents/Data Science Masters/Thesis/student_score_per_topic_with_total.xlsx')


# Display the first few rows of the dataset to understand its structure
data.head()


Unnamed: 0,Prolog,Search,Adversarial_Search,Constraint_Satisfaction_Or_Propagation,Logic,Knowledge_Representation,Planning,Total,Total_percentage,Pass
0,1.5,5.5,0.0,5.0,2.0,4.5,2.0,20.5,24.117647,0
1,0.0,2.0,2.0,0.5,0.0,0.0,0.0,4.5,5.294118,0
2,0.0,6.5,4.0,3.0,5.0,5.0,3.5,27.0,31.764706,0
3,0.0,9.0,5.0,7.0,7.0,2.5,8.0,38.5,45.294118,0
4,2.5,8.0,5.0,11.5,4.0,7.0,7.5,45.5,53.529412,1


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Step 1: Data Preparation
# Define features (all topic scores) and target (Total score)
features = ['Prolog', 'Search', 'Adversarial_Search', 'Constraint_Satisfaction_Or_Propagation', 'Logic', 'Knowledge_Representation', 'Planning']
X = data[features]
y = data['Total']

# Split the data into training and test sets (70% training, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 2: Model Development

# Linear Regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)

# Decision Tree Regressor
tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)

# Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Step 3: Model Evaluation
# Calculate evaluation metrics for each model

# Linear Regression Metrics
mse_linear = mean_squared_error(y_test, y_pred_linear)
r2_linear = r2_score(y_test, y_pred_linear)

# Decision Tree Regressor Metrics
mse_tree = mean_squared_error(y_test, y_pred_tree)
r2_tree = r2_score(y_test, y_pred_tree)

# Random Forest Regressor Metrics
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Collect results in a DataFrame to display
results_regression_df = pd.DataFrame({
    'Model': ['Linear Regression', 'Decision Tree Regressor', 'Random Forest Regressor'],
    'Mean Squared Error (MSE)': [mse_linear, mse_tree, mse_rf],
    'R-squared (R2)': [r2_linear, r2_tree, r2_rf]
})


# Display the DataFrame for reference
results_regression_df


Unnamed: 0,Model,Mean Squared Error (MSE),R-squared (R2)
0,Linear Regression,8.908869e-29,1.0
1,Decision Tree Regressor,20.58889,0.924718
2,Random Forest Regressor,6.280457,0.977036


In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import pandas as pd

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train Linear Regression on the standardized features
linear_model_temp = LinearRegression()
linear_model_temp.fit(X_scaled, y)

# Get the absolute values of the coefficients to determine importance
feature_importance = abs(linear_model_temp.coef_)

# Create a DataFrame to rank the features
feature_importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

print(feature_importance_df)


                                  Feature  Importance
1                                  Search    3.832063
4                                   Logic    3.777177
6                                Planning    3.426629
5                Knowledge_Representation    3.356420
3  Constraint_Satisfaction_Or_Propagation    3.252547
2                      Adversarial_Search    1.571657
0                                  Prolog    1.395366


In [19]:
# Step 1: Feature Selection - Choosing the top 3 features based on Linear Regression coefficients
# Fit a Linear Regression model to determine feature importance
linear_model_temp = LinearRegression()
linear_model_temp.fit(X_train, y_train)

# Get the absolute values of the coefficients to determine importance
feature_importance = abs(linear_model_temp.coef_)

# Create a DataFrame to rank the features
feature_importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

# Select the top 3 features
top_3_features = feature_importance_df['Feature'].head(3).tolist()

# Step 2: Data Preparation with Top 3 Features
# Define features and target with the top 3 features
X_top_3 = data[top_3_features]

# Split the data into training and test sets (70% training, 30% test)
X_train_top_3, X_test_top_3, y_train_top_3, y_test_top_3 = train_test_split(X_top_3, y, test_size=0.3, random_state=42)

# Step 3: Model Development with Top 3 Features

# Linear Regression with Top 3 Features
linear_model_top_3 = LinearRegression()
linear_model_top_3.fit(X_train_top_3, y_train_top_3)
y_pred_linear_top_3 = linear_model_top_3.predict(X_test_top_3)

# Decision Tree Regressor with Top 3 Features
tree_model_top_3 = DecisionTreeRegressor(random_state=42)
tree_model_top_3.fit(X_train_top_3, y_train_top_3)
y_pred_tree_top_3 = tree_model_top_3.predict(X_test_top_3)

# Random Forest Regressor with Top 3 Features
rf_model_top_3 = RandomForestRegressor(random_state=42)
rf_model_top_3.fit(X_train_top_3, y_train_top_3)
y_pred_rf_top_3 = rf_model_top_3.predict(X_test_top_3)

# Step 4: Model Evaluation with Top 3 Features
# Calculate evaluation metrics for each model

# Linear Regression Metrics with Top 3 Features
mse_linear_top_3 = mean_squared_error(y_test_top_3, y_pred_linear_top_3)
r2_linear_top_3 = r2_score(y_test_top_3, y_pred_linear_top_3)

# Decision Tree Regressor Metrics with Top 3 Features
mse_tree_top_3 = mean_squared_error(y_test_top_3, y_pred_tree_top_3)
r2_tree_top_3 = r2_score(y_test_top_3, y_pred_tree_top_3)

# Random Forest Regressor Metrics with Top 3 Features
mse_rf_top_3 = mean_squared_error(y_test_top_3, y_pred_rf_top_3)
r2_rf_top_3 = r2_score(y_test_top_3, y_pred_rf_top_3)

# Collect results in a DataFrame to display
results_regression_top_3_df = pd.DataFrame({
    'Model': ['Linear Regression (Top 3 Features)', 'Decision Tree Regressor (Top 3 Features)', 'Random Forest Regressor (Top 3 Features)'],
    'Mean Squared Error (MSE)': [mse_linear_top_3, mse_tree_top_3, mse_rf_top_3],
    'R-squared (R2)': [r2_linear_top_3, r2_tree_top_3, r2_rf_top_3]
})


# Display the DataFrame for reference
print(results_regression_top_3_df)


                                      Model  Mean Squared Error (MSE)  \
0        Linear Regression (Top 3 Features)                 17.084974   
1  Decision Tree Regressor (Top 3 Features)                 40.848148   
2  Random Forest Regressor (Top 3 Features)                 24.414088   

   R-squared (R2)  
0        0.937530  
1        0.850641  
2        0.910731  


In [20]:
from sklearn.model_selection import GridSearchCV

# Step 1: Define the hyperparameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Step 2: Set up GridSearchCV for Random Forest Regressor
grid_search_rf = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid_rf,
    cv=5,
    n_jobs=-1,
    verbose=2
)

# Step 3: Fit the model with the top 3 features
grid_search_rf.fit(X_train_top_3, y_train_top_3)

# Step 4: Get the best parameters and evaluate the best model on the test set
best_rf_model = grid_search_rf.best_estimator_
y_pred_rf_tuned = best_rf_model.predict(X_test_top_3)

# Calculate evaluation metrics for the tuned Random Forest model
mse_rf_tuned = mean_squared_error(y_test_top_3, y_pred_rf_tuned)
r2_rf_tuned = r2_score(y_test_top_3, y_pred_rf_tuned)

# Display the best parameters and model performance
best_params = grid_search_rf.best_params_
results_rf_tuned_df = pd.DataFrame({
    'Model': ['Tuned Random Forest Regressor (Top 3 Features)'],
    'Mean Squared Error (MSE)': [mse_rf_tuned],
    'R-squared (R2)': [r2_rf_tuned]
})


# Display the best parameters and the DataFrame for reference
print(best_params, results_rf_tuned_df)


Fitting 5 folds for each of 108 candidates, totalling 540 fits
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}                                             Model  Mean Squared Error (MSE)  \
0  Tuned Random Forest Regressor (Top 3 Features)                 24.972342   

   R-squared (R2)  
0         0.90869  


In [21]:
# Re-import necessary libraries since the Python environment has restarted
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


# Define features and target (top 3 features from previous analysis)
top_3_features = ['Search', 'Knowledge_Representation', 'Logic']
X_top_3 = data[top_3_features]
y = data['Total']

# Split the data into training and test sets
X_train_top_3, X_test_top_3, y_train_top_3, y_test_top_3 = train_test_split(X_top_3, y, test_size=0.3, random_state=42)

# Define the hyperparameter distribution for Random Forest
param_dist_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Set up RandomizedSearchCV for Random Forest Regressor
random_search_rf = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_dist_rf,
    n_iter=20,  # Number of parameter settings that are sampled
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42
)

# Fit the model with the top 3 features
random_search_rf.fit(X_train_top_3, y_train_top_3)

# Get the best parameters and evaluate the best model on the test set
best_rf_model_random = random_search_rf.best_estimator_
y_pred_rf_random_tuned = best_rf_model_random.predict(X_test_top_3)

# Calculate evaluation metrics for the tuned Random Forest model
mse_rf_random_tuned = mean_squared_error(y_test_top_3, y_pred_rf_random_tuned)
r2_rf_random_tuned = r2_score(y_test_top_3, y_pred_rf_random_tuned)

# Display the best parameters and model performance
best_params_random = random_search_rf.best_params_
results_rf_random_tuned_df = pd.DataFrame({
    'Model': ['Randomized Tuned Random Forest Regressor (Top 3 Features)'],
    'Mean Squared Error (MSE)': [mse_rf_random_tuned],
    'R-squared (R2)': [r2_rf_random_tuned]
})


# Display the best parameters and the DataFrame for reference
print(best_params_random, results_rf_random_tuned_df)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
{'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 10, 'bootstrap': True}                                                Model  \
0  Randomized Tuned Random Forest Regressor (Top ...   

   Mean Squared Error (MSE)  R-squared (R2)  
0                 25.605884        0.906373  


In [22]:
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np

# Step 1: Train Gradient Boosting Regressor with default parameters for comparison
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train_top_3, y_train_top_3)
y_pred_gb = gb_model.predict(X_test_top_3)

# Step 2: Evaluate the Gradient Boosting Regressor
mse_gb = mean_squared_error(y_test_top_3, y_pred_gb)
r2_gb = r2_score(y_test_top_3, y_pred_gb)

# Step 3: Train Random Forest with all available features for comparison
X_all_features = data[features]
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(X_all_features, y, test_size=0.3, random_state=42)

rf_model_all_features = RandomForestRegressor(**best_params, random_state=42)
rf_model_all_features.fit(X_train_all, y_train_all)
y_pred_rf_all_features = rf_model_all_features.predict(X_test_all)

# Evaluate the Random Forest Regressor trained with all features
mse_rf_all_features = mean_squared_error(y_test_all, y_pred_rf_all_features)
r2_rf_all_features = r2_score(y_test_all, y_pred_rf_all_features)

# Step 4: Collect results in a DataFrame to display
results_improvement_df = pd.DataFrame({
    'Model': [
        'Tuned Random Forest Regressor (Top 3 Features)',
        'Gradient Boosting Regressor (Top 3 Features)',
        'Tuned Random Forest Regressor (All Features)'
    ],
    'Mean Squared Error (MSE)': [mse_rf_tuned, mse_gb, mse_rf_all_features],
    'R-squared (R2)': [r2_rf_tuned, r2_gb, r2_rf_all_features]
})

# Display the results to the user

# Display the DataFrame for reference
print(results_improvement_df)


                                            Model  Mean Squared Error (MSE)  \
0  Tuned Random Forest Regressor (Top 3 Features)                 24.972342   
1    Gradient Boosting Regressor (Top 3 Features)                 25.283464   
2    Tuned Random Forest Regressor (All Features)                  8.687915   

   R-squared (R2)  
0        0.908690  
1        0.907552  
2        0.968233  


In [15]:
linear_model_temp = LinearRegression()
linear_model_temp.fit(X_train, y_train)

# Get the absolute values of the coefficients to determine importance
feature_importance = linear_model_temp.coef_

# Create a DataFrame to rank the features
feature_importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

print(feature_importance_df)

                                  Feature  Importance
6                                Planning         1.0
4                                   Logic         1.0
1                                  Search         1.0
2                      Adversarial_Search         1.0
5                Knowledge_Representation         1.0
3  Constraint_Satisfaction_Or_Propagation         1.0
0                                  Prolog         1.0
