<a href="https://colab.research.google.com/github/OmarAbdelaziz0/StudentGradesAndPrograms/blob/main/OneScriptStudentsandPrograms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive and import necessary libraries
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Mount the drive
drive.mount('/content/drive')

# File path for the dataset (update to match your Google Drive location)
file_path = '/content/drive/MyDrive/StudentGradesAndPrograms.csv'

# Load the dataset.
# Failures are re-raised instead of calling exit(): inside a notebook, exit()
# asks the kernel to shut down, which discards all state and hides the
# traceback. Raising stops the cell and keeps the error visible. Unexpected
# exception types are deliberately not caught so they surface unchanged.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. NOTE(review): the saved cell output echoes this
# read_csv line as a warning - presumably a mixed-type DtypeWarning; confirm.
try:
    df = pd.read_csv(file_path, low_memory=False)
except FileNotFoundError as err:
    raise FileNotFoundError(f"Error: File not found at {file_path}") from err
except pd.errors.ParserError as err:
    raise pd.errors.ParserError(
        f"Error: Could not parse the CSV file at {file_path}. Check the file format."
    ) from err
print("Dataset successfully loaded.")

# Display dataset information
print("First 5 rows of the dataset:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() would emit a stray "None" line after the report.
df.info()
print("\nDataset description:")
print(df.describe())
print("\nMissing values:\n", df.isnull().sum())

# Preprocessing: Handle missing values and duplicates
df = df.dropna().drop_duplicates()

# Validate the target up front so a misconfigured name fails fast.
# Raise instead of exit(): in a notebook exit() shuts the kernel down.
target_column = 'gradePercentage'  # Update to match the actual target column name
if target_column not in df.columns:
    raise KeyError(f"Error: Target column '{target_column}' not found in dataset.")
if not pd.api.types.is_numeric_dtype(df[target_column]):
    raise TypeError(f"Target column '{target_column}' must be numeric for regression.")

# Row identifiers are not features: one-hot encoding them would create one
# column per distinct ID, which is both meaningless for generalisation and
# prohibitively wide on a dataset of this size.
id_columns = ['student_ID']  # Update if the dataset has other identifier columns
df = df.drop(columns=id_columns, errors='ignore')

# Remember which *feature* columns are numeric before encoding, so that only
# those get rescaled. The target is left in its original units so that the
# reported errors and the plot axes are expressed as grade values.
numerical_cols = df.select_dtypes(include=['number']).columns.drop(target_column)

# One-Hot Encoding for categorical features
df = pd.get_dummies(df, drop_first=True)

# Separate features (X) and target (y)
X = df.drop(target_column, axis=1)
y = df[target_column]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize numerical features. The scaler is fit on the training split only
# and then applied to the test split; fitting on the full dataset would leak
# test-set min/max values into training.
scaler = MinMaxScaler()
if len(numerical_cols) > 0:  # MinMaxScaler rejects an empty column selection
    float_cast = {col: 'float64' for col in numerical_cols}
    # astype returns fresh frames, so the assignments below never write into
    # a view of X and integer columns can hold the scaled float values.
    X_train = X_train.astype(float_cast)
    X_test = X_test.astype(float_cast)
    X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
    X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Baseline model: ordinary least squares fitted on the training split and
# scored on the held-out split.
print("\n--- Linear Regression Model ---")
lr_model = LinearRegression().fit(X_train, y_train)  # fit() returns the estimator itself
y_pred_lr = lr_model.predict(X_test)
mse_lr, r2_lr = mean_squared_error(y_test, y_pred_lr), r2_score(y_test, y_pred_lr)
print(
    f"Linear Regression - Mean Squared Error: {mse_lr}",
    f"Linear Regression - R-squared: {r2_lr}",
    sep="\n",
)

# Random Forest Regressor with Hyperparameter Tuning
# Exhaustive search over 3 x 3 x 3 x 3 = 81 parameter combinations, each
# evaluated with 5-fold cross-validation on the training split.
print("\n--- Random Forest Regressor with Hyperparameter Tuning ---")
rf_model = RandomForestRegressor(random_state=42)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',  # scores are negated MSE, so higher is better
    n_jobs=-1,
).fit(X_train, y_train)

# Best model and evaluation on the held-out split
best_rf_model = grid_search.best_estimator_
y_pred_rf = best_rf_model.predict(X_test)
mse_rf, r2_rf = mean_squared_error(y_test, y_pred_rf), r2_score(y_test, y_pred_rf)
print(
    f"Best Hyperparameters: {grid_search.best_params_}",
    f"Random Forest - Mean Squared Error: {mse_rf}",
    f"Random Forest - R-squared: {r2_rf}",
    sep="\n",
)

# Visualization: Actual vs Predicted for Random Forest
# Points on the dashed red diagonal are predictions that match the actual value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred_rf, ax=ax)
ax.set_xlabel("Actual Grade Percentage")
ax.set_ylabel("Predicted Grade Percentage")
ax.set_title("Actual vs Predicted Grade Percentage (Random Forest)")
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, color='red', linestyle='--')
plt.show()

# Residual Plot
# The model residuals are computed explicitly and plotted as-is against the
# predictions. sns.residplot is not used here: it fits its own regression of
# y on x and plots the residuals of *that* fit, so feeding it precomputed
# residuals would show residuals-of-residuals rather than the model's errors.
# regplot(lowess=True) keeps the LOWESS trend line (requires statsmodels).
residuals = y_test - y_pred_rf
plt.figure(figsize=(8, 6))
sns.regplot(x=y_pred_rf, y=residuals, lowess=True, line_kws={'color': 'red'})
plt.axhline(0, color='gray', linestyle=':')  # zero-error reference line
plt.xlabel("Predicted Grade Percentage")
plt.ylabel("Residuals")
plt.title("Residual Plot (Random Forest)")
plt.show()

# Feature Importance
# Pair each feature column with the tuned forest's importance score and
# draw them as horizontal bars, most important first.
feature_importances = best_rf_model.feature_importances_
importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)
plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df, x='Importance', y='Feature')
plt.title('Feature Importance (Random Forest)')
plt.show()


Mounted at /content/drive


  df = pd.read_csv(file_path)


Dataset successfully loaded.
First 5 rows of the dataset:
  schoolyear gradeLevel classPeriod classType        schoolName  \
0  2024-2025         07           1       ELE  West Junior High   
1  2024-2025         07           1       ELE  West Junior High   
2  2024-2025         07           1       ELE  West Junior High   
3  2024-2025         07           1       ELE  West Junior High   
4  2024-2025         07           1       ELE  West Junior High   

   gradePercentage avid sped migrant ell student_ID  
0           2000.0    Y    N       N   N  0HRJHI993  
1           2000.0    N    N       N   N  CKN322II4  
2           1950.0    N    N       N   N  V523OZUH8  
3           1850.0    Y    N       N   N  OJDYS3434  
4           1500.0    N    N       Y   Y  49RSM3UF6  

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200994 entries, 0 to 200993
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   ---