# Student Academic Trends — Full Analysis

This notebook performs a complete analysis of the Kaggle dataset **Analyzing Student Academic Trends**. It is designed to run on Kaggle (or locally after downloading the CSV). Steps included:
1. Import libraries
2. Load dataset
3. Data inspection & cleaning
4. Exploratory Data Analysis (9 combined plots using Matplotlib & Seaborn)
5. Train multiple ML models (Random Forest, Ridge, Lasso, Decision Tree, KNN, Linear Regression, Gradient Boosting, SVR)
6. Model evaluation and comparison (RMSE, MAE, R2) with 4 combined plots (barplot, lineplot, boxplot, ROC-like curves)

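> **Note:** The notebook first tries to load the dataset from Kaggle's input folder (`DATA_PATH`). If that fails, it looks for `analyzing_student_academic_trends.csv` in the current working directory, and as a last resort it generates a 200-row synthetic sample so the remaining cells can still run. If you run this locally, set `DATA_PATH` to the path of your CSV file.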

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, roc_curve, auc
from sklearn.preprocessing import StandardScaler
# Notebook-wide plotting defaults: seaborn's white-grid theme and a 10x6 inch default figure.
sns.set_theme(style='whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})


In [None]:
DATA_PATH = '/kaggle/input/analyzing-student-academic-trends/analyzing_student_academic_trends.csv'


def _make_synthetic_sample(n_rows, seed=42):
    """Build a reproducible stand-in dataset with the same columns the analysis expects.

    The four predictors are drawn uniformly; ``exam_score`` is a linear mix of them
    plus Gaussian noise, clipped to the 0-100 range.
    """
    np.random.seed(seed)
    # The draw order (four uniforms, then the noise) determines the generated values.
    sample = pd.DataFrame({
        'hours_studied': np.random.uniform(0, 10, n_rows),
        'sleep_hours': np.random.uniform(4, 10, n_rows),
        'attendance_percent': np.random.uniform(50, 100, n_rows),
        'previous_scores': np.random.uniform(40, 100, n_rows)
    })
    raw_score = (5*sample['hours_studied'] + 0.3*sample['attendance_percent'] + 0.4*sample['previous_scores']
                 + 1.5*(sample['sleep_hours']-7) + np.random.normal(0, 5, n_rows))
    sample['exam_score'] = raw_score.clip(0, 100)
    return sample.reset_index(drop=True)


# Source priority: Kaggle input folder -> CSV in the working directory -> synthetic sample.
try:
    df = pd.read_csv(DATA_PATH)
except Exception as kaggle_error:
    print('Could not load from Kaggle path — falling back to attempting local file or creating synthetic sample. Error:', kaggle_error)
    try:
        df = pd.read_csv('analyzing_student_academic_trends.csv')
    except Exception:
        print('Local file not found — creating a synthetic sample (200 rows) for demonstration')
        df = _make_synthetic_sample(200)
    else:
        print('Loaded dataset from local working directory')
else:
    print('Loaded dataset from Kaggle input')

print('\nDataset shape:', df.shape)
display(df.head())


## Quick Data Inspection & Cleaning
- Check dtypes, missing values, and basic statistics, then verify that every column required by the analysis is present


In [None]:
# Overview of the loaded frame: column types, null counts per column, summary statistics.
print(df.dtypes)
print('\nMissing values:')
print(df.isnull().sum())
display(df.describe())

# Every later cell relies on these columns, so stop here if any is absent.
required = ['hours_studied','sleep_hours','attendance_percent','previous_scores','exam_score']
absent = [name for name in required if name not in df.columns]
if absent:
    # Report the first missing column, in the order listed in `required`.
    raise ValueError(f"Required column '{absent[0]}' not found in dataset.")


## Exploratory Data Analysis (9 combined plots)


In [None]:
# Bucket study and sleep hours into ordered categories for the grouped panels.
# The outer bin edges are open-ended so that values outside the nominal ranges
# (0-10 h studied, ~4-10 h slept) fall into the first/last bucket instead of becoming
# NaN and silently disappearing from the categorical plots. Values inside the nominal
# ranges get exactly the same labels as with closed edges.
df['study_cat'] = pd.cut(df['hours_studied'], bins=[-np.inf, 2.5, 5, 7.5, np.inf],
                         labels=['Very Low', 'Low', 'Medium', 'High'])
df['sleep_cat'] = pd.cut(df['sleep_hours'], bins=[-np.inf, 6, 7.5, np.inf],
                         labels=['Short', 'Normal', 'Long'])

fig, axes = plt.subplots(3, 3, figsize=(18, 14))

# Row 1: each numeric predictor against the exam score, with a fitted regression line.
scatter_panels = [
    ('hours_studied', 'Hours Studied vs Exam Score (scatter + reg)'),
    ('attendance_percent', 'Attendance vs Exam Score (scatter + reg)'),
    ('previous_scores', 'Previous Scores vs Exam Score (scatter + reg)'),
]
for ax, (feature, title) in zip(axes[0], scatter_panels):
    sns.scatterplot(x=feature, y='exam_score', data=df, ax=ax)
    sns.regplot(x=feature, y='exam_score', data=df, scatter=False, ax=ax, color='red')
    ax.set_title(title)

# Row 2, first two panels: histogram + KDE with a thin box plot overlaid on a twin axis.
dist_panels = [
    ('hours_studied', 'Hours Studied: Hist + Box'),
    ('exam_score', 'Exam Score: Hist + Box'),
]
for ax, (column, title) in zip(axes[1, :2], dist_panels):
    sns.histplot(df[column], kde=True, ax=ax)
    ax2 = ax.twinx()
    sns.boxplot(x=column, data=df, ax=ax2, width=0.15)
    ax.set_title(title)
    ax2.set_yticks([])  # the twin axis only hosts the box plot; its ticks carry no meaning

# Row 2, last panel: mean exam score per study category, ordered by that mean.
# observed=False keeps empty categories (the historical default) and makes the choice
# explicit, which avoids the pandas FutureWarning about the changing default.
order = df.groupby('study_cat', observed=False)['exam_score'].mean().sort_values().index
sns.barplot(x='study_cat', y='exam_score', data=df, order=order, ax=axes[1,2])
sns.stripplot(x='study_cat', y='exam_score', data=df, order=order, ax=axes[1,2], color='black', alpha=0.5)
axes[1,2].set_title('Avg Exam by Study Category (bar + strip)')

# Row 3: mean score per rounded study hour, drawn over the raw points.
avg_by_hours = df.groupby(df['hours_studied'].round())['exam_score'].mean()
sns.lineplot(x=avg_by_hours.index, y=avg_by_hours.values, marker='o', ax=axes[2,0])
sns.scatterplot(x='hours_studied', y='exam_score', data=df, alpha=0.3, ax=axes[2,0])
axes[2,0].set_title('Avg Exam by Rounded Hours (line + scatter)')

# Row 3: score distribution per sleep category.
sns.violinplot(x='sleep_cat', y='exam_score', data=df, ax=axes[2,1])
sns.swarmplot(x='sleep_cat', y='exam_score', data=df, ax=axes[2,1], color='k', alpha=0.6)
axes[2,1].set_title('Exam Score by Sleep Category (violin + swarm)')

# Row 3: pairwise correlations between the numeric columns.
corr = df[['hours_studied','sleep_hours','attendance_percent','previous_scores','exam_score']].corr()
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', ax=axes[2,2])
axes[2,2].set_title('Correlation Matrix')

plt.tight_layout()
plt.show()


## Machine Learning Models
Train multiple models on selected features and evaluate performance.

In [None]:
feature_cols = ['hours_studied','sleep_hours','attendance_percent','previous_scores']
target_col = 'exam_score'

# The estimators below cannot be fitted on NaN, and nothing earlier in the notebook
# removes missing values, so drop incomplete rows here and report how many were lost.
# On a dataset without missing values this leaves X and y exactly as before.
model_df = df[feature_cols + [target_col]].dropna()
n_dropped = len(df) - len(model_df)
if n_dropped:
    print(f'Dropped {n_dropped} row(s) with missing feature/target values before modelling')

X = model_df[feature_cols]
y = model_df[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply it to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(random_state=42),
    'Lasso': Lasso(random_state=42),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'KNN': KNeighborsRegressor(),
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
    'SVR': SVR()
}

# Only these models are trained on the standardised features; all others see raw values.
scaled_models = ('SVR', 'KNN')

metrics = {}  # model name -> {'MSE', 'MAE', 'RMSE', 'R2'} on the test split
preds = {}    # model name -> test-split predictions, reused by the comparison plots
for name, model in models.items():
    if name in scaled_models:
        train_features, test_features = X_train_scaled, X_test_scaled
    else:
        train_features, test_features = X_train, X_test
    model.fit(train_features, y_train)
    p = model.predict(test_features)
    preds[name] = p
    mse = mean_squared_error(y_test, p)
    mae = mean_absolute_error(y_test, p)
    r2 = r2_score(y_test, p)
    metrics[name] = {'MSE': mse, 'MAE': mae, 'RMSE': np.sqrt(mse), 'R2': r2}

# One row per model, best (lowest RMSE) first.
metrics_df = pd.DataFrame(metrics).T.sort_values('RMSE')
display(metrics_df)


## Model Comparison — Combined Plots (bar, line, box, ROC-like)


In [None]:
model_names = list(metrics.keys())
r2_vals = [metrics[m]['R2'] for m in model_names]
rmse_vals = [metrics[m]['RMSE'] for m in model_names]

# Long-format frame of per-sample absolute errors: one row per (model, test sample).
abs_err_list = [
    pd.DataFrame({'Model': m, 'AbsError': np.abs(y_test - preds[m])})
    for m in model_names
]
abs_err_df = pd.concat(abs_err_list, ignore_index=True)

# "Pass" is defined as scoring at or above the mean test-set exam score.
threshold = y_test.mean()
passed = y_test >= threshold

fig, axes = plt.subplots(2,2, figsize=(16,12))
sns.barplot(x=r2_vals, y=model_names, ax=axes[0,0])
axes[0,0].set_title('R2 by Model')

sns.lineplot(x=model_names, y=rmse_vals, marker='o', ax=axes[0,1])
axes[0,1].set_title('RMSE by Model')
# Rotate the existing tick labels rather than overwriting them with set_xticklabels,
# which needs fixed tick positions; this matches the box-plot panel below.
axes[0,1].tick_params(axis='x', rotation=45)

sns.boxplot(x='Model', y='AbsError', data=abs_err_df, ax=axes[1,0])
axes[1,0].set_title('Absolute Error Distribution')
axes[1,0].tick_params(axis='x', rotation=45)

for m in model_names:
    # Pass the continuous predictions as the ranking score. Thresholding them first
    # would leave roc_curve with a single operating point, so every "curve" would be
    # two straight segments rather than a sweep over all decision thresholds.
    fpr, tpr, _ = roc_curve(passed, preds[m])
    roc_auc = auc(fpr, tpr)
    axes[1,1].plot(fpr, tpr, label=f"{m} (AUC={roc_auc:.2f})")
axes[1,1].plot([0,1],[0,1],'k--')  # chance-level diagonal
axes[1,1].set_title('ROC-like Curves (pass = score >= mean)')
axes[1,1].set_xlabel('FPR')
axes[1,1].set_ylabel('TPR')
axes[1,1].legend(loc='lower right')

plt.suptitle('Model Comparison - 4 Plots Combined', fontsize=16)
plt.tight_layout(rect=[0,0,1,0.96])  # leave room at the top for the suptitle
plt.show()


### Optional: save the best model
Uncomment and run if you want to save the best model pipeline locally.

In [None]:
# Optional persistence step, left disabled. `metrics_df` is sorted by RMSE in ascending
# order, so its first index entry is the model with the lowest test RMSE.
# NOTE: the saved `scaler` only applies when the best model is 'SVR' or 'KNN'; the
# other models were fitted on the unscaled features and must be fed raw inputs.
# import joblib
# best = metrics_df.index[0]
# joblib.dump({'model': models[best], 'scaler': scaler}, 'best_model_pipeline.joblib')
# print('Saved best_model_pipeline.joblib')
