# Student Grade Prediction - Data Exploration

This notebook explores the UCI Student Performance dataset to understand the factors
that influence student academic performance.

## Dataset Information
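- **Source**: UCI Machine Learning Repository (Student Performance dataset; this notebook reads the semicolon-separated file `../data/student_mat.csv`)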
- **Target**: G3 (final grade, 0-20 scale)
- **Features**: Demographic, social, and academic factors

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Set display options
# max_columns=None lifts the column display limit so wide frames are shown
# in full instead of being truncated with "...".
pd.set_option('display.max_columns', None)
# Matplotlib style sheet. The 'seaborn-v0_8-*' names are the ones shipped with
# matplotlib >= 3.6; older releases do not know this name.
plt.style.use('seaborn-v0_8-whitegrid')
%matplotlib inline

## 1. Load the Dataset

In [None]:
# Load student performance data
# The CSV is expected one directory above the notebook, under data/.
data_path = Path('../data/student_mat.csv')

# Fail fast when the file is missing. Merely printing a hint would leave `df`
# undefined (or stale from an earlier kernel session), and every later cell
# would then break with an unrelated NameError or analyse the wrong data.
if not data_path.exists():
    raise FileNotFoundError(
        f"Dataset not found at {data_path.resolve()}.\n"
        "Please download the dataset from UCI ML Repository\n"
        "URL: https://archive.ics.uci.edu/ml/datasets/Student+Performance"
    )

# The file is semicolon-separated rather than comma-separated.
df = pd.read_csv(data_path, sep=';')
print(f"Dataset loaded: {df.shape[0]} students, {df.shape[1]} features")

In [None]:
# Display first few rows
# head() with no argument returns the first five rows; used here as a quick
# sanity check that the file was parsed into the expected columns.
df.head()

In [None]:
# Dataset info
# Prints the index range, each column's non-null count and dtype, and the
# frame's memory usage.
df.info()

## 2. Data Overview

In [None]:
# Statistical summary
# Count, mean, std, min, quartiles and max per column. With default arguments
# describe() only summarises the numeric columns of a mixed-dtype frame.
df.describe()

In [None]:
# Check for missing values
# Per-column count of null entries.
missing = df.isnull().sum()
print("Missing values per column:")

# Report only the columns that actually contain nulls, or say there are none.
if missing.sum() > 0:
    print(missing[missing > 0])
else:
    print("No missing values!")

## 3. Target Variable Analysis (G3 - Final Grade)

In [None]:
# Distribution of final grades
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Computed once; used for the histogram marker and the printed statistics.
mean_grade = df['G3'].mean()

# Histogram
# G3 is an integer grade on the 0-20 scale, i.e. 21 possible values. A plain
# bins=20 cannot give each grade its own bin: when the data spans 0..20 the
# closed last bin merges grades 19 and 20, and on a narrower span the bins
# get fractional widths. Use explicit edges centred on every integer grade.
grade_bins = np.arange(-0.5, 21.5, 1)
axes[0].hist(df['G3'], bins=grade_bins, edgecolor='black', alpha=0.7, color='steelblue')
axes[0].axvline(mean_grade, color='red', linestyle='--', label=f'Mean: {mean_grade:.2f}')
axes[0].axvline(10, color='orange', linestyle='--', label='Passing threshold (10)')
axes[0].set_xlabel('Final Grade (G3)')
axes[0].set_ylabel('Number of Students')
axes[0].set_title('Distribution of Final Grades')
axes[0].legend()

# Box plot
axes[1].boxplot(df['G3'])
axes[1].set_ylabel('Final Grade (G3)')
axes[1].set_title('Box Plot of Final Grades')

plt.tight_layout()
plt.show()

# Statistics
# A grade of 10 or more counts as passing, matching the threshold line above.
print("\nGrade Statistics:")
print(f"Mean: {mean_grade:.2f}")
print(f"Median: {df['G3'].median():.2f}")
print(f"Std Dev: {df['G3'].std():.2f}")
print(f"Passing Rate: {(df['G3'] >= 10).mean()*100:.1f}%")

## 4. Feature Correlations

In [None]:
# Correlation matrix for numeric features
# Restrict to numeric dtypes before computing pairwise correlations.
numeric_cols = df.select_dtypes(include=[np.number]).columns
corr_matrix = df.loc[:, numeric_cols].corr()

# Annotated heatmap on a diverging palette centred at zero, so the sign of each
# correlation is readable from the colour alone.
plt.figure(figsize=(12, 10))
heatmap_ax = sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='RdBu_r',
    center=0,
)
heatmap_ax.set_title('Correlation Matrix of Numeric Features')
plt.tight_layout()
plt.show()

In [None]:
# Top correlations with target (G3)
# Drop G3's self-correlation and order the rest by absolute strength, strongest first.
correlations = (
    corr_matrix['G3']
    .drop('G3')
    .sort_values(key=abs, ascending=False)
)

# Green bars for positive correlations, red for everything else.
colors = (correlations > 0).map({True: 'green', False: 'red'}).tolist()

plt.figure(figsize=(10, 6))
plt.barh(correlations.index, correlations.values, color=colors, alpha=0.7)
plt.axvline(0, color='black', linestyle='-', linewidth=0.5)
plt.xlabel('Correlation with Final Grade (G3)')
plt.title('Feature Correlations with Final Grade')
plt.tight_layout()
plt.show()

# The series is already ordered by |r|, so head() of each sign subset yields
# the (up to) five strongest correlations of that sign.
for header, subset in (
    ("\nTop 5 Positive Correlations:", correlations[correlations > 0]),
    ("\nTop 5 Negative Correlations:", correlations[correlations < 0]),
):
    print(header)
    print(subset.head())

## 5. Academic Factors Analysis

In [None]:
# Grade progression (G1 -> G2 -> G3)
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# One entry per panel: x column, y column, x label, y label, extra scatter kwargs.
# The first panel passes no colour and therefore keeps matplotlib's default.
panels = [
    ('G1', 'G3', 'First Period Grade (G1)', 'Final Grade (G3)', {}),
    ('G2', 'G3', 'Second Period Grade (G2)', 'Final Grade (G3)', {'color': 'green'}),
    ('G1', 'G2', 'First Period Grade (G1)', 'Second Period Grade (G2)', {'color': 'purple'}),
]

for ax, (x_col, y_col, x_label, y_label, scatter_kwargs) in zip(axes, panels):
    ax.scatter(df[x_col], df[y_col], alpha=0.5, **scatter_kwargs)
    # Dashed y = x reference line across the 0-20 grade range.
    ax.plot([0, 20], [0, 20], 'r--', label='Perfect correlation')
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(f'{x_col} vs {y_col}')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Study time and failures impact
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# One entry per panel: grouping column, x-axis label, panel title.
panels = [
    ('studytime', 'Study Time (1=<2hrs, 2=2-5hrs, 3=5-10hrs, 4=>10hrs)', 'Study Time vs Final Grade'),
    ('failures', 'Number of Past Failures', 'Past Failures vs Final Grade'),
]

for ax, (group_col, x_label, panel_title) in zip(axes, panels):
    df.boxplot(column='G3', by=group_col, ax=ax)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Final Grade (G3)')
    ax.set_title(panel_title)
    # A grouped pandas boxplot sets its own figure-level suptitle; blank it
    # after each call so only the per-panel titles remain.
    plt.suptitle('')

plt.tight_layout()
plt.show()

## 6. Demographic Factors Analysis

In [None]:
# Gender and Age analysis
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Grouping column -> human-readable name used for both the x label and the title.
demographics = {'sex': 'Gender', 'age': 'Age'}

for ax, (group_col, display_name) in zip(axes, demographics.items()):
    df.boxplot(column='G3', by=group_col, ax=ax)
    ax.set(
        xlabel=display_name,
        ylabel='Final Grade (G3)',
        title=f'{display_name} vs Final Grade',
    )
    # Blank the figure-level suptitle that the grouped boxplot call sets.
    plt.suptitle('')

plt.tight_layout()
plt.show()

In [None]:
# Parent education impact
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# (education column, parent) pairs; labels and titles are derived from the parent name.
for ax, (edu_col, parent) in zip(axes, [('Medu', 'Mother'), ('Fedu', 'Father')]):
    df.boxplot(column='G3', by=edu_col, ax=ax)
    ax.set_xlabel(f"{parent}'s Education (0=none, 4=higher ed)")
    ax.set_ylabel('Final Grade (G3)')
    ax.set_title(f"{parent}'s Education vs Final Grade")
    # Blank the figure-level suptitle that the grouped boxplot call sets.
    plt.suptitle('')

plt.tight_layout()
plt.show()

## 7. Social Factors Analysis

In [None]:
# Alcohol consumption impact
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Dalc = weekday consumption, Walc = weekend consumption (per the axis labels below).
alcohol_columns = (('Dalc', 'Weekday'), ('Walc', 'Weekend'))

for ax, (alc_col, period) in zip(axes, alcohol_columns):
    df.boxplot(column='G3', by=alc_col, ax=ax)
    ax.set_xlabel(f'{period} Alcohol Consumption (1=low, 5=high)')
    ax.set_ylabel('Final Grade (G3)')
    ax.set_title(f'{period} Alcohol vs Final Grade')
    # Blank the figure-level suptitle that the grouped boxplot call sets.
    plt.suptitle('')

plt.tight_layout()
plt.show()

In [None]:
# Absences impact
plt.figure(figsize=(10, 6))
plt.scatter(df['absences'], df['G3'], alpha=0.5)

# Degree-1 least-squares fit of G3 on absences, drawn as a dashed trend line.
z = np.polyfit(df['absences'], df['G3'], 1)
p = np.poly1d(z)
# Evaluate the fit on sorted x values so the line is drawn left to right.
absences_sorted = df['absences'].sort_values()
plt.plot(absences_sorted, p(absences_sorted), 'r--', linewidth=2)

plt.xlabel('Number of Absences')
plt.ylabel('Final Grade (G3)')
plt.title('Absences vs Final Grade')
plt.show()

absences_corr = df['absences'].corr(df['G3'])
print(f"Correlation between absences and G3: {absences_corr:.3f}")

## 8. Key Insights Summary

In [None]:
# Assemble the whole report as a list of lines and emit it with a single print.
# Entries starting with "\n" produce the blank line before each section.
banner = "=" * 60
final_grades = df['G3']
# Column of correlations against G3; relies on corr_matrix from the correlation cell.
g3_correlations = corr_matrix['G3']

summary_lines = [
    banner,
    "KEY INSIGHTS FROM EXPLORATORY DATA ANALYSIS",
    banner,
    # Section 1: figures computed from the data (passing means G3 >= 10).
    "\n1. TARGET VARIABLE (G3 - Final Grade):",
    f"   - Mean grade: {final_grades.mean():.2f}",
    f"   - Passing rate (>=10): {(final_grades >= 10).mean()*100:.1f}%",
    f"   - Failing students: {(final_grades < 10).sum()} students",
    # Section 2: correlations read from the matrix.
    "\n2. STRONGEST PREDICTORS:",
    f"   - G2 (2nd period grade): r = {g3_correlations['G2']:.3f}",
    f"   - G1 (1st period grade): r = {g3_correlations['G1']:.3f}",
    f"   - Failures: r = {g3_correlations['failures']:.3f}",
    # Sections 3 and 4: fixed narrative text, not derived from the data in this cell.
    "\n3. KEY OBSERVATIONS:",
    "   - Previous grades (G1, G2) are strong predictors of final grade",
    "   - Past failures negatively impact performance",
    "   - Higher study time correlates with better grades",
    "   - Parent education level has positive correlation",
    "   - Alcohol consumption shows negative correlation",
    "\n4. RECOMMENDATIONS FOR MODELING:",
    "   - Use G1 and G2 as key features",
    "   - Create engineered features (study efficiency, family education score)",
    "   - Consider at-risk student identification",
    "   - Use regression models for grade prediction",
]

print("\n".join(summary_lines))