Load Dataset.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Optional for correlation/stats
import numpy as np

# Load the dataset.
# The CSV is expected next to the notebook; change DATA_PATH if it lives elsewhere.
DATA_PATH = "student-mat.csv"

try:
    # sep=None asks pandas to sniff the delimiter (this requires the python
    # engine), so comma- and semicolon-delimited copies of the file both parse
    # into real columns instead of one wide string column.
    # NOTE(review): the UCI copy of this dataset is believed to be ';'-delimited - confirm.
    df = pd.read_csv(DATA_PATH, sep=None, engine="python")
except FileNotFoundError as exc:
    # Re-raise the same exception type with an actionable message.
    raise FileNotFoundError(
        f"Could not find '{DATA_PATH}'. Place the CSV next to this notebook "
        "or point DATA_PATH at its location."
    ) from exc

# Display the first few rows to confirm successful loading
print("Initial DataFrame Head:")
print(df.head())
print("-" * 50)

FileNotFoundError: [Errno 2] No such file or directory: 'student-mat.csv'

Explore & Clean Data.

In [2]:
# Structural overview of the loaded frame: size first, then column types.
print("Dataset Shape (rows, columns):", df.shape)

print("\nDataset dtypes:")
print(df.dtypes)
print("-" * 50)

# Per-column count of missing entries.
print("Missing values count per column:")
print(df.isnull().sum())

# Drop fully duplicated rows. drop_duplicates() returns a new frame, so `df`
# is rebound rather than mutated with inplace=True; downstream cells keep
# using the name `df`.
initial_rows = df.shape[0]
df = df.drop_duplicates()
rows_after_cleaning = df.shape[0]

print(f"\nDuplicates removed: {initial_rows - rows_after_cleaning} rows.")
print(f"New DataFrame Shape: {df.shape}")
print("-" * 50)

NameError: name 'df' is not defined

Analysis Questions.

Average final grade (G3)

In [None]:
# Mean of the final-grade column 'G3', printed with two decimals.
average_g3 = df.loc[:, 'G3'].mean()
print(f"1. Average Final Grade (G3): {average_g3:.2f}")

How many students scored above 15?

In [None]:
# Boolean mask of rows whose final grade is strictly greater than 15;
# summing the mask counts the True entries.
scored_above_15 = df['G3'] > 15
students_above_15 = int(scored_above_15.sum())
print(f"2. Number of students who scored above 15 (G3 > 15): {students_above_15}")

Is study time correlated with performance?

In [None]:
# Correlation coefficient between the 'studytime' column and the final grade 'G3'.
study_time = df['studytime']
final_grade = df['G3']
study_performance_corr = study_time.corr(final_grade)
print(f"3. Correlation between Study Time and Final Grade (G3): {study_performance_corr:.3f}")

Which gender performs better on average?

In [None]:
# Mean final grade for each value of 'sex', ordered from highest to lowest.
mean_g3_by_sex = df.groupby('sex')['G3'].mean()
gender_performance = mean_g3_by_sex.sort_values(ascending=False)
print("\n4. Average G3 by Gender:")
print(gender_performance)

# The series is sorted in descending order, so its first label is the
# group with the highest mean.
better_gender = gender_performance.index[0]
print(f"\nConclusion: {better_gender} performs better on average.")
print("-" * 50)

Visualizations

Histogram of grades (G3)

In [None]:
# Histogram of final grades (10 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=df, x='G3', bins=10, kde=True, color='skyblue', ax=ax)
ax.set_title("Distribution of Final Grades (G3)")
ax.set_xlabel("Final Grade (G3)")
ax.set_ylabel("Frequency")
plt.show()

Scatterplot: study time vs grades (G3)

In [None]:
# Scatter of study time against final grade. The x-axis label lists the four
# study-time codes, so the ticks are pinned to 1-4.
fig, ax = plt.subplots(figsize=(9, 6))
sns.scatterplot(data=df, x='studytime', y='G3', alpha=0.6, color='darkgreen', ax=ax)
ax.set_title("Study Time vs Final Grades (G3)")
ax.set_xlabel("Study Time (1: <2h, 2: 2-5h, 3: 5-10h, 4: >10h)")
ax.set_ylabel("Final Grade (G3)")
ax.set_xticks([1, 2, 3, 4])
ax.grid(True, linestyle='--', alpha=0.6)
plt.show()

Bar chart: male vs female average score (G3)

In [None]:
# Bar chart of the average final grade for each value of 'sex'.
fig, ax = plt.subplots(figsize=(7, 5))

# Columns are referenced by name through data=df instead of mixing Series
# arguments with data=. The palette is tied to hue because newer seaborn
# releases deprecate passing palette without hue; dodge=False keeps a single
# full-width bar per category.
sns.barplot(
    data=df,
    x='sex',
    y='G3',
    hue='sex',
    dodge=False,
    errorbar=None,
    palette='pastel',
    ax=ax,
)

# The hue duplicates the x-axis, so any legend seaborn added is redundant.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

ax.set_title("Average Final Grade (G3) by Gender")
ax.set_xlabel("Gender ('F' = Female, 'M' = Male)")
ax.set_ylabel("Average Final Grade (G3)")
plt.show()