In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

In [2]:

df = pd.read_excel("student_habits_performance.xlsx")

In [3]:
# NUmber of row in the missing in the columns.Answer provided in the  ascending order.
student_habits_performance = df.isnull().sum().sort_values(ascending=False)

In [4]:
# Ascending=False = sort from most nulls to fewest
# Ascending=True = sort from fewest nulls to most
print(student_habits_performance)

parental_education_level         91
student_id                        0
age                               0
gender                            0
study_hours_per_day               0
social_media_hours                0
netflix_hours                     0
part_time_job                     0
attendance_percentage             0
sleep_hours                       0
diet_quality                      0
exercise_frequency                0
internet_quality                  0
mental_health_rating              0
extracurricular_participation     0
exam_score                        0
dtype: int64


In [5]:
# The % of missing values in each column, shown to 2 decimal places
for column in df.columns:
    percentage = df[column].isnull().mean() * 100
    print(f"{column}: {percentage:.2f}%")

student_id: 0.00%
age: 0.00%
gender: 0.00%
study_hours_per_day: 0.00%
social_media_hours: 0.00%
netflix_hours: 0.00%
part_time_job: 0.00%
attendance_percentage: 0.00%
sleep_hours: 0.00%
diet_quality: 0.00%
exercise_frequency: 0.00%
parental_education_level: 9.10%
internet_quality: 0.00%
mental_health_rating: 0.00%
extracurricular_participation: 0.00%
exam_score: 0.00%


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   student_id                     1000 non-null   object 
 1   age                            1000 non-null   int64  
 2   gender                         1000 non-null   object 
 3   study_hours_per_day            1000 non-null   float64
 4   social_media_hours             1000 non-null   float64
 5   netflix_hours                  1000 non-null   float64
 6   part_time_job                  1000 non-null   object 
 7   attendance_percentage          1000 non-null   float64
 8   sleep_hours                    1000 non-null   float64
 9   diet_quality                   1000 non-null   object 
 10  exercise_frequency             1000 non-null   int64  
 11  parental_education_level       909 non-null    object 
 12  internet_quality               1000 non-null   ob

In [7]:
# Removing 'parental_education_level' because it may introduce bias,
# adds pressure on students, and has 9% missing values.
df = df.drop(columns=['parental_education_level'], errors='ignore')

In [8]:
if 'parental_education_level' in df.columns:
    print("Column still exists.")
else:
    print("Column successfully deleted.")

Column successfully deleted.


In [9]:
#Trimming all the spaces that i can not see with my naked eye.
df.columns = df.columns.str.strip()

In [10]:
#Checking for any N/A values in the dataset.
if df.isnull().values.any():
    print("The are still N/A values in the dataset.")

In [11]:
#Where in the dataset are the missing values?
df.isnull().sum()

student_id                       0
age                              0
gender                           0
study_hours_per_day              0
social_media_hours               0
netflix_hours                    0
part_time_job                    0
attendance_percentage            0
sleep_hours                      0
diet_quality                     0
exercise_frequency               0
internet_quality                 0
mental_health_rating             0
extracurricular_participation    0
exam_score                       0
dtype: int64

In [12]:
# Finding duplicates in the  the dataset.
duplicates = df.duplicated().sum()
print(f"Number of duplicated rows: {duplicates}")

Number of duplicated rows: 0


In [13]:
df.duplicated().any()

np.False_

In [14]:
df.describe()


Unnamed: 0,age,study_hours_per_day,social_media_hours,netflix_hours,attendance_percentage,sleep_hours,exercise_frequency,mental_health_rating,exam_score
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,20.498,3.5501,2.5055,1.8197,84.1317,6.4701,3.042,5.438,69.6015
std,2.3081,1.46889,1.172422,1.075118,9.399246,1.226377,2.025423,2.847501,16.888564
min,17.0,0.0,0.0,0.0,56.0,3.2,0.0,1.0,18.4
25%,18.75,2.6,1.7,1.0,78.0,5.6,1.0,3.0,58.475
50%,20.0,3.5,2.5,1.8,84.4,6.5,3.0,5.0,70.5
75%,23.0,4.5,3.3,2.525,91.025,7.3,5.0,8.0,81.325
max,24.0,8.3,7.2,5.4,100.0,10.0,6.0,10.0,100.0


In [15]:
# Validate data ranges.
def validate_range(df, column, min_val, max_val):
    return df[(df[column] < min_val) | (df[column] > max_val)]

print(validate_range(df, 'age', 17, 24))
print(validate_range(df, 'study_hours_per_day', 0, 9))
print(validate_range(df, 'social_media_hours', 0, 8))
print(validate_range(df, 'netflix_hours', 0, 6))
print(validate_range(df, 'attendance_percentage', 8, 100))
print(validate_range(df, 'sleep_hours', 0, 10))
print(validate_range(df, 'exercise_frequency', 2, 8))

Empty DataFrame
Columns: [student_id, age, gender, study_hours_per_day, social_media_hours, netflix_hours, part_time_job, attendance_percentage, sleep_hours, diet_quality, exercise_frequency, internet_quality, mental_health_rating, extracurricular_participation, exam_score]
Index: []
Empty DataFrame
Columns: [student_id, age, gender, study_hours_per_day, social_media_hours, netflix_hours, part_time_job, attendance_percentage, sleep_hours, diet_quality, exercise_frequency, internet_quality, mental_health_rating, extracurricular_participation, exam_score]
Index: []
Empty DataFrame
Columns: [student_id, age, gender, study_hours_per_day, social_media_hours, netflix_hours, part_time_job, attendance_percentage, sleep_hours, diet_quality, exercise_frequency, internet_quality, mental_health_rating, extracurricular_participation, exam_score]
Index: []
Empty DataFrame
Columns: [student_id, age, gender, study_hours_per_day, social_media_hours, netflix_hours, part_time_job, attendance_percentage, 

In [16]:
# mean/ median per mental health grouping-

class StudentAnalyzer:

    def __init__(self, data: pd.DataFrame):
        expected = {'mental_health_ratings', 'study_hours_per_day'}
        if not expected.issubset(data.columns):
            missing = expected - set(data.columns)
            raise ValueError(f"Missing required columns: {missing}")
        self.data = data.copy()


    def mean_study_by_mental_health(self) -> pd.Series:
        return (self.data
                  .groupby('mental_health_ratings')['study_hours_per_day']
                  .mean()
                  .sort_index())

    def median_study_by_mental_health(self) -> pd.Series:
        return (self.data
                  .groupby('mental_health_ratings')['study_hours_per_day']
                  .median()
                  .sort_index())
if __name__ == "__main__":
    np.random.seed(42)                          # makes demo reproducible
    df = pd.DataFrame({
        "mental_health_ratings": np.random.randint(1, 11, size=1000),
        "study_hours_per_day":  np.random.uniform(0.0, 7.4, size=1000).round(1)
    })

    analyzer = StudentAnalyzer(df)
 
    print("MEAN study hours by mental-health rating:")
    print(analyzer.mean_study_by_mental_health(), end="\n\n")

    print("MEDIAN study hours by mental-health rating:")
    print(analyzer.median_study_by_mental_health())

MEAN study hours by mental-health rating:
mental_health_ratings
1     3.816102
2     4.044578
3     3.946364
4     3.445745
5     3.692523
6     3.406250
7     3.672340
8     3.458000
9     4.025275
10    3.707477
Name: study_hours_per_day, dtype: float64

MEDIAN study hours by mental-health rating:
mental_health_ratings
1     3.90
2     4.30
3     4.40
4     3.40
5     3.70
6     3.30
7     3.75
8     3.50
9     4.30
10    3.50
Name: study_hours_per_day, dtype: float64


In [17]:
# Averages
average_sleep = df['sleep_hours'].mean()
average_score = df['exam_score'].mean()

print("Average sleep hours:", average_sleep)
print("Average exam score:", average_score)

KeyError: 'sleep_hours'

In [None]:
#  --- Correlation using pandas (easiest) ---
r = df['sleep_hours'].corr(df['exam_score'])
print("Correlation r:", r)

In [None]:
#We use the IQR to find outliers.
#IQR = Q3 − Q1
# Quartiles 
Q1 = df["social_media_hours"].quantile(0.25)
Q3 = df["social_media_hours"].quantile(0.75)

# IQR
IQR = Q3 - Q1

# Limits
lower_limit = Q1 - 1.5 * IQR
upper_limit = Q3 + 1.5 * IQR

In [None]:
# Outliers
outliers = df[
    (df["social_media_hours"] < lower_limit) |
    (df["social_media_hours"] > upper_limit)
]

print("Outlier limits:", lower_limit, "to", upper_limit)
print("Number of outliers:", len(outliers))

In [None]:
import os, glob
print("Notebook folder →", os.getcwd())

In [None]:
import pandas as pd

file_path = "student analysis/student_habits_performance.xlsx"
try:
    df = pd.read_excel(file_path)
    print("File loaded successfully.")
    print(df.head())  # see if data looks good
except FileNotFoundError:
    print("File not found! Check the path again.")

In [None]:
file_path = "student_habits_performance.xlsx"
df = pd.read_excel(file_path)
print(df.head())

In [None]:


plt.figure(figsize=(8, 5))
sns.histplot(df['study_hours_per_day'], bins=10, kde=True, color='skyblue')

plt.title('Distribution of Study Hours Per Day')
plt.xlabel('Study Hours Per Day')
plt.ylabel('Number of Students')

plt.grid(True)
plt.show()

In [None]:
plt.figure(figsize=(8, 5))
sns.scatterplot(x=df['sleep_hours'], y=df['exam_score'])

plt.title('Sleep Hours vs. Exam Score')
plt.xlabel('Sleep Hours')
plt.ylabel('Exam Score')

plt.grid(True)
plt.show()

In [None]:
plt.figure(figsize=(8, 5))
sns.boxplot(x=df['diet_quality'], y=df['exam_score'])

plt.title('Exam Scores by Diet Quality')
plt.xlabel('Diet Quality')
plt.ylabel('Exam Score')
plt.grid(True, axis='y')
plt.show()