# Student Habits vs Academic Performance

![Research Framework](../assets/student.jpeg)

### Import Necessary Libraries

In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

%matplotlib inline

### Load Data

In [50]:
# Load the raw dataset (path is relative to the notebook's working directory)
# and preview the first rows.
df = pd.read_csv('../data/student_habits_performance.csv')
df.head()

Unnamed: 0,student_id,age,gender,study_hours_per_day,social_media_hours,netflix_hours,part_time_job,attendance_percentage,sleep_hours,diet_quality,exercise_frequency,parental_education_level,internet_quality,mental_health_rating,extracurricular_participation,exam_score
0,S1000,23,Female,0.0,1.2,1.1,No,85.0,8.0,Fair,6,Master,Average,8,Yes,56.2
1,S1001,20,Female,6.9,2.8,2.3,No,97.3,4.6,Good,6,High School,Average,8,No,100.0
2,S1002,21,Male,1.4,3.1,1.3,No,94.8,8.0,Poor,1,High School,Poor,1,No,34.3
3,S1003,23,Female,1.0,3.9,1.0,No,71.0,9.2,Poor,4,Master,Good,1,Yes,26.8
4,S1004,19,Female,5.0,4.4,0.5,No,90.9,4.9,Fair,3,Master,Good,1,No,66.4


In [51]:
# Preview the last rows as a sanity check on the end of the file.
df.tail()

Unnamed: 0,student_id,age,gender,study_hours_per_day,social_media_hours,netflix_hours,part_time_job,attendance_percentage,sleep_hours,diet_quality,exercise_frequency,parental_education_level,internet_quality,mental_health_rating,extracurricular_participation,exam_score
995,S1995,21,Female,2.6,0.5,1.6,No,77.0,7.5,Fair,2,High School,Good,6,Yes,76.1
996,S1996,17,Female,2.9,1.0,2.4,Yes,86.0,6.8,Poor,1,High School,Average,6,Yes,65.9
997,S1997,20,Male,3.0,2.6,1.3,No,61.9,6.5,Good,5,Bachelor,Good,9,Yes,64.4
998,S1998,24,Male,5.4,4.1,1.1,Yes,100.0,7.6,Fair,0,Bachelor,Average,1,No,69.7
999,S1999,19,Female,4.3,2.9,1.9,No,89.4,7.1,Good,2,Bachelor,Average,8,No,74.9


### Check the shape of data (Rows, Columns)

### Analyze the Data

In [52]:
# (number of rows, number of columns)
df.shape

(1000, 16)

### Check the column names to verify the data format

In [53]:
# List the column labels (this shows names only, not per-column dtypes).
df.columns

Index(['student_id', 'age', 'gender', 'study_hours_per_day',
       'social_media_hours', 'netflix_hours', 'part_time_job',
       'attendance_percentage', 'sleep_hours', 'diet_quality',
       'exercise_frequency', 'parental_education_level', 'internet_quality',
       'mental_health_rating', 'extracurricular_participation', 'exam_score'],
      dtype='object')

### Check Structure of Data
- verify columns format
- check for non-null values
- memory usage

In [54]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   student_id                     1000 non-null   object 
 1   age                            1000 non-null   int64  
 2   gender                         1000 non-null   object 
 3   study_hours_per_day            1000 non-null   float64
 4   social_media_hours             1000 non-null   float64
 5   netflix_hours                  1000 non-null   float64
 6   part_time_job                  1000 non-null   object 
 7   attendance_percentage          1000 non-null   float64
 8   sleep_hours                    1000 non-null   float64
 9   diet_quality                   1000 non-null   object 
 10  exercise_frequency             1000 non-null   int64  
 11  parental_education_level       909 non-null    object 
 12  internet_quality               1000 non-null   ob

Here we can see the dtype breakdown across the 16 columns:

dtypes: float64 = 6, int64 = 3, object = 7

In [55]:
# Inspect the row index of the loaded frame.
df.index

RangeIndex(start=0, stop=1000, step=1)

In [56]:
# Summary statistics; with default arguments describe() reports on the numeric columns only.
df.describe()

Unnamed: 0,age,study_hours_per_day,social_media_hours,netflix_hours,attendance_percentage,sleep_hours,exercise_frequency,mental_health_rating,exam_score
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,20.498,3.5501,2.5055,1.8197,84.1317,6.4701,3.042,5.438,69.6015
std,2.3081,1.46889,1.172422,1.075118,9.399246,1.226377,2.025423,2.847501,16.888564
min,17.0,0.0,0.0,0.0,56.0,3.2,0.0,1.0,18.4
25%,18.75,2.6,1.7,1.0,78.0,5.6,1.0,3.0,58.475
50%,20.0,3.5,2.5,1.8,84.4,6.5,3.0,5.0,70.5
75%,23.0,4.5,3.3,2.525,91.025,7.3,5.0,8.0,81.325
max,24.0,8.3,7.2,5.4,100.0,10.0,6.0,10.0,100.0


### Check The Missing values

In [57]:
# Number of missing values in each column.
df.isnull().sum()

student_id                        0
age                               0
gender                            0
study_hours_per_day               0
social_media_hours                0
netflix_hours                     0
part_time_job                     0
attendance_percentage             0
sleep_hours                       0
diet_quality                      0
exercise_frequency                0
parental_education_level         91
internet_quality                  0
mental_health_rating              0
extracurricular_participation     0
exam_score                        0
dtype: int64

In [58]:
# Tally how many columns share each missing-value count
# (index = number of nulls in a column, value = number of columns with that count).
df.isnull().sum().value_counts()

0     15
91     1
Name: count, dtype: int64

### Let's check duplicate values

In [59]:
# Count rows that exactly repeat an earlier row across all columns.
df.duplicated().sum()

np.int64(0)

There are no duplicate rows.

In [60]:
# 1. Detection: count fully duplicated rows and per-column missing values
duplicate_count = df.duplicated().sum()
null_counts = df.isna().sum()

# Report only the columns that actually contain missing values.
has_nulls = null_counts.gt(0)
separator = "-" * 30

print(f"Duplicates found: {duplicate_count}")
print(separator)
print(f"Null values per column:\n{null_counts[has_nulls]}")

Duplicates found: 0
------------------------------
Null values per column:
parental_education_level    91
dtype: int64


We found 91 missing values in the `parental_education_level` column.

1. **Should you Drop or Fill?**

**Drop:** Generally, we only drop rows if the missing data is a tiny percentage (< 2%) or if the entire row is empty. Here 91 of 1,000 rows (9.1%) are affected, so dropping them would lose valuable information from the other columns (like `study_hours_per_day` or `exam_score`).

**Fill (Imputation)**: This is usually the better choice. It keeps our dataset size intact while making an "educated guess" about what the missing value should be.

### Ingestion step (intended for `src/step/ingest.py`)

In [71]:
import pandas as pd
import logging
from pathlib import Path

logger = logging.getLogger(__name__)

DATA_SOURCE = "../data/student_habits_performance.csv"

def ingest_data(DATA_SOURCE: str) -> pd.DataFrame:
    """
    Load the raw CSV located at ``DATA_SOURCE`` into a DataFrame.

    Args:
        DATA_SOURCE: Path to the CSV file. Inside the function this name
            shadows the module-level constant; it is kept unchanged because
            callers pass it by keyword.

    Returns:
        The parsed contents of the CSV file.

    Raises:
        FileNotFoundError: Nothing exists at the given path.
        IsADirectoryError: The path points to a directory, not a file.
        pd.errors.EmptyDataError: The file exists but contains no data.
        Exception: Any other error raised while reading is logged with its
            traceback and re-raised unchanged.
    """
    # Convert to a Path object so the filesystem checks below are uniform.
    data_path = Path(DATA_SOURCE)

    if not data_path.exists():
        logger.error("Data file not found at: %s", data_path)
        raise FileNotFoundError(f"Missing input data: {data_path}")

    # A directory passes exists(); reject it here with a clear error instead
    # of letting read_csv fail with a platform-dependent exception.
    if data_path.is_dir():
        logger.error("Expected a CSV file but got a directory: %s", data_path)
        raise IsADirectoryError(f"Input path is a directory, not a file: {data_path}")

    logger.info("==> Ingesting data from: %s", data_path)

    try:
        df = pd.read_csv(data_path)
    except pd.errors.EmptyDataError:
        logger.error("The target CSV file is empty.")
        raise
    except Exception:
        # logger.exception records the traceback, not just the message.
        logger.exception("Unexpected error during ingestion")
        raise

    logger.info(
        "==> Successfully loaded %d rows and %d columns.", len(df), len(df.columns)
    )
    return df
    

if __name__ == "__main__":
    # ingest_data either returns a DataFrame or raises, so no None check
    # is needed before using the result.
    df = ingest_data(DATA_SOURCE=DATA_SOURCE)
    print("Ingestion completed.")
    print(df.head())
    print(f"Data shape: {df.shape}")


Ingestion completed.
  student_id  age  gender  study_hours_per_day  social_media_hours  \
0      S1000   23  Female                  0.0                 1.2   
1      S1001   20  Female                  6.9                 2.8   
2      S1002   21    Male                  1.4                 3.1   
3      S1003   23  Female                  1.0                 3.9   
4      S1004   19  Female                  5.0                 4.4   

   netflix_hours part_time_job  attendance_percentage  sleep_hours  \
0            1.1            No                   85.0          8.0   
1            2.3            No                   97.3          4.6   
2            1.3            No                   94.8          8.0   
3            1.0            No                   71.0          9.2   
4            0.5            No                   90.9          4.9   

  diet_quality  exercise_frequency parental_education_level internet_quality  \
0         Fair                   6                   Mast