In [None]:
"""
Student Performance Dataset Analysis
Dataset Information
Student Performance Dataset (StudentsPerformance.csv)

1000 students with academic performance data
8 columns: gender, race/ethnicity, parental level of education, lunch, test preparation course, math score, reading score, writing score
Score Range: 0-100 for each subject
Demographics: Male/Female, 5 ethnic groups, 6 education levels, standard/free lunch, test prep completed/none
"""

In [2]:
#Task 1.1: Data Loading and Exploration
import numpy as np
import pandas as pd

# Load the dataset
df = pd.read_csv('StudentsPerformance.csv')

# Your tasks:
# 1. Display first 10 rows
print(df.head(10))

# 2. Show dataset shape and info()
print(df.shape, df.info())

# 3. Display unique values in each categorical column
print(df.nunique())

# 4. Show basic statistics for all score columns
print(df.describe())




   gender race/ethnicity parental level of education         lunch  \
0  female        group B           bachelor's degree      standard   
1  female        group C                some college      standard   
2  female        group B             master's degree      standard   
3    male        group A          associate's degree  free/reduced   
4    male        group C                some college      standard   
5  female        group B          associate's degree      standard   
6  female        group B                some college      standard   
7    male        group B                some college  free/reduced   
8    male        group D                 high school  free/reduced   
9  female        group B                 high school  free/reduced   

  test preparation course  math score  reading score  writing score  
0                    none          72             72             74  
1               completed          69             90             88  
2                  

In [None]:
# 1.2 Display first 10 rows#Task 1.2: NumPy Array Operations
# Convert score columns to numpy arrays
math_scores = df['math score'].values
reading_scores = df['reading score'].values
writing_scores = df['writing score'].values

# Your tasks:
# 1. Calculate mean, median, std, min, max for each subject using NumPy
print("Math Score\n")
print("Mean: ",np.mean(math_scores),
      "\nMedian: ", np.median(math_scores),
      "\nSTD: ", np.std(math_scores),
      "\nMin: ", np.min(math_scores),
      "\nMax: ", np.max(math_scores))
print("\nReading Score\n")
print("Mean: ",np.mean(reading_scores),
      "\nMedian: ", np.median(reading_scores),
      "\nTD: ", np.std(reading_scores),
      "\nMin: ", np.min(reading_scores),
      "\nMax: ", np.max(reading_scores))
print("\nWriting Score\n")
print("Mean: ",np.mean(writing_scores),
      "\nMedian: ", np.median(writing_scores),
      "\nTD: ", np.std(writing_scores),
      "\nMin: ", np.min(writing_scores),
      "\nMax: ", np.max(writing_scores))


# 2. Find total students who scored above 80 in math
math = math_scores > 80
print("\nFind total students scored above 80 in math =", np.sum(math))

# 3. Find total students who scored below 50 in any subject
write = writing_scores < 50
print("\nFind total students scored above 80 in math =", np.sum(write))


# 4. Calculate the overall average score across all three subjects
concatScore = np.concat([math_scores, writing_scores, reading_scores])
avg = np.mean(concatScore)
avgScore = np.round(avg, 2)
print("\nCalculate the overall average score across all three subjects =", avgScore)



# Expected output example:
# Math - Mean: 66.1, Median: 66.0, Std: 15.2
# Students with math > 80: 132
# Students with any score < 50: 178


Math Score

Mean:  66.089 
Median:  66.0 
STD:  15.155496659628149 
Min:  0 
Max:  100

Reading Score

Mean:  69.169 
Median:  70.0 
TD:  14.592890015346516 
Min:  17 
Max:  100

Writing Score

Mean:  68.054 
Median:  69.0 
TD:  15.188057281956768 
Min:  10 
Max:  100

Find total students scored above 80 in math = 176

Find total students scored above 80 in math = 114

Calculate the overall average score across all three subjects = 67.77


In [30]:
# Task 2.1: Identify and Handle Missing Data
# Check the dataset for any missing values
# Your tasks:
# 1. Check for null values in each column using .isnull().sum()
print(df.isnull().sum())

# 2. Check for any impossible values (scores > 100 or < 0)
columns_to_fix = ['math score', 'reading score', 'writing score']
concatScore = np.concat([math_scores, writing_scores, reading_scores])
impossibleValue = (concatScore > 100) & (concatScore < 0)
print("impossible values =", np.sum(impossibleValue))

# 3. Display summary of data quality
print(df.head())
print(df.tail())
print(df.info())
print(df.describe())


# Note: This dataset is clean, but let's verify!




gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64
impossible values = 0
   gender race/ethnicity parental level of education         lunch  \
0  female        group B           bachelor's degree      standard   
1  female        group C                some college      standard   
2  female        group B             master's degree      standard   
3    male        group A          associate's degree  free/reduced   
4    male        group C                some college      standard   

  test preparation course  math score  reading score  writing score  
0                    none          72             72             74  
1               completed          69             90             88  
2                    none          90             95             93  


In [25]:
#Task 2.2: Create and Fix Missing Value
# Artificially create missing data for practice
import numpy as np

# Your tasks:
# 1. Make a copy of the dataset: df_practice = df.copy()
df_practice = df.copy()

np.random.seed(42)  # For reproducible results
missing_math_idx = np.random.choice(df_practice.index, 50, replace=False)
missing_writing_idx = np.random.choice(df_practice.index, 30, replace=False)

# 2. Randomly set 50 math scores to np.nan
df_practice.loc[missing_math_idx, 'math score'] = np.nan
print("Created missing values:", df_practice.isnull().sum())


# 3. Randomly set 30 writing scores to np.nan
df_practice.loc[missing_writing_idx,'writing score']=np.nan
print("Created missing values:", df_practice.isnull().sum())

# 4. Fill missing math scores with the MEDIAN math score
df_practice['math score']=df['math score'].fillna(df['math score'].median())
print("Created missing values median:", df_practice.isnull().sum())

# 5. Fill missing writing scores with the MEAN writing score
df_practice['writing score']=df_practice['writing score'].fillna(df_practice['writing score'].mean())
print("Created missing values mean:", df_practice.isnull().sum())

# 6. Verify no missing values remain
print("Created missing values:", df_practice.isnull().sum())













Created missing values median: gender                          0
race/ethnicity                  0
parental level of education     0
lunch                           0
test preparation course         0
math score                      0
reading score                   0
writing score                  30
dtype: int64
Created missing values mean: gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64
Created missing values: gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64


In [23]:
#Task 3.1:Answer Simple Questions About Student Performance
# Your tasks:
# 1. Display first 5 rows using .head()
# 2. Display last 5 rows using .tail()
# 3. Show dataset shape using .shape
# 4. Show column names using .columns
# 5. Show data types using .dtypes
# 6. Show basic info using .info()

# Your code here:

print(df.head())
print(df.tail())
print(df.shape)
print(df.columns)
print(df.dtypes)
print(df.info())





   gender race/ethnicity parental level of education         lunch  \
0  female        group B           bachelor's degree      standard   
1  female        group C                some college      standard   
2  female        group B             master's degree      standard   
3    male        group A          associate's degree  free/reduced   
4    male        group C                some college      standard   

  test preparation course  math score  reading score  writing score  
0                    none          72             72             74  
1               completed          69             90             88  
2                    none          90             95             93  
3                    none          47             57             44  
4                    none          76             78             75  
     gender race/ethnicity parental level of education         lunch  \
995  female        group E             master's degree      standard   
996    male    

In [27]:
#Task 3.2: Basic Groupby Analysis
# Comprehensive groupby analysis
# Your tasks:

# 1. Group by 'gender' and calculate:
#    - Average scores in all subjects
#    - Count of students

genderGroupData = df.groupby('gender')[['math score', 'reading score', 'writing score']].agg(['mean', 'count'])
print(genderGroupData)


# 2. Group by 'test preparation course' and show:
#    - Mean scores for each subject
#    - Show which subject benefits most from test prep

testGroupData = df.groupby('test preparation course')[['math score', 'reading score', 'writing score']].agg(['mean'])
print(testGroupData)
best_math = df['math score'].max()
print(best_math)



# 3. Group by 'parental level of education' and calculate:
#    - Average math score for each education level
#    - Rank education levels by math performance



pl_education=df.groupby('parental level of education')['math score'].mean()
print(pl_education)
pdata=pd.DataFrame(pl_education)
print("Rank education levels by math performance")
pdata["Rank"] = pdata["math score"].rank(ascending=False).astype(int)
print(pdata.sort_values("Rank"))

# 4. Group by 'lunch' type and show:
#    - Average scores for each subject
#    - Count of students in each lunch category


lung_group=df.groupby('lunch')[['math score','reading score','writing score']].agg(['mean','count'])
print(lung_group)













       math score       reading score       writing score      
             mean count          mean count          mean count
gender                                                         
female  63.633205   518     72.608108   518     72.467181   518
male    68.728216   482     65.473029   482     63.311203   482
                        math score reading score writing score
                              mean          mean          mean
test preparation course                                       
completed                69.695531     73.893855     74.418994
none                     64.077882     66.534268     64.504673
100
parental level of education
associate's degree    67.882883
bachelor's degree     69.389831
high school           62.137755
master's degree       69.745763
some college          67.128319
some high school      63.497207
Name: math score, dtype: float64
Rank education levels by math performance
                             math score  Rank
parental level of ed

In [28]:
#Task 3.3: Use your data analysis skills to answer these easy questions:

# 1. Who performs better in math - males or females?
#    - Calculate average math score by gender
#    - Show the difference between male and female average


math=df.groupby('gender')['math score'].mean()
print("Average math score by gender: ",math)

print("The difference between male and female average",(math['male']-math['female']))
print("According the result male performs better than female :) ")

# 2. Which subject do students perform best in overall?
#    - Calculate the overall average for math, reading, and writing
#    - Rank the subjects from highest to lowest average score


m_average=df['math score'].mean()
print("Math Average: ", m_average)
r_average=df['reading score'].mean()
print("Reading Average: ", r_average)
w_average=df['writing score'].mean()
print("Writing Average: ", w_average)


# 3. What's the impact of parental education?
#    - Find the average total score for each parental education level
#    - Identify which education level leads to highest student performance

pl_education=df.groupby('parental level of education')['math score'].mean()
print(pl_education)

highest_performance = df['parental level of education'].max()
student_highest = df[df['parental level of education'] == highest_performance]
print("highest student performance:", highest_performance)

print(" education level leads to highest student performance:")
print(student_highest[['parental level of education']])

print()


Average math score by gender:  gender
female    63.633205
male      68.728216
Name: math score, dtype: float64
The difference between male and female average 5.095011134430216
According the result male performs better than female :) 
Math Average:  66.089
Reading Average:  69.169
Writing Average:  68.054
parental level of education
associate's degree    67.882883
bachelor's degree     69.389831
high school           62.137755
master's degree       69.745763
some college          67.128319
some high school      63.497207
Name: math score, dtype: float64
highest student performance: some high school
 education level leads to highest student performance:
    parental level of education
15             some high school
17             some high school
23             some high school
37             some high school
54             some high school
..                          ...
982            some high school
984            some high school
987            some high school
988            some 