In [1]:
import pandas as pd
import numpy as np

In [5]:
import os
from pathlib import Path

# Print the current working directory
print("Current Working Directory: ", os.getcwd())

# Change the current working directory to the project root.
# Rather than hard-coding an absolute, machine-specific path, walk upward from
# the current directory until we reach the folder that contains the raw
# dataset (data/raw/student.csv). This keeps the notebook runnable from the
# project root or any sub-folder of it (e.g. notebooks/eda) on any machine,
# and makes the cell safe to re-run after the directory has already changed.
start_dir = Path.cwd()
dataset_marker = Path('data') / 'raw' / 'student.csv'
project_root = next(
    (
        candidate
        for candidate in (start_dir, *start_dir.parents)
        if (candidate / dataset_marker).is_file()
    ),
    None,
)
if project_root is None:
    # Fail loudly here instead of letting the later read_csv fail obscurely.
    raise FileNotFoundError(
        f"Could not locate '{dataset_marker.as_posix()}' in {start_dir} or any parent directory"
    )

os.chdir(project_root)
print("New Working Directory: ", os.getcwd())

Current Working Directory:  c:\Users\naren\Desktop\DS\Projects\MLPROJECT\notebooks\eda
New Working Directory:  C:\Users\naren\Desktop\DS\Projects\MLPROJECT


In [7]:
# Location of the raw student dataset, relative to the working directory
data_path = 'data/raw/student.csv'

# Read the CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first five rows
data.head(5)


Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72.0,72.0,74.0
1,female,group C,some college,standard,completed,69.0,90.0,88.0
2,female,group B,master's degree,standard,none,90.0,95.0,93.0
3,male,group A,associate's degree,free/reduced,none,47.0,57.0,44.0
4,male,group C,some college,standard,none,76.0,78.0,75.0


In [15]:
# Total number of entries (rows) in the DataFrame
len(data)

1002

In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) for the score columns
numeric_summary = data.describe()
numeric_summary

Unnamed: 0,math_score,reading_score,writing_score
count,1001.0,1001.0,1001.0
mean,66.132867,69.186813,68.080919
std,15.151545,14.602879,15.198304
min,0.0,17.0,10.0
25%,57.0,59.0,58.0
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [10]:
# Summary for the categorical (object-typed) columns:
# count, number of unique values, most frequent value and its frequency
categorical_summary = data.describe(include=['object'])
categorical_summary

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course
count,1000,1000,1001,1001,1002
unique,2,5,6,2,2
top,female,group C,some college,standard,none
freq,518,318,227,644,643


In [11]:
# Flag every row that repeats an earlier row, then count the flags
duplicate_mask = data.duplicated()
duplicate_rows = duplicate_mask.sum()
print("Number of duplicate rows:", duplicate_rows)

Number of duplicate rows: 2


In [12]:
# Per-column tally of missing (NaN / None) entries
missing_values = data.isna().sum()
print("Number of missing or null values in each column:\n", missing_values)


Number of missing or null values in each column:
 gender                         2
race_ethnicity                 2
parental_level_of_education    1
lunch                          1
test_preparation_course        0
math_score                     1
reading_score                  1
writing_score                  1
dtype: int64


In [13]:
# Check data types and non-null counts for each column.
# info() writes its report directly to the cell output, so it is called bare
# (no print() wrapper and no value to display).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   gender                       1000 non-null   object 
 1   race_ethnicity               1000 non-null   object 
 2   parental_level_of_education  1001 non-null   object 
 3   lunch                        1001 non-null   object 
 4   test_preparation_course      1002 non-null   object 
 5   math_score                   1001 non-null   float64
 6   reading_score                1001 non-null   float64
 7   writing_score                1001 non-null   float64
dtypes: float64(3), object(5)
memory usage: 62.8+ KB


In [16]:
# Drop repeated rows, comparing across all columns and keeping the first
# occurrence of each (these are the defaults, spelled out for clarity)
data = data.drop_duplicates(subset=None, keep='first')

In [21]:
# Re-count duplicates to confirm the removal step worked
duplicate_rows = data.duplicated().sum()
print("Number of duplicate rows after removal:", duplicate_rows)

# A count of zero means every duplicate is gone; pick the message accordingly
removal_status = (
    "All duplicate rows have been successfully removed."
    if duplicate_rows == 0
    else "There are still some duplicate rows."
)
print(removal_status)

Number of duplicate rows after removal: 0
All duplicate rows have been successfully removed.


In [17]:
# Impute gaps in the numeric columns with each column's median
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
numeric_part = data[numeric_cols]
column_medians = numeric_part.median()
data[numeric_cols] = numeric_part.fillna(column_medians)

In [18]:
# Fill missing values for categorical columns with the mode
categorical_cols = data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    mode_value = data[col].mode()[0]  # Get the mode value for the column
    data[col] = data[col].fillna(mode_value)

In [20]:
# Verify the changes: the non-null counts should now equal the row count
# for every column.
# info() prints its report itself and returns None, so it must not be wrapped
# in print() -- doing so emits a stray "None" line after the report.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   gender                       1000 non-null   object 
 1   race_ethnicity               1000 non-null   object 
 2   parental_level_of_education  1000 non-null   object 
 3   lunch                        1000 non-null   object 
 4   test_preparation_course      1000 non-null   object 
 5   math_score                   1000 non-null   float64
 6   reading_score                1000 non-null   float64
 7   writing_score                1000 non-null   float64
dtypes: float64(3), object(5)
memory usage: 70.3+ KB
None


In [23]:
# Data type of every column in 'data'
column_types = data.dtypes
column_types

gender                          object
race_ethnicity                  object
parental_level_of_education     object
lunch                           object
test_preparation_course         object
math_score                     float64
reading_score                  float64
writing_score                  float64
dtype: object

In [24]:
# List the distinct values of each categorical column to get a feel for the data
for col in categorical_cols:
    distinct_values = data[col].unique()
    print(f"Categories in '{col}' variable:", distinct_values)

Categories in 'gender' variable: ['female' 'male']
Categories in 'race_ethnicity' variable: ['group B' 'group C' 'group A' 'group D' 'group E']
Categories in 'parental_level_of_education' variable: ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
Categories in 'lunch' variable: ['standard' 'free/reduced']
Categories in 'test_preparation_course' variable: ['none' 'completed']


In [25]:
# To use the "Total Score" as a target variable, it has to be calculated first.
# The three subject columns that make up the total:
score_cols = ['math_score', 'reading_score', 'writing_score']

# Total score: sum of the individual subject scores
data['total_score'] = data['math_score'] + data['reading_score'] + data['writing_score']

# Average score: total divided by the number of subjects (derived from the
# subject list rather than hard-coded)
data['average_score'] = data['total_score'] / len(score_cols)

# Count how many students scored full marks in each subject.
# The count is taken straight from the boolean mask, so it depends only on the
# subject's own score (not on 'average_score' being non-null) and follows the
# same pattern as the below-25 counts.
math_full = (data['math_score'] == 100).sum()
reading_full = (data['reading_score'] == 100).sum()
writing_full = (data['writing_score'] == 100).sum()

# Printing the number of students with full marks
print(f'Number of students with full marks in Maths: {math_full}')
print(f'Number of students with full marks in Reading: {reading_full}')
print(f'Number of students with full marks in Writing: {writing_full}')

# Display the first few rows of the dataframe to verify the new columns.
data.head()

Number of students with full marks in Maths: 7
Number of students with full marks in Reading: 17
Number of students with full marks in Writing: 14


Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,total_score,average_score
0,female,group B,bachelor's degree,standard,none,72.0,72.0,74.0,218.0,72.666667
1,female,group C,some college,standard,completed,69.0,90.0,88.0,247.0,82.333333
2,female,group B,master's degree,standard,none,90.0,95.0,93.0,278.0,92.666667
3,male,group A,associate's degree,free/reduced,none,47.0,57.0,44.0,148.0,49.333333
4,male,group C,some college,standard,none,76.0,78.0,75.0,229.0,76.333333


In [31]:
# Number of students scoring under 25 marks, per subject
# (summing a boolean Series counts its True entries)
math_less_25 = data['math_score'].lt(25).sum()
reading_less_25 = data['reading_score'].lt(25).sum()
writing_less_25 = data['writing_score'].lt(25).sum()

# Report the three counts
print(f"Number of students with less than 25 marks in Maths: {math_less_25}")
print(f"Number of students with less than 25 marks in Reading: {reading_less_25}")
print(f"Number of students with less than 25 marks in Writing: {writing_less_25}")

Number of students with less than 25 marks in Maths: 7
Number of students with less than 25 marks in Reading: 4
Number of students with less than 25 marks in Writing: 5


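Based on the analysis of students' scores across the three subjects, students appear to struggle the most in Mathematics: it has the highest number of students scoring below 25 marks (7, compared with 4 in Reading and 5 in Writing) and the fewest students with full marks (7, compared with 17 in Reading and 14 in Writing). Conversely, Reading shows the best performance, with the fewest low scores and the most full marks, which may indicate stronger proficiency or possibly more effective teaching methods in this area. Note that these counts are small relative to the roughly 1,000 students in the dataset, so the differences are suggestive rather than conclusive, and the scores alone cannot establish the cause.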