In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Step 1: Load the data
# Read the student-performance CSV from the Kaggle input directory and preview the first rows.
df=pd.read_csv("/kaggle/input/students-performance-10000-clean-data-eda/Student_performance_10k.csv")
df.head()

In [None]:
# Step 2: Understand the data
# (rows, columns) dimensions of the loaded frame.
df.shape

In [None]:
# Column labels available in the frame.
df.columns

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Descriptive statistics for the frame's columns.
df.describe()

In [None]:
# Step 3: Identify missing values
# Count the missing entries in each column.
missing_values = df.isna().sum()
missing_values

In [None]:
# Visualise the null mask: True cells mark missing entries.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df.isnull(), cbar=False, cmap='viridis', ax=ax)
ax.set_title('Heatmap of Missing Values')
plt.show()

In [None]:
# Score columns may hold non-numeric text: coerce anything unparseable to NaN,
# then impute the resulting gaps (and any original ones) with the column median.
numeric_columns = ['math_score', 'reading_score', 'writing_score', 'science_score', 'total_score']
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')  # invalid entries become NaN
    # Assign the filled column back rather than calling
    # `df[col].fillna(..., inplace=True)`: that form is chained assignment on a
    # temporary Series, which raises a FutureWarning on pandas 2.x and leaves
    # `df` unchanged once Copy-on-Write is enabled.
    df[col] = df[col].fillna(df[col].median())


In [None]:
# Re-count missing values per column after the numeric imputation.
df.isnull().sum()

In [None]:
# Mode imputation for the categorical columns: each gap is filled with that
# column's most frequent value (the first one when several tie).
categorical_columns = ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course', 'grade']

mode_by_column = {name: df[name].mode()[0] for name in categorical_columns}
df = df.fillna(value=mode_by_column)  # only the columns named in the dict are filled


In [None]:
# Re-count missing values per column after the categorical imputation.
df.isnull().sum()

In [None]:
# Select every row whose roll number is missing and display it.
missing_rollno_rows = df.loc[df['roll_no'].isna()]
missing_rollno_rows

In [None]:
# Show the rows at positions 1320-1324 to inspect the neighbourhood of the gap.
df.iloc[1320:1325]

In [None]:
# Set the roll number of the row labelled 1323.
# NOTE(review): both the label and the value "std-1324" are hard-coded, presumably
# read off the rows displayed in the preceding cells — confirm against the data.
df.loc[1323,"roll_no"]="std-1324"

In [None]:
# Re-count missing values per column after setting the roll number.
df.isnull().sum()

In [None]:
# Show the rows at positions 1320-1324 again to verify the roll-number fix.
df.iloc[1320:1325]

In [None]:
# Step 4: Handle duplicates
# Flag rows that exactly repeat an earlier row, then count the flags.
duplicate_mask = df.duplicated()
duplicate_count = duplicate_mask.sum()
duplicate_count

In [None]:
# Keep only the first occurrence of each repeated row; leave df as-is when none were found.
df = df.drop_duplicates() if duplicate_count > 0 else df

In [None]:
# Step 5: Check for inconsistent or faulty data
# List the distinct gender labels to spot spelling or formatting variants.
df["gender"].unique()

In [None]:
def clean_gender(gender):
    """Normalize a raw gender label to 'male' or 'female'.

    Surrounding whitespace is trimmed, case is ignored, and tab/newline
    escape sequences that were stored as literal text (a backslash followed
    by 't' or 'n') are discarded before matching.

    Args:
        gender: Raw label. Non-string values (e.g. NaN or None) are returned
            unchanged instead of raising.

    Returns:
        'male' for 'male'/'boy', 'female' for 'female'/'girl'; otherwise the
        trimmed, lower-cased label.
    """
    if not isinstance(gender, str):
        # Missing values have no string methods; pass them through untouched.
        return gender

    # Turn literal "\t" / "\n" text into real whitespace so strip() removes it
    # together with genuine tabs, newlines and spaces.
    normalized = gender.replace('\\t', ' ').replace('\\n', ' ').strip().lower()

    aliases = {
        'male': 'male',
        'boy': 'male',
        'female': 'female',
        'girl': 'female',
    }
    return aliases.get(normalized, normalized)  # keep unknown labels as cleaned text


In [None]:
# Normalize every gender label with clean_gender.
df['gender'] = df['gender'].apply(clean_gender)

In [None]:
# Distinct gender labels after cleaning.
df["gender"].unique()

In [None]:
# List the distinct race_ethnicity labels to spot formatting variants.
df["race_ethnicity"].unique()

In [None]:
def clean_race_ethnicity(group):
    """Normalize a race/ethnicity label to the 'group X' form.

    Surrounding whitespace and newline escapes stored as literal text
    (a backslash followed by 'n') are removed; the 'group ' prefix is added
    when the label does not already start with 'group'.

    Args:
        group: Raw label. Non-string values (e.g. NaN or None) are returned
            unchanged instead of raising.

    Returns:
        The cleaned label, prefixed with 'group ' if it was missing.
    """
    if not isinstance(group, str):
        # Missing values have no string methods; pass them through untouched.
        return group

    # Strip again after dropping the literal "\n": removing it can expose
    # whitespace that was previously in the interior of the string.
    cleaned = group.strip().replace('\\n', '').strip()
    if not cleaned.startswith('group'):
        cleaned = f'group {cleaned}'
    return cleaned

In [None]:
# Normalize every race_ethnicity label with clean_race_ethnicity.
df["race_ethnicity"]=df["race_ethnicity"].apply(clean_race_ethnicity)

In [None]:
# Distinct race_ethnicity labels after cleaning.
df["race_ethnicity"].unique()

In [None]:
# Coerce math_score to numeric, turning unparseable entries into NaN.
# NOTE(review): math_score is already coerced with the other score columns in the
# missing-value step, so this looks redundant — confirm before removing.
df['math_score'] = pd.to_numeric(df['math_score'], errors='coerce')

In [None]:
# Preview the first rows after the cleaning steps.
df.head()

In [None]:
# Step 6: Drop irrelevant columns
# Remove the roll_no column from the frame.
df = df.drop(columns=['roll_no'])

In [None]:
# Step 7: Convert data types
# gender/grade become pandas categoricals; every score column becomes float.
df = df.astype({
    'gender': 'category',
    'grade': 'category',
    'math_score': float,
    'reading_score': float,
    'writing_score': float,
    'science_score': float,
    'total_score': float,
})

print("\nAfter conversion:")
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
df.info()


In [None]:
# Step 8: Explore distributions
# Print how often each category occurs in the two categorical columns.
for categorical_col in ('gender', 'grade'):
    print(df[categorical_col].value_counts())


In [None]:
# Histogram with a KDE overlay for each score, one figure per subject.
for score_col, colour, subject in (
    ('math_score', 'blue', 'Math'),
    ('reading_score', 'green', 'Reading'),
):
    sns.histplot(df[score_col], kde=True, bins=10, color=colour, label=f'{subject} Score')
    plt.title(f'Distribution of {subject} Scores')
    plt.legend()
    plt.show()


In [None]:
# Box plot for each score, one figure per subject.
for score_col, colour, subject in (
    ('math_score', 'orange', 'Math'),
    ('reading_score', 'purple', 'Reading'),
):
    sns.boxplot(x=df[score_col], color=colour)
    plt.title(f'Box Plot of {subject} Scores')
    plt.show()


In [None]:
# Step 9: Handling outliers
# 1.5 x IQR rule on math_score: anything outside [lower_bound, upper_bound] is an outlier.
math_scores = df['math_score']
Q1, Q3 = math_scores.quantile(0.25), math_scores.quantile(0.75)
IQR = Q3 - Q1

lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Rows strictly below the lower fence or strictly above the upper fence.
is_outlier = math_scores.lt(lower_bound) | math_scores.gt(upper_bound)
outliers = df[is_outlier]
print("Outliers in math_score:")
print(outliers)

In [None]:
# Remove outliers: keep rows whose math_score lies within [lower_bound, upper_bound] (inclusive).
# NOTE(review): the cells that follow keep working on `df`, not `df_cleaned` — confirm that is intended.
df_cleaned = df[df['math_score'].between(lower_bound, upper_bound)]

In [None]:
# Step 10: Encode categorical variables
# One-hot encode gender and grade; drop_first omits the first level of each column.
dummy_columns = ['gender', 'grade']
df_one_hot = pd.get_dummies(df, columns=dummy_columns, drop_first=True)
print("One-hot encoded data:")
print(df_one_hot)

In [None]:
# Label-encode gender and grade into integer codes, one encoder per column,
# storing the codes in new *_encoded columns on df.
le_gender, le_grade = LabelEncoder(), LabelEncoder()

for encoder, source_col in ((le_gender, 'gender'), (le_grade, 'grade')):
    df[f'{source_col}_encoded'] = encoder.fit_transform(df[source_col])

print("Label encoded data:")
print(df)

In [None]:
# Step 11: Correlation analysis
# Restrict to numeric columns, then compute their pairwise correlations.
numeric_df = df.select_dtypes(include='number')
correlation_matrix = numeric_df.corr()

print("Correlation matrix:", correlation_matrix, sep="\n")


In [None]:
# Annotated heatmap of the correlation matrix (two-decimal labels).
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix Heatmap')
plt.show()

In [None]:
# Step 12: Investigate relationships
# Math score against total score, with colour and marker style both keyed on gender.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df,
    x='math_score',
    y='total_score',
    hue='gender',
    style='gender',
    s=100,
    ax=ax,
)
ax.set(title='Math Score vs. Total Score', xlabel='Math Score', ylabel='Total Score')
ax.legend(title='Gender')
plt.show()

In [None]:
# Mean math score per gender, with error bars showing one standard deviation.
plt.figure(figsize=(8, 6))
# `errorbar='sd'` replaces the `ci='sd'` argument, which seaborn deprecated in v0.12.
sns.barplot(data=df, x='gender', y='math_score', errorbar='sd', palette='coolwarm')
plt.title('Gender vs. Math Score')
plt.xlabel('Gender')
plt.ylabel('Average Math Score')
plt.show()

In [None]:
# Step 13: Feature engineering
# Express total_score as a fraction of the maximum attainable score.
# NOTE(review): max_score = 300 is hard-coded — confirm it matches how total_score is built.
max_score = 300

df['performance_ratio'] = df['total_score'].div(max_score)

print("Data with performance_ratio:")
print(df)

In [None]:
# Bucket performance_ratio into three bands: [0, 0.7] low, (0.7, 0.85] medium, (0.85, 1.0] high.
bins = [0, 0.7, 0.85, 1.0]
labels = ['low', 'medium', 'high']

# pd.cut intervals are right-closed and exclude the lowest edge by default, so a
# ratio of exactly 0 would become NaN; include_lowest=True puts it in 'low'.
# NOTE(review): ratios above 1.0 still fall outside the bins and become NaN —
# confirm that total_score can never exceed max_score.
df['performance_category'] = pd.cut(
    df['performance_ratio'], bins=bins, labels=labels, include_lowest=True
)

print("Data with performance_category:")
print(df)