In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Seed NumPy's legacy global generator so the synthetic data is reproducible.
np.random.seed(42)

# Synthetic records for 100 students. The columns are drawn in this exact
# order because every call consumes the same global random stream; reordering
# them would change the generated values.
# randint excludes its upper bound, e.g. Math_Score lies in 50..99.
data = dict(
    StudentID=range(1, 101),
    Math_Score=np.random.randint(50, 100, size=100),
    English_Score=np.random.randint(40, 95, size=100),
    Science_Score=np.random.randint(55, 98, size=100),
    Attendance=np.random.uniform(70, 100, size=100),
    Study_Hours=np.random.randint(1, 6, size=100),
)

df = pd.DataFrame(data)
df

In [None]:
# Add missing values to the dataset.
#
# The three score columns are generated with randint and therefore hold
# integers, which cannot store NaN. Cast them to float up front so the
# assignments below do not depend on pandas silently upcasting the column
# (deprecated since pandas 2.1). Attendance comes from np.random.uniform and
# is already floating point.
score_cols = ['Math_Score', 'English_Score', 'Science_Score']
df = df.astype({col: 'float64' for col in score_cols})

# .loc slices are label-based and include both endpoints (5:10 covers 6 rows).
df.loc[5:10, 'Math_Score'] = np.nan
df.loc[15:25, 'English_Score'] = np.nan
df.loc[25:30, 'Science_Score'] = np.nan
df.loc[35:45, 'Attendance'] = np.nan

df.head(10)

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:
# Mean imputation: every missing cell takes the mean of its own column.
column_means = df.mean()
df = df.fillna(column_means)

# Re-count the gaps to confirm none are left.
df.isna().sum()

In [None]:
# Adding outliers: overwrite a handful of cells with extreme low values.
# Each entry is (row label, column, injected value), applied in order.
injected_outliers = [
    (0, 'Math_Score', 2),
    (94, 'Math_Score', 20),
    (6, 'English_Score', 5),
    (20, 'English_Score', 10),
    (26, 'Science_Score', 10),
    (3, 'Attendance', 30),
    (20, 'Attendance', 5),
]
for row_label, column_name, extreme_value in injected_outliers:
    df.loc[row_label, column_name] = extreme_value

In [None]:
# Columns at positions 1-4 (the three scores and Attendance), kept only if
# their dtype is int64 or float64. StudentID (position 0) and anything from
# position 5 onward are excluded from the outlier analysis.
candidate_columns = df.iloc[:, 1:5]
numeric_candidates = candidate_columns.select_dtypes(include=['int64', 'float64'])
num_cols = numeric_candidates.columns

In [None]:
# Box plots of the selected numeric columns before outlier treatment.
fig, ax = plt.subplots()
sns.boxplot(data=df[num_cols], ax=ax)
ax.set_title("Before Handling Outliers")
plt.show()

In [None]:
# Standardise each selected column: (value - column mean) / column std.
numeric_values = df[num_cols]
z_scores = numeric_values.sub(numeric_values.mean()).div(numeric_values.std())

# Flag cells lying more than three standard deviations from their column mean.
outliers = z_scores.abs().gt(3)

# Blank the flagged cells, then impute them with the mean of what remains.
df[num_cols] = numeric_values.mask(outliers)
df = df.fillna(df.mean())

In [None]:
# Box plots of the same numeric columns after outlier treatment.
fig, ax = plt.subplots()
sns.boxplot(data=df[num_cols], ax=ax)
ax.set_title("After Handling Outliers")
plt.show()

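Data Transformation:
The variable Study_Hours is assumed to be right-skewed. To address this, a logarithmic transformation (Log_Study_Hours = log(Study_Hours)) is applied. A log transformation compresses large values, which reduces right skew and brings the distribution closer to normal; this is beneficial for statistical analyses and modeling tasks that assume normality. The natural logarithm is defined only for positive values, and Study_Hours ranges from 1 to 5 here, so no adjustment for zeros is needed. Because Study_Hours in this synthetic dataset is drawn uniformly from the integers 1 to 5, compare the two histograms below to check whether the transformation actually improves the shape of the distribution.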

In [None]:
# Natural log of Study_Hours, stored as a new column beside the original.
log_study_hours = np.log(df['Study_Hours'])
df['Log_Study_Hours'] = log_study_hours
df

In [None]:
# Histogram of the log-transformed study hours.
fig, ax = plt.subplots(figsize=(10, 6))

ax.hist(df['Log_Study_Hours'], bins=10, edgecolor='black')
ax.set_title('Distribution of Log_Study_Hours')
ax.set_xlabel('Log_Study_Hours')
ax.set_ylabel('Frequency')

# Render explicitly, as the box-plot cells do; without this the cell echoes
# the return value of the last call (a Text object) above the figure.
plt.show()

In [None]:
# Histogram of the untransformed study hours, for comparison with the
# log-transformed version.
fig, ax = plt.subplots(figsize=(10, 6))

ax.hist(df['Study_Hours'], bins=10, edgecolor='black')
ax.set_title('Distribution of Study_Hours')
ax.set_xlabel('Study_Hours')
ax.set_ylabel('Frequency')

# Render explicitly, as the box-plot cells do; without this the cell echoes
# the return value of the last call (a Text object) above the figure.
plt.show()