In [37]:
import pandas as pd
import numpy as np

In [38]:
# Build a synthetic student dataset. The global NumPy RNG is seeded so that
# every run produces the same frame.
np.random.seed(0)
n = 100  # number of student rows to generate

# Keyword arguments are evaluated left to right, so the columns consume the
# seeded random stream in the order Age, Gender, Grade, Absenteeism,
# StudyHours. randint's `high` bound is exclusive.
df = pd.DataFrame(dict(
    Age=np.random.randint(low=18, high=25, size=n),
    Gender=np.random.choice(['Male', 'Female'], size=n),
    Grade=np.random.randint(low=50, high=100, size=n),
    Absenteeism=np.random.randint(low=0, high=10, size=n),
    StudyHours=np.random.randint(low=1, high=10, size=n),
))


In [39]:
# Count of missing entries in each column of df.
missing_values = df.isna().sum()

# Boolean mask over the rows of df: True where Grade is below 0 or above 100.
inconsistencies = df['Grade'].lt(0) | df['Grade'].gt(100)


In [40]:
# Replace missing ages with the mean age.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['Age']: that in-place call acts on an
# intermediate Series, which pandas 2.x warns about and which no longer
# updates df once Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].mean())

# Replace out-of-range grades (rows flagged by `inconsistencies`) with the
# median of the in-range grades only, so the invalid values being replaced
# do not influence their own replacement.
df.loc[inconsistencies, 'Grade'] = df.loc[~inconsistencies, 'Grade'].median()

In [41]:
# Columns screened for outliers.
numeric_variables = ['Grade', 'Absenteeism', 'StudyHours']


def _iqr_outlier_mask(values):
    """Return a boolean Series, True where a value lies outside the IQR fences.

    The fences sit 1.5 * IQR below the first quartile and 1.5 * IQR above
    the third quartile of ``values``.
    """
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    too_low = values < first_quartile - 1.5 * spread
    too_high = values > third_quartile + 1.5 * spread
    return too_low | too_high


# One boolean column per screened variable, row-aligned with df.
outliers = pd.DataFrame(
    {name: _iqr_outlier_mask(df[name]) for name in numeric_variables}
)
outliers

Unnamed: 0,Grade,Absenteeism,StudyHours
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
95,False,False,False
96,False,False,False
97,False,False,False
98,False,False,False


In [42]:
import matplotlib.pyplot as plt

# Skewness of Grade before capping and transformation.
print(df['Grade'].skew())

# Winsorize every screened column: values below the 5th percentile are raised
# to it, values above the 95th percentile are lowered to it. Both percentiles
# are taken from the column before it is modified, and the capped column is
# stored as floats.
for column in numeric_variables:
    floor = df[column].quantile(0.05)
    ceiling = df[column].quantile(0.95)
    df[column] = np.clip(df[column].to_numpy(dtype=float), floor, ceiling)

# Transform the capped Grade column: base-10 logarithm, then square root.
# NOTE: this overwrites df['Grade'] in place, so running the cell a second
# time applies the transformation again to already-transformed values.
if 'Grade' in numeric_variables:
    df['Grade'] = np.sqrt(np.log10(df['Grade']))

# Skewness of Grade after capping and transformation.
print(df['Grade'].skew())


-0.09167251356086972
-0.3424612305054338


In [35]:
import matplotlib.pyplot as plt

# Box plot of the Grade column as it currently stands in df.
# NOTE(review): if the winsorization/transformation cell has already run,
# these are the capped, sqrt(log10)-transformed grades rather than raw
# scores; confirm which state is intended and adjust the labels to match.
fig, ax = plt.subplots()
df.boxplot(column=['Grade'], ax=ax)
ax.set_title('Distribution of Grade')
ax.set_ylabel('Grade')
plt.show()  # render the figure without echoing the Axes repr

<Axes: >