In [1]:
import pandas as pd
import numpy as np

# Step 0: Build the sample dataset of student scores.
# Math_Score contains one missing entry and one extreme value (1000);
# Writing_Score contains one missing entry.
data = dict(
    Student_ID=list(range(1, 8)),
    Math_Score=[78, 85, np.nan, 95, 105, 70, 1000],
    Reading_Score=[80, 82, 85, 78, 90, 86, 88],
    Writing_Score=[75, 80, 85, np.nan, 88, 90, 95],
)

df = pd.DataFrame(data)
print("Original Dataset:", df, sep="\n")

# Step 1: Handling Missing Values
# Report how many entries are missing in each column.
missing_counts = df.isnull().sum()
print("\nMissing Values:\n", missing_counts)

# Impute each incomplete score column with its own mean. The filled Series is
# assigned back to the column (updated syntax to avoid a FutureWarning).
# NOTE(review): the Math_Score mean is computed while the 1000 entry is still
# present (outliers are only capped in Step 2), so the imputed value is pulled
# upward by it -- consider capping first or imputing with the median.
for score_col in ('Math_Score', 'Writing_Score'):
    df[score_col] = df[score_col].fillna(df[score_col].mean())

print("\nDataset after handling missing values:", df, sep="\n")

# Step 2: Handling Outliers using IQR method
def cap_outliers(column: pd.Series, factor: float = 1.5) -> pd.Series:
    """Cap values that fall outside the IQR fences of ``column``.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where
    ``Q1`` and ``Q3`` are the 25th and 75th percentiles of ``column`` and
    ``IQR = Q3 - Q1``. Values beyond a fence are replaced by that fence;
    values inside the fences are returned unchanged.

    Args:
        column: Numeric Series to cap.
        factor: Multiplier applied to the IQR when placing the fences.
            Defaults to 1.5; larger values cap fewer points.

    Returns:
        A new Series with out-of-fence values clipped to the fences.

    Raises:
        ValueError: If ``factor`` is negative, which could place the lower
            fence above the upper one.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    # One quantile call yields both quartiles, in the order requested.
    first_quartile, third_quartile = column.quantile([0.25, 0.75])
    fence_margin = factor * (third_quartile - first_quartile)

    return column.clip(
        lower=first_quartile - fence_margin,
        upper=third_quartile + fence_margin,
    )

# Replace the math scores with their IQR-capped version; the other columns
# are left as they are.
capped_math = cap_outliers(df['Math_Score'])
df['Math_Score'] = capped_math

print("\nDataset after handling outliers:", df, sep="\n")

# Step 3: Data Transformation
# Store the natural log of the (already capped) math scores in a new column
# to reduce skewness, then show the original and transformed values together.
math_scores = df['Math_Score']
df['Log_Math_Score'] = np.log(math_scores)

print(
    "\nDataset after log transformation:",
    df[['Math_Score', 'Log_Math_Score']],
    sep="\n",
)


Original Dataset:
   Student_ID  Math_Score  Reading_Score  Writing_Score
0           1        78.0             80           75.0
1           2        85.0             82           80.0
2           3         NaN             85           85.0
3           4        95.0             78            NaN
4           5       105.0             90           88.0
5           6        70.0             86           90.0
6           7      1000.0             88           95.0

Missing Values:
 Student_ID       0
Math_Score       1
Reading_Score    0
Writing_Score    1
dtype: int64

Dataset after handling missing values:
   Student_ID   Math_Score  Reading_Score  Writing_Score
0           1    78.000000             80           75.0
1           2    85.000000             82           80.0
2           3   238.833333             85           85.0
3           4    95.000000             78           85.5
4           5   105.000000             90           88.0
5           6    70.000000             86    