Data Generation

In [26]:
import pandas as pd
import numpy as np

# Define subjects
subjects = ['Math', 'Physics', 'English', 'History', 'Biology', 'Chemistry']

# Generate random data
num_students = 50
data = []
for i in range(num_students):
    student_data = {
        'Roll no.': i + 1,
        # 'Name': f'Student {i + 1}',
    }
    # Add random scores for each subject
    for subject in subjects:
        score = np.random.randint(50, 100)
        # Introduce 10% missing values randomly
        if np.random.rand() < 0.1:
            score = None
        # Introduce 5% outliers (very high or low scores)
        if np.random.rand() < 0.05:
            if np.random.rand() < 0.5:
                score = np.random.randint(-20, 0)
            else:
                score = np.random.randint(100, 120)
        student_data[subject] = score
    data.append(student_data)

# Create DataFrame
df = pd.DataFrame(data)

# Print the dataset
print(df)


    Roll no.   Math  Physics  English  History  Biology  Chemistry
0          1   60.0     64.0     95.0     87.0     78.0       88.0
1          2   99.0     -1.0      NaN     86.0    102.0       50.0
2          3   53.0     94.0     56.0     64.0     51.0       92.0
3          4   72.0     94.0      NaN     73.0     84.0        NaN
4          5   97.0     92.0     73.0     61.0     99.0       74.0
5          6   83.0     91.0     87.0    -17.0     71.0       98.0
6          7    NaN     83.0     96.0     65.0     56.0       84.0
7          8   54.0     70.0     56.0     92.0     94.0       80.0
8          9   64.0     88.0     51.0      NaN     97.0       95.0
9         10   61.0     57.0     59.0     92.0     67.0       75.0
10        11   85.0     78.0      NaN     82.0     82.0       53.0
11        12   64.0     87.0    -13.0     94.0     89.0       68.0
12        13   60.0     50.0     78.0     83.0     88.0       98.0
13        14   98.0     71.0     60.0     74.0     80.0       

Objective 1: Dealing with missing values and inconsistencies


In [27]:
# Handling missing values for numeric columns only
df_numeric = df.select_dtypes(include=[np.number])
df_numeric.fillna(0, inplace=True)

# Checking for missing values again
print("Missing Values After Handling:")
print(df_numeric.isnull().sum())

print(df_numeric)


Missing Values After Handling:
Roll no.     0
Math         0
Physics      0
English      0
History      0
Biology      0
Chemistry    0
dtype: int64
    Roll no.   Math  Physics  English  History  Biology  Chemistry
0          1   60.0     64.0     95.0     87.0     78.0       88.0
1          2   99.0     -1.0      0.0     86.0    102.0       50.0
2          3   53.0     94.0     56.0     64.0     51.0       92.0
3          4   72.0     94.0      0.0     73.0     84.0        0.0
4          5   97.0     92.0     73.0     61.0     99.0       74.0
5          6   83.0     91.0     87.0    -17.0     71.0       98.0
6          7    0.0     83.0     96.0     65.0     56.0       84.0
7          8   54.0     70.0     56.0     92.0     94.0       80.0
8          9   64.0     88.0     51.0      0.0     97.0       95.0
9         10   61.0     57.0     59.0     92.0     67.0       75.0
10        11   85.0     78.0      0.0     82.0     82.0       53.0
11        12   64.0     87.0    -13.0     94.0 

Objective 2: Detecting and handling outliers

In [28]:
# Handling outliers for numeric columns only
df_numeric = df_numeric.select_dtypes(include=[np.number])


# Define upper and lower bounds for outliers
lower_bound = 0
upper_bound = 100

# Handling outliers by replacing them with the closest non-outlier value
for col in df_numeric.columns:
    df_numeric[col] = np.where(df_numeric[col] < lower_bound, lower_bound, df_numeric[col ])
    df_numeric[col] = np.where(df_numeric[col] > upper_bound, lower_bound, df_numeric[col])

# Checking for outliers again
print("Outliers After Handling:")
print(df_numeric)


Outliers After Handling:
    Roll no.  Math  Physics  English  History  Biology  Chemistry
0          1  60.0     64.0     95.0     87.0     78.0       88.0
1          2  99.0      0.0      0.0     86.0      0.0       50.0
2          3  53.0     94.0     56.0     64.0     51.0       92.0
3          4  72.0     94.0      0.0     73.0     84.0        0.0
4          5  97.0     92.0     73.0     61.0     99.0       74.0
5          6  83.0     91.0     87.0      0.0     71.0       98.0
6          7   0.0     83.0     96.0     65.0     56.0       84.0
7          8  54.0     70.0     56.0     92.0     94.0       80.0
8          9  64.0     88.0     51.0      0.0     97.0       95.0
9         10  61.0     57.0     59.0     92.0     67.0       75.0
10        11  85.0     78.0      0.0     82.0     82.0       53.0
11        12  64.0     87.0      0.0     94.0     89.0       68.0
12        13  60.0     50.0     78.0     83.0     88.0       98.0
13        14  98.0     71.0     60.0     74.0     8

Objective 3: Applying data transformations

In [29]:
# Applying data transformations
# For example, we can apply min-max scaling to the scores to change their scale to a common range (0 to 1)
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# Filling missing values again before transformation
df_numeric.fillna(df_numeric.mean(), inplace=True)

# Applying data transformations
scaler = MinMaxScaler()
df_numeric[subjects] = scaler.fit_transform(df_numeric[subjects])

# Checking the transformed data
print("Transformed Data:")
print(df_numeric)


Transformed Data:
    Roll no.      Math   Physics   English   History   Biology  Chemistry
0          1  0.606061  0.646465  0.969388  0.878788  0.787879   0.897959
1          2  1.000000  0.000000  0.000000  0.868687  0.000000   0.510204
2          3  0.535354  0.949495  0.571429  0.646465  0.515152   0.938776
3          4  0.727273  0.949495  0.000000  0.737374  0.848485   0.000000
4          5  0.979798  0.929293  0.744898  0.616162  1.000000   0.755102
5          6  0.838384  0.919192  0.887755  0.000000  0.717172   1.000000
6          7  0.000000  0.838384  0.979592  0.656566  0.565657   0.857143
7          8  0.545455  0.707071  0.571429  0.929293  0.949495   0.816327
8          9  0.646465  0.888889  0.520408  0.000000  0.979798   0.969388
9         10  0.616162  0.575758  0.602041  0.929293  0.676768   0.765306
10        11  0.858586  0.787879  0.000000  0.828283  0.828283   0.540816
11        12  0.646465  0.878788  0.000000  0.949495  0.898990   0.693878
12        13  0.6060