In [46]:
import numpy as np
import pandas as pd

In [47]:
# Build a synthetic 40-student roster that the rest of the notebook works on.
#
# Every non-key column is drawn at random, so NumPy's global generator is
# seeded first. Without a seed each "Restart & Run All" yields a different
# table and none of the displayed outputs can be reproduced.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Names are labels only and are not unique ('Daniel' appears twice);
# 'Student_id' is the unique key.
student_names = ['John', 'Alice', 'Bob', 'Eva', 'Charlie', 'Olivia', 'Daniel', 'Sophia', 'David', 'Emma',
                 'James', 'Grace', 'William', 'Ava', 'Michael', 'Chloe', 'Matthew', 'Emily', 'Jacob', 'Mia',
                 'Benjamin', 'Ella', 'Christopher', 'Aiden', 'Liam', 'Zoe', 'Samuel', 'Lily', 'Logan', 'Lucy',
                 'Jackson', 'Avery', 'Elijah', 'Harper', 'Daniel', 'Isabella', 'Sebastian', 'Mila', 'Alexander', 'Scarlett']

df = pd.DataFrame(
    {
        'Student_id' : np.arange(1,41),          # 1..40 inclusive
        'Name' : student_names,
        'Dept' : np.random.choice(['CS', 'IT', 'AI&DS'], 40),
        'Admission_year' : np.random.choice([2022, 2021, 2023], 40),
        'Class' : np.random.choice(['SE', 'TE'], 40),
        # randint's upper bound is exclusive, so marks fall in 20..100.
        'Result' : np.random.randint(20, 101,size= 40),
        'Placed' : np.random.choice(['Yes', 'No'], 40)
    }
)
df

Unnamed: 0,Student_id,Name,Dept,Admission_year,Class,Result,Placed
0,1,John,IT,2021,SE,94,Yes
1,2,Alice,CS,2021,TE,84,No
2,3,Bob,IT,2022,TE,21,No
3,4,Eva,AI&DS,2023,TE,62,Yes
4,5,Charlie,CS,2023,TE,97,No
5,6,Olivia,AI&DS,2021,SE,85,Yes
6,7,Daniel,CS,2022,SE,38,Yes
7,8,Sophia,IT,2023,SE,40,Yes
8,9,David,AI&DS,2021,SE,56,No
9,10,Emma,AI&DS,2021,TE,60,Yes


In [48]:
# Simulate missing data: in every column after 'Student_id' and 'Name',
# blank out 5 randomly chosen rows.
for column in df.columns[2:]:
    # Integer columns cannot hold NaN. Cast them to float explicitly instead
    # of relying on pandas to upcast silently during the assignment below;
    # that implicit upcast is deprecated (FutureWarning since pandas 2.1).
    if pd.api.types.is_integer_dtype(df[column]):
        df[column] = df[column].astype('float64')
    # replace=False guarantees 5 distinct row labels per column.
    random_indices = np.random.choice(df.index, size=5, replace= False)
    df.loc[random_indices, column] = np.nan
df

Unnamed: 0,Student_id,Name,Dept,Admission_year,Class,Result,Placed
0,1,John,IT,2021.0,SE,94.0,Yes
1,2,Alice,CS,2021.0,TE,84.0,No
2,3,Bob,IT,2022.0,,21.0,No
3,4,Eva,AI&DS,2023.0,TE,62.0,Yes
4,5,Charlie,CS,2023.0,TE,97.0,No
5,6,Olivia,AI&DS,2021.0,SE,85.0,Yes
6,7,Daniel,,2022.0,SE,,Yes
7,8,Sophia,IT,2023.0,SE,40.0,
8,9,David,,2021.0,SE,,No
9,10,Emma,,,TE,60.0,Yes


In [49]:
# One extra student whose 'Result' (180.0) lies far above the 20-100 range
# used when the roster was generated, so the outlier check has something
# to find.
new_row = pd.DataFrame(
    {
        'Student_id': [41],
        'Name': ['Sahit'],
        'Dept': ['AI&DS'],
        'Admission_year': [2021],
        'Class': ['TE'],
        'Result': [180.0],
        'Placed': ['No'],
    }
)
# ignore_index=True renumbers the rows so the appended record gets the next
# positional label instead of a duplicate 0.
df = pd.concat((df, new_row), axis=0, ignore_index=True)

In [50]:
# Display the frame after the extra row has been appended.
df

Unnamed: 0,Student_id,Name,Dept,Admission_year,Class,Result,Placed
0,1,John,IT,2021.0,SE,94.0,Yes
1,2,Alice,CS,2021.0,TE,84.0,No
2,3,Bob,IT,2022.0,,21.0,No
3,4,Eva,AI&DS,2023.0,TE,62.0,Yes
4,5,Charlie,CS,2023.0,TE,97.0,No
5,6,Olivia,AI&DS,2021.0,SE,85.0,Yes
6,7,Daniel,,2022.0,SE,,Yes
7,8,Sophia,IT,2023.0,SE,40.0,
8,9,David,,2021.0,SE,,No
9,10,Emma,,,TE,60.0,Yes


In [51]:
# Per-column count of missing values.
df.isna().sum()

Student_id        0
Name              0
Dept              5
Admission_year    5
Class             5
Result            5
Placed            5
dtype: int64

In [52]:
# Split off the numeric columns: the quantile-based outlier check is only
# defined for numbers.
non_numerical_columns = df.select_dtypes(exclude=np.number).columns
is_numeric_column = ~df.columns.isin(non_numerical_columns)
numerical = df.loc[:, is_numeric_column]
numerical

Unnamed: 0,Student_id,Admission_year,Result
0,1,2021.0,94.0
1,2,2021.0,84.0
2,3,2022.0,21.0
3,4,2023.0,62.0
4,5,2023.0,97.0
5,6,2021.0,85.0
6,7,2022.0,
7,8,2023.0,40.0
8,9,2021.0,
9,10,,60.0


In [53]:
# Flag outliers with the 1.5 * IQR rule, evaluated column by column.
Q1, Q3 = (numerical.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Anything further than 1.5 * IQR outside the quartiles counts as an outlier.
lower_limit = Q1 - 1.5 * IQR
upper_limit = Q3 + 1.5 * IQR

# A row is an outlier if ANY of its numeric values falls outside the limits.
# Comparisons against NaN evaluate to False, so missing values never flag a row.
below = numerical.lt(lower_limit)
above = numerical.gt(upper_limit)
outliers = (below | above).any(axis=1)

df.loc[outliers]

Unnamed: 0,Student_id,Name,Dept,Admission_year,Class,Result,Placed
40,41,Sahit,AI&DS,2021.0,TE,180.0,No


In [54]:
# Keep only the rows that were NOT flagged as outliers; `df` itself is left
# unchanged.
is_inlier = ~outliers
cleaned_df = df.loc[is_inlier]
cleaned_df

Unnamed: 0,Student_id,Name,Dept,Admission_year,Class,Result,Placed
0,1,John,IT,2021.0,SE,94.0,Yes
1,2,Alice,CS,2021.0,TE,84.0,No
2,3,Bob,IT,2022.0,,21.0,No
3,4,Eva,AI&DS,2023.0,TE,62.0,Yes
4,5,Charlie,CS,2023.0,TE,97.0,No
5,6,Olivia,AI&DS,2021.0,SE,85.0,Yes
6,7,Daniel,,2022.0,SE,,Yes
7,8,Sophia,IT,2023.0,SE,40.0,
8,9,David,,2021.0,SE,,No
9,10,Emma,,,TE,60.0,Yes


In [55]:
# Impute missing 'Result' values with the column mean (Series.mean() skips
# NaN by default).
# NOTE(review): the mean is computed on `df`, not `cleaned_df`, so rows that
# were flagged as outliers still contribute to it - confirm this is intended.
result_mean = df['Result'].mean()
# Assign the filled Series back to the column. Calling
# fillna(..., inplace=True) on df['Result'] is chained assignment: it emits a
# FutureWarning on pandas 2.x and leaves `df` unmodified under Copy-on-Write.
df['Result'] = df['Result'].fillna(result_mean)

In [56]:
# Re-count missing values per column after the 'Result' imputation.
df.isna().sum()

Student_id        0
Name              0
Dept              5
Admission_year    5
Class             5
Result            0
Placed            5
dtype: int64

In [57]:
from sklearn import preprocessing

# Min-max normalisation of 'Result': the column minimum maps to 0 and the
# maximum to 1. The scaler expects 2-D input, hence the double brackets.
# NOTE(review): this runs on `df`, not `cleaned_df`, so any outlier row still
# in `df` sets the maximum - confirm this is intended.
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(df[['Result']])
x_scaled = min_max_scaler.transform(df[['Result']])
df['Result'] = x_scaled
df['Result']

0     0.462500
1     0.400000
2     0.006250
3     0.262500
4     0.481250
5     0.406250
6     0.248264
7     0.125000
8     0.248264
9     0.250000
10    0.143750
11    0.000000
12    0.425000
13    0.006250
14    0.081250
15    0.368750
16    0.312500
17    0.156250
18    0.418750
19    0.143750
20    0.306250
21    0.462500
22    0.462500
23    0.087500
24    0.018750
25    0.331250
26    0.125000
27    0.143750
28    0.118750
29    0.025000
30    0.248264
31    0.206250
32    0.275000
33    0.006250
34    0.087500
35    0.468750
36    0.248264
37    0.187500
38    0.248264
39    0.175000
40    1.000000
Name: Result, dtype: float64

In [58]:
# Summary statistics of the scaled 'Result' column.
df['Result'].describe()

count    41.000000
mean      0.248264
std       0.191211
min       0.000000
25%       0.125000
50%       0.248264
75%       0.368750
max       1.000000
Name: Result, dtype: float64

In [59]:
# (rows, columns) of the working frame.
df.shape

(41, 7)