### Missing Value Treatment

In [736]:
import pandas as pd
import numpy as np

In [737]:
data = { 'Age': [25, 30, np.nan, 35, 40, np.nan, 22, 28, 45, 50, np.nan, 37, 29, 41, 48, 33, 38, np.nan, 44, 51], 
        'Income': [50000, 60000, 75000, np.nan, 80000, 42000, 55000, np.nan, 67000, 72000, np.nan, 69000, 59000, 68000, 62000, 71000, 58000, np.nan, 70000, np.nan],
        'Gender': ['M', 'F', np.nan, 'F', 'M', np.nan, 'M', 'F', 'M', 'M', np.nan, 'M', 'F', 'M', 'F', 'F', 'M', 'M', 'F', 'M'], 
        'Education': ['Bachelor', 'Master', np.nan, 'Bachelor', 'Bachelor', 'Master', 'PhD', np.nan, 'Bachelor', 'PhD', 'Bachelor', 'Master', 'PhD', 'Bachelor', 'Master', 'Bachelor', 'Master', 'PhD', 'Bachelor', np.nan], 
        'Employment_Status': ['Employed', 'Employed', 'Unemployed', 'Employed', 'Employed', 'Employed', np.nan, 'Employed', 'Employed', 'Employed', 'Employed', 'Unemployed', 'Employed', 'Employed', 'Employed', 'Employed', 'Unemployed', 'Employed', 'Employed', 'Employed'],
        'Years_of_Experience': [3, 5, 0, 8, 10, 2, 0, 4, 12, 15, np.nan, 9, 4, 11, 14, 7, 13, 1, 5, 16],
        'Marital_Status': ['Single', 'Married', 'Single', 'Married', 'Single', 'Married', np.nan, 'Married', 'Single', 'Married', 'Single', 'Married', 'Single', 'Married', 'Single', 'Married', 'Single', np.nan, 'Married', 'Married'],
        'Children_Count': [0, 1, 0, 2, 0, 2, np.nan, 3, 1, 2, 2, 1, 0, 3, 2, 1, 2, 0, 2, np.nan],
        'Home_Ownership': ['Rent', 'Own', 'Rent', 'Own', 'Rent', 'Own', np.nan, 'Own', 'Rent', 'Own', 'Rent', 'Own', 'Rent', 'Own', 'Rent', 'Own', 'Rent', 'Rent', 'Own', np.nan] }

In [738]:
df=pd.DataFrame(data)
df

Unnamed: 0,Age,Income,Gender,Education,Employment_Status,Years_of_Experience,Marital_Status,Children_Count,Home_Ownership
0,25.0,50000.0,M,Bachelor,Employed,3.0,Single,0.0,Rent
1,30.0,60000.0,F,Master,Employed,5.0,Married,1.0,Own
2,,75000.0,,,Unemployed,0.0,Single,0.0,Rent
3,35.0,,F,Bachelor,Employed,8.0,Married,2.0,Own
4,40.0,80000.0,M,Bachelor,Employed,10.0,Single,0.0,Rent
5,,42000.0,,Master,Employed,2.0,Married,2.0,Own
6,22.0,55000.0,M,PhD,,0.0,,,
7,28.0,,F,,Employed,4.0,Married,3.0,Own
8,45.0,67000.0,M,Bachelor,Employed,12.0,Single,1.0,Rent
9,50.0,72000.0,M,PhD,Employed,15.0,Married,2.0,Own


### What percentage of missing values is there in the 'Income' column?

In [739]:
specified_column = 'Income'
missing_percentage = (df[specified_column].isnull().sum() / len(df)) * 100
print(f"Percentage of missing values in column '{specified_column}': {missing_percentage:.2f}%")


Percentage of missing values in column 'Income': 25.00%


### What is the most common education level ('Education' column) among the non-missing values?

In [740]:
mode_value = df['Education'].mode().iloc[0]      #iloc[0] is used to select the first (most common) mode value from the Series
mode_value


'Bachelor'

### How many individuals are employed ('Employment_Status' column) among the non-missing values?

In [741]:
column = 'Employment_Status'
value = 'Employed'
count = df[df[column].notnull()][column].eq(value).sum()
count


16

### What is the average 'Years_of_Experience' among individuals with non-missing values for this column?

In [742]:
average = df['Years_of_Experience'].mean()
average

7.315789473684211

### How many individuals have 'Marital_Status' set to 'Married' among the non-missing values?

In [743]:
column = 'Marital_Status'
value = 'Married'
count = df[df[column].notnull()][column].eq(value).sum()
count

10

### What is the most common home ownership status ('Home_Ownership' column) among the non-missing values?

In [744]:
common_status = df['Home_Ownership'].mode().iloc[0]
common_status

'Own'

### change the value of employee status to unemployed if the education is bachelor

In [745]:
df.loc[df['Education'] == 'Bachelor', 'Employment_Status'] = 'Unemployed'
df

Unnamed: 0,Age,Income,Gender,Education,Employment_Status,Years_of_Experience,Marital_Status,Children_Count,Home_Ownership
0,25.0,50000.0,M,Bachelor,Unemployed,3.0,Single,0.0,Rent
1,30.0,60000.0,F,Master,Employed,5.0,Married,1.0,Own
2,,75000.0,,,Unemployed,0.0,Single,0.0,Rent
3,35.0,,F,Bachelor,Unemployed,8.0,Married,2.0,Own
4,40.0,80000.0,M,Bachelor,Unemployed,10.0,Single,0.0,Rent
5,,42000.0,,Master,Employed,2.0,Married,2.0,Own
6,22.0,55000.0,M,PhD,,0.0,,,
7,28.0,,F,,Employed,4.0,Married,3.0,Own
8,45.0,67000.0,M,Bachelor,Unemployed,12.0,Single,1.0,Rent
9,50.0,72000.0,M,PhD,Employed,15.0,Married,2.0,Own


### find the average sallary of each educational group

In [746]:
average_salary = df.groupby('Education')['Income'].mean()
average_salary

Education
Bachelor    67666.666667
Master      58200.000000
PhD         62000.000000
Name: Income, dtype: float64

### Random Sample Imputation

In [747]:
non_missing_values = df['Education'].dropna()
non_missing_values

0     Bachelor
1       Master
3     Bachelor
4     Bachelor
5       Master
6          PhD
8     Bachelor
9          PhD
10    Bachelor
11      Master
12         PhD
13    Bachelor
14      Master
15    Bachelor
16      Master
17         PhD
18    Bachelor
Name: Education, dtype: object

In [748]:
num_missing_values = df['Education'].isnull().sum()
num_missing_values

3

In [749]:
random_indices = np.random.choice(non_missing_values.index, size=num_missing_values, replace=True)
random_indices

array([16, 13, 12], dtype=int64)

In [750]:
df.loc[df['Education'].isnull(), 'Education'] = non_missing_values.loc[random_indices].values
df.loc[df['Education'].isnull(), 'Education']

Series([], Name: Education, dtype: object)

In [751]:
df

Unnamed: 0,Age,Income,Gender,Education,Employment_Status,Years_of_Experience,Marital_Status,Children_Count,Home_Ownership
0,25.0,50000.0,M,Bachelor,Unemployed,3.0,Single,0.0,Rent
1,30.0,60000.0,F,Master,Employed,5.0,Married,1.0,Own
2,,75000.0,,Master,Unemployed,0.0,Single,0.0,Rent
3,35.0,,F,Bachelor,Unemployed,8.0,Married,2.0,Own
4,40.0,80000.0,M,Bachelor,Unemployed,10.0,Single,0.0,Rent
5,,42000.0,,Master,Employed,2.0,Married,2.0,Own
6,22.0,55000.0,M,PhD,,0.0,,,
7,28.0,,F,Bachelor,Employed,4.0,Married,3.0,Own
8,45.0,67000.0,M,Bachelor,Unemployed,12.0,Single,1.0,Rent
9,50.0,72000.0,M,PhD,Employed,15.0,Married,2.0,Own


### Capturing NAN values with a new feature

In [752]:
df['Age_Missing'] = df['Age'].isnull().astype(int)
df['Income_Missing'] = df['Income'].isnull().astype(int)
df['Gender_Missing'] = df['Gender'].isnull().astype(int)

In [753]:
df

Unnamed: 0,Age,Income,Gender,Education,Employment_Status,Years_of_Experience,Marital_Status,Children_Count,Home_Ownership,Age_Missing,Income_Missing,Gender_Missing
0,25.0,50000.0,M,Bachelor,Unemployed,3.0,Single,0.0,Rent,0,0,0
1,30.0,60000.0,F,Master,Employed,5.0,Married,1.0,Own,0,0,0
2,,75000.0,,Master,Unemployed,0.0,Single,0.0,Rent,1,0,1
3,35.0,,F,Bachelor,Unemployed,8.0,Married,2.0,Own,0,1,0
4,40.0,80000.0,M,Bachelor,Unemployed,10.0,Single,0.0,Rent,0,0,0
5,,42000.0,,Master,Employed,2.0,Married,2.0,Own,1,0,1
6,22.0,55000.0,M,PhD,,0.0,,,,0,0,0
7,28.0,,F,Bachelor,Employed,4.0,Married,3.0,Own,0,1,0
8,45.0,67000.0,M,Bachelor,Unemployed,12.0,Single,1.0,Rent,0,0,0
9,50.0,72000.0,M,PhD,Employed,15.0,Married,2.0,Own,0,0,0


### Arbitrary imputation

In [754]:
arbitrary_value = -1
df['Age'].fillna(arbitrary_value)

0     25.0
1     30.0
2     -1.0
3     35.0
4     40.0
5     -1.0
6     22.0
7     28.0
8     45.0
9     50.0
10    -1.0
11    37.0
12    29.0
13    41.0
14    48.0
15    33.0
16    38.0
17    -1.0
18    44.0
19    51.0
Name: Age, dtype: float64

### End of Distribution imputation

In [755]:
df['Age'].fillna(df['Age'].max(), inplace=True)
df

Unnamed: 0,Age,Income,Gender,Education,Employment_Status,Years_of_Experience,Marital_Status,Children_Count,Home_Ownership,Age_Missing,Income_Missing,Gender_Missing
0,25.0,50000.0,M,Bachelor,Unemployed,3.0,Single,0.0,Rent,0,0,0
1,30.0,60000.0,F,Master,Employed,5.0,Married,1.0,Own,0,0,0
2,51.0,75000.0,,Master,Unemployed,0.0,Single,0.0,Rent,1,0,1
3,35.0,,F,Bachelor,Unemployed,8.0,Married,2.0,Own,0,1,0
4,40.0,80000.0,M,Bachelor,Unemployed,10.0,Single,0.0,Rent,0,0,0
5,51.0,42000.0,,Master,Employed,2.0,Married,2.0,Own,1,0,1
6,22.0,55000.0,M,PhD,,0.0,,,,0,0,0
7,28.0,,F,Bachelor,Employed,4.0,Married,3.0,Own,0,1,0
8,45.0,67000.0,M,Bachelor,Unemployed,12.0,Single,1.0,Rent,0,0,0
9,50.0,72000.0,M,PhD,Employed,15.0,Married,2.0,Own,0,0,0


In [756]:
x = df["Age"].mean()

df["Age"].fillna(x,inplace=True)
df

Unnamed: 0,Age,Income,Gender,Education,Employment_Status,Years_of_Experience,Marital_Status,Children_Count,Home_Ownership,Age_Missing,Income_Missing,Gender_Missing
0,25.0,50000.0,M,Bachelor,Unemployed,3.0,Single,0.0,Rent,0,0,0
1,30.0,60000.0,F,Master,Employed,5.0,Married,1.0,Own,0,0,0
2,51.0,75000.0,,Master,Unemployed,0.0,Single,0.0,Rent,1,0,1
3,35.0,,F,Bachelor,Unemployed,8.0,Married,2.0,Own,0,1,0
4,40.0,80000.0,M,Bachelor,Unemployed,10.0,Single,0.0,Rent,0,0,0
5,51.0,42000.0,,Master,Employed,2.0,Married,2.0,Own,1,0,1
6,22.0,55000.0,M,PhD,,0.0,,,,0,0,0
7,28.0,,F,Bachelor,Employed,4.0,Married,3.0,Own,0,1,0
8,45.0,67000.0,M,Bachelor,Unemployed,12.0,Single,1.0,Rent,0,0,0
9,50.0,72000.0,M,PhD,Employed,15.0,Married,2.0,Own,0,0,0


In [757]:
x = df["Income"].mean()
df["Income"].fillna(x,inplace=True)
df

Unnamed: 0,Age,Income,Gender,Education,Employment_Status,Years_of_Experience,Marital_Status,Children_Count,Home_Ownership,Age_Missing,Income_Missing,Gender_Missing
0,25.0,50000.0,M,Bachelor,Unemployed,3.0,Single,0.0,Rent,0,0,0
1,30.0,60000.0,F,Master,Employed,5.0,Married,1.0,Own,0,0,0
2,51.0,75000.0,,Master,Unemployed,0.0,Single,0.0,Rent,1,0,1
3,35.0,63866.666667,F,Bachelor,Unemployed,8.0,Married,2.0,Own,0,1,0
4,40.0,80000.0,M,Bachelor,Unemployed,10.0,Single,0.0,Rent,0,0,0
5,51.0,42000.0,,Master,Employed,2.0,Married,2.0,Own,1,0,1
6,22.0,55000.0,M,PhD,,0.0,,,,0,0,0
7,28.0,63866.666667,F,Bachelor,Employed,4.0,Married,3.0,Own,0,1,0
8,45.0,67000.0,M,Bachelor,Unemployed,12.0,Single,1.0,Rent,0,0,0
9,50.0,72000.0,M,PhD,Employed,15.0,Married,2.0,Own,0,0,0


In [758]:
x = df["Gender"].mode()[0]
df["Gender"].fillna(x,inplace=True)
df

Unnamed: 0,Age,Income,Gender,Education,Employment_Status,Years_of_Experience,Marital_Status,Children_Count,Home_Ownership,Age_Missing,Income_Missing,Gender_Missing
0,25.0,50000.0,M,Bachelor,Unemployed,3.0,Single,0.0,Rent,0,0,0
1,30.0,60000.0,F,Master,Employed,5.0,Married,1.0,Own,0,0,0
2,51.0,75000.0,M,Master,Unemployed,0.0,Single,0.0,Rent,1,0,1
3,35.0,63866.666667,F,Bachelor,Unemployed,8.0,Married,2.0,Own,0,1,0
4,40.0,80000.0,M,Bachelor,Unemployed,10.0,Single,0.0,Rent,0,0,0
5,51.0,42000.0,M,Master,Employed,2.0,Married,2.0,Own,1,0,1
6,22.0,55000.0,M,PhD,,0.0,,,,0,0,0
7,28.0,63866.666667,F,Bachelor,Employed,4.0,Married,3.0,Own,0,1,0
8,45.0,67000.0,M,Bachelor,Unemployed,12.0,Single,1.0,Rent,0,0,0
9,50.0,72000.0,M,PhD,Employed,15.0,Married,2.0,Own,0,0,0


In [759]:
x = df["Age"].mean()
df["Age"].fillna(x,inplace=True)
df

Unnamed: 0,Age,Income,Gender,Education,Employment_Status,Years_of_Experience,Marital_Status,Children_Count,Home_Ownership,Age_Missing,Income_Missing,Gender_Missing
0,25.0,50000.0,M,Bachelor,Unemployed,3.0,Single,0.0,Rent,0,0,0
1,30.0,60000.0,F,Master,Employed,5.0,Married,1.0,Own,0,0,0
2,51.0,75000.0,M,Master,Unemployed,0.0,Single,0.0,Rent,1,0,1
3,35.0,63866.666667,F,Bachelor,Unemployed,8.0,Married,2.0,Own,0,1,0
4,40.0,80000.0,M,Bachelor,Unemployed,10.0,Single,0.0,Rent,0,0,0
5,51.0,42000.0,M,Master,Employed,2.0,Married,2.0,Own,1,0,1
6,22.0,55000.0,M,PhD,,0.0,,,,0,0,0
7,28.0,63866.666667,F,Bachelor,Employed,4.0,Married,3.0,Own,0,1,0
8,45.0,67000.0,M,Bachelor,Unemployed,12.0,Single,1.0,Rent,0,0,0
9,50.0,72000.0,M,PhD,Employed,15.0,Married,2.0,Own,0,0,0


### Frequent categories imputation

In [760]:
x = df["Education"].mode()[0]

df["Education"].fillna(x, inplace = True)
df

Unnamed: 0,Age,Income,Gender,Education,Employment_Status,Years_of_Experience,Marital_Status,Children_Count,Home_Ownership,Age_Missing,Income_Missing,Gender_Missing
0,25.0,50000.0,M,Bachelor,Unemployed,3.0,Single,0.0,Rent,0,0,0
1,30.0,60000.0,F,Master,Employed,5.0,Married,1.0,Own,0,0,0
2,51.0,75000.0,M,Master,Unemployed,0.0,Single,0.0,Rent,1,0,1
3,35.0,63866.666667,F,Bachelor,Unemployed,8.0,Married,2.0,Own,0,1,0
4,40.0,80000.0,M,Bachelor,Unemployed,10.0,Single,0.0,Rent,0,0,0
5,51.0,42000.0,M,Master,Employed,2.0,Married,2.0,Own,1,0,1
6,22.0,55000.0,M,PhD,,0.0,,,,0,0,0
7,28.0,63866.666667,F,Bachelor,Employed,4.0,Married,3.0,Own,0,1,0
8,45.0,67000.0,M,Bachelor,Unemployed,12.0,Single,1.0,Rent,0,0,0
9,50.0,72000.0,M,PhD,Employed,15.0,Married,2.0,Own,0,0,0


In [761]:
x = df["Employment_Status"].mode()[0]

df["Employment_Status"].fillna(x, inplace = True)
df

Unnamed: 0,Age,Income,Gender,Education,Employment_Status,Years_of_Experience,Marital_Status,Children_Count,Home_Ownership,Age_Missing,Income_Missing,Gender_Missing
0,25.0,50000.0,M,Bachelor,Unemployed,3.0,Single,0.0,Rent,0,0,0
1,30.0,60000.0,F,Master,Employed,5.0,Married,1.0,Own,0,0,0
2,51.0,75000.0,M,Master,Unemployed,0.0,Single,0.0,Rent,1,0,1
3,35.0,63866.666667,F,Bachelor,Unemployed,8.0,Married,2.0,Own,0,1,0
4,40.0,80000.0,M,Bachelor,Unemployed,10.0,Single,0.0,Rent,0,0,0
5,51.0,42000.0,M,Master,Employed,2.0,Married,2.0,Own,1,0,1
6,22.0,55000.0,M,PhD,Unemployed,0.0,,,,0,0,0
7,28.0,63866.666667,F,Bachelor,Employed,4.0,Married,3.0,Own,0,1,0
8,45.0,67000.0,M,Bachelor,Unemployed,12.0,Single,1.0,Rent,0,0,0
9,50.0,72000.0,M,PhD,Employed,15.0,Married,2.0,Own,0,0,0


In [762]:
x = df["Years_of_Experience"].mean()
df["Years_of_Experience"].fillna(x,inplace=True)
df

Unnamed: 0,Age,Income,Gender,Education,Employment_Status,Years_of_Experience,Marital_Status,Children_Count,Home_Ownership,Age_Missing,Income_Missing,Gender_Missing
0,25.0,50000.0,M,Bachelor,Unemployed,3.0,Single,0.0,Rent,0,0,0
1,30.0,60000.0,F,Master,Employed,5.0,Married,1.0,Own,0,0,0
2,51.0,75000.0,M,Master,Unemployed,0.0,Single,0.0,Rent,1,0,1
3,35.0,63866.666667,F,Bachelor,Unemployed,8.0,Married,2.0,Own,0,1,0
4,40.0,80000.0,M,Bachelor,Unemployed,10.0,Single,0.0,Rent,0,0,0
5,51.0,42000.0,M,Master,Employed,2.0,Married,2.0,Own,1,0,1
6,22.0,55000.0,M,PhD,Unemployed,0.0,,,,0,0,0
7,28.0,63866.666667,F,Bachelor,Employed,4.0,Married,3.0,Own,0,1,0
8,45.0,67000.0,M,Bachelor,Unemployed,12.0,Single,1.0,Rent,0,0,0
9,50.0,72000.0,M,PhD,Employed,15.0,Married,2.0,Own,0,0,0


In [763]:
x = df["Marital_Status"].mode()[0]

df["Marital_Status"].fillna(x, inplace = True)
df

Unnamed: 0,Age,Income,Gender,Education,Employment_Status,Years_of_Experience,Marital_Status,Children_Count,Home_Ownership,Age_Missing,Income_Missing,Gender_Missing
0,25.0,50000.0,M,Bachelor,Unemployed,3.0,Single,0.0,Rent,0,0,0
1,30.0,60000.0,F,Master,Employed,5.0,Married,1.0,Own,0,0,0
2,51.0,75000.0,M,Master,Unemployed,0.0,Single,0.0,Rent,1,0,1
3,35.0,63866.666667,F,Bachelor,Unemployed,8.0,Married,2.0,Own,0,1,0
4,40.0,80000.0,M,Bachelor,Unemployed,10.0,Single,0.0,Rent,0,0,0
5,51.0,42000.0,M,Master,Employed,2.0,Married,2.0,Own,1,0,1
6,22.0,55000.0,M,PhD,Unemployed,0.0,Married,,,0,0,0
7,28.0,63866.666667,F,Bachelor,Employed,4.0,Married,3.0,Own,0,1,0
8,45.0,67000.0,M,Bachelor,Unemployed,12.0,Single,1.0,Rent,0,0,0
9,50.0,72000.0,M,PhD,Employed,15.0,Married,2.0,Own,0,0,0


In [764]:
x = df["Children_Count"].mean()
df["Children_Count"].fillna(x,inplace=True)
df

Unnamed: 0,Age,Income,Gender,Education,Employment_Status,Years_of_Experience,Marital_Status,Children_Count,Home_Ownership,Age_Missing,Income_Missing,Gender_Missing
0,25.0,50000.0,M,Bachelor,Unemployed,3.0,Single,0.0,Rent,0,0,0
1,30.0,60000.0,F,Master,Employed,5.0,Married,1.0,Own,0,0,0
2,51.0,75000.0,M,Master,Unemployed,0.0,Single,0.0,Rent,1,0,1
3,35.0,63866.666667,F,Bachelor,Unemployed,8.0,Married,2.0,Own,0,1,0
4,40.0,80000.0,M,Bachelor,Unemployed,10.0,Single,0.0,Rent,0,0,0
5,51.0,42000.0,M,Master,Employed,2.0,Married,2.0,Own,1,0,1
6,22.0,55000.0,M,PhD,Unemployed,0.0,Married,1.333333,,0,0,0
7,28.0,63866.666667,F,Bachelor,Employed,4.0,Married,3.0,Own,0,1,0
8,45.0,67000.0,M,Bachelor,Unemployed,12.0,Single,1.0,Rent,0,0,0
9,50.0,72000.0,M,PhD,Employed,15.0,Married,2.0,Own,0,0,0


In [765]:
x = df["Home_Ownership"].mode()[0]

df["Home_Ownership"].fillna(x, inplace = True)
df

Unnamed: 0,Age,Income,Gender,Education,Employment_Status,Years_of_Experience,Marital_Status,Children_Count,Home_Ownership,Age_Missing,Income_Missing,Gender_Missing
0,25.0,50000.0,M,Bachelor,Unemployed,3.0,Single,0.0,Rent,0,0,0
1,30.0,60000.0,F,Master,Employed,5.0,Married,1.0,Own,0,0,0
2,51.0,75000.0,M,Master,Unemployed,0.0,Single,0.0,Rent,1,0,1
3,35.0,63866.666667,F,Bachelor,Unemployed,8.0,Married,2.0,Own,0,1,0
4,40.0,80000.0,M,Bachelor,Unemployed,10.0,Single,0.0,Rent,0,0,0
5,51.0,42000.0,M,Master,Employed,2.0,Married,2.0,Own,1,0,1
6,22.0,55000.0,M,PhD,Unemployed,0.0,Married,1.333333,Own,0,0,0
7,28.0,63866.666667,F,Bachelor,Employed,4.0,Married,3.0,Own,0,1,0
8,45.0,67000.0,M,Bachelor,Unemployed,12.0,Single,1.0,Rent,0,0,0
9,50.0,72000.0,M,PhD,Employed,15.0,Married,2.0,Own,0,0,0


In [766]:
columns_to_convert = ['Age', 'Income','Years_of_Experience','Children_Count']
df[columns_to_convert] = df[columns_to_convert].astype(int)
df

Unnamed: 0,Age,Income,Gender,Education,Employment_Status,Years_of_Experience,Marital_Status,Children_Count,Home_Ownership,Age_Missing,Income_Missing,Gender_Missing
0,25,50000,M,Bachelor,Unemployed,3,Single,0,Rent,0,0,0
1,30,60000,F,Master,Employed,5,Married,1,Own,0,0,0
2,51,75000,M,Master,Unemployed,0,Single,0,Rent,1,0,1
3,35,63866,F,Bachelor,Unemployed,8,Married,2,Own,0,1,0
4,40,80000,M,Bachelor,Unemployed,10,Single,0,Rent,0,0,0
5,51,42000,M,Master,Employed,2,Married,2,Own,1,0,1
6,22,55000,M,PhD,Unemployed,0,Married,1,Own,0,0,0
7,28,63866,F,Bachelor,Employed,4,Married,3,Own,0,1,0
8,45,67000,M,Bachelor,Unemployed,12,Single,1,Rent,0,0,0
9,50,72000,M,PhD,Employed,15,Married,2,Own,0,0,0


In [767]:
data = {
    'Age': [25, 30, 22, np.nan, np.nan, np.nan, 28, 32, 29],
}

df = pd.DataFrame(data)

# Calculate the extreme value (e.g., 99th percentile) for Age
extreme_value = df['Age'].quantile(0.99)

# Replace missing values with the extreme value
df['Age'].fillna(extreme_value, inplace=True)

# Print the updated DataFrame
print(df)

    Age
0  25.0
1  30.0
2  22.0
3  31.9
4  31.9
5  31.9
6  28.0
7  32.0
8  29.0


In [768]:
df

Unnamed: 0,Age
0,25.0
1,30.0
2,22.0
3,31.9
4,31.9
5,31.9
6,28.0
7,32.0
8,29.0
