1. write a python program to demostrate various type of cleaning operation using data exploration & imputation.

In [6]:
import pandas as pd
import numpy as np

# Create a sample DataFrame
data = {
    'A': [1, 2, np.nan, 4, 5],
    'B': ['one', 'two', 'three', np.nan, 'five'],
    'C': [np.nan, 2, 3, 4, 5],
    'D': [1, 2, 2, np.nan, 5]
}

df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# 1. Handling missing values

# Drop rows with any NaN values
df_dropna = df.dropna()
print("\nDataFrame after dropping rows with NaN values:")
print(df_dropna)

# Drop rows where all elements are NaN
df_dropna_all = df.dropna(how='all')
print("\nDataFrame after dropping rows where all elements are NaN:")
print(df_dropna_all)

# Fill NaN values with a specific value
df_fillna = df.fillna(0)
print("\nDataFrame after filling NaN values with 0:")
print(df_fillna)

# Fill NaN values with the mean of the column
df_fillna_mean = df.copy()
for column in df_fillna_mean:
    if pd.api.types.is_numeric_dtype(df_fillna_mean[column]):
        df_fillna_mean[column].fillna(df_fillna_mean[column].mean(), inplace=True)
print("\nDataFrame after filling NaN values with column means:")
print(df_fillna_mean)



Original DataFrame:
     A      B    C    D
0  1.0    one  NaN  1.0
1  2.0    two  2.0  2.0
2  NaN  three  3.0  2.0
3  4.0    NaN  4.0  NaN
4  5.0   five  5.0  5.0

DataFrame after dropping rows with NaN values:
     A     B    C    D
1  2.0   two  2.0  2.0
4  5.0  five  5.0  5.0

DataFrame after dropping rows where all elements are NaN:
     A      B    C    D
0  1.0    one  NaN  1.0
1  2.0    two  2.0  2.0
2  NaN  three  3.0  2.0
3  4.0    NaN  4.0  NaN
4  5.0   five  5.0  5.0

DataFrame after filling NaN values with 0:
     A      B    C    D
0  1.0    one  0.0  1.0
1  2.0    two  2.0  2.0
2  0.0  three  3.0  2.0
3  4.0      0  4.0  0.0
4  5.0   five  5.0  5.0

DataFrame after filling NaN values with column means:
     A      B    C    D
0  1.0    one  3.5  1.0
1  2.0    two  2.0  2.0
2  3.0  three  3.0  2.0
3  4.0    NaN  4.0  2.5
4  5.0   five  5.0  5.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_fillna_mean[column].fillna(df_fillna_mean[column].mean(), inplace=True)


In [7]:
# 2. Data Exploration

# Describe the DataFrame
print("\nDescriptive statistics of the DataFrame:")
print(df.describe(include='all'))

# Detect and remove duplicate rows
df_no_duplicates = df.drop_duplicates()
print("\nDataFrame after removing duplicate rows:")
print(df_no_duplicates)




Descriptive statistics of the DataFrame:
               A    B         C         D
count   4.000000    4  4.000000  4.000000
unique       NaN    4       NaN       NaN
top          NaN  one       NaN       NaN
freq         NaN    1       NaN       NaN
mean    3.000000  NaN  3.500000  2.500000
std     1.825742  NaN  1.290994  1.732051
min     1.000000  NaN  2.000000  1.000000
25%     1.750000  NaN  2.750000  1.750000
50%     3.000000  NaN  3.500000  2.000000
75%     4.250000  NaN  4.250000  2.750000
max     5.000000  NaN  5.000000  5.000000

DataFrame after removing duplicate rows:
     A      B    C    D
0  1.0    one  NaN  1.0
1  2.0    two  2.0  2.0
2  NaN  three  3.0  2.0
3  4.0    NaN  4.0  NaN
4  5.0   five  5.0  5.0


In [8]:
# 3. Data Imputation

# Impute missing values using forward fill
df_ffill = df.ffill()
print("\nDataFrame after forward fill imputation:")
print(df_ffill)

# Impute missing values using backward fill
df_bfill = df.bfill()
print("\nDataFrame after backward fill imputation:")
print(df_bfill)



DataFrame after forward fill imputation:
     A      B    C    D
0  1.0    one  NaN  1.0
1  2.0    two  2.0  2.0
2  2.0  three  3.0  2.0
3  4.0  three  4.0  2.0
4  5.0   five  5.0  5.0

DataFrame after backward fill imputation:
     A      B    C    D
0  1.0    one  2.0  1.0
1  2.0    two  2.0  2.0
2  4.0  three  3.0  2.0
3  4.0   five  4.0  5.0
4  5.0   five  5.0  5.0
