In [1]:
import pandas as pd
import numpy as np
!pip install sk

# Deletion

In [2]:
# Build a small 5x3 frame in which every column holds exactly one NaN.
df = pd.DataFrame(
    dict(
        col1=[1, 2, np.nan, 4, 5],
        col2=[6, np.nan, 8, 9, 10],
        col3=[11, 12, 13, np.nan, 15],
    )
)

# Show the untouched input first.
print("Original data:\n", df)

# Each entry pairs the printed heading with the dropna() arguments that
# implement the corresponding deletion strategy:
#   - list-wise:   drop every row that contains at least one NaN
#   - pair-wise:   drop rows with a NaN in col1 or col2 only (col3 may stay NaN)
#   - column-wise: drop every column that contains at least one NaN
deletion_variants = [
    ("\nDataframe after list-wise deletion:\n", {"axis": 0}),
    ("\nDataframe after pair-wise deletion:\n", {"subset": ['col1', 'col2']}),
    ("\nDataframe after column-wise deletion:\n", {"axis": 1}),
]
for heading, dropna_kwargs in deletion_variants:
    # dropna() returns a new frame; df itself is never modified.
    new_df = df.dropna(**dropna_kwargs)
    print(heading, new_df)

Original data:
    col1  col2  col3
0   1.0   6.0  11.0
1   2.0   NaN  12.0
2   NaN   8.0  13.0
3   4.0   9.0   NaN
4   5.0  10.0  15.0

Dataframe after list-wise deletion:
    col1  col2  col3
0   1.0   6.0  11.0
4   5.0  10.0  15.0

Dataframe after pair-wise deletion:
    col1  col2  col3
0   1.0   6.0  11.0
3   4.0   9.0   NaN
4   5.0  10.0  15.0

Dataframe after column-wise deletion:
 Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]


# Imputation

In [3]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression

# create a dataframe with missing values (one NaN in each column)
df = pd.DataFrame({
    'col1': [1, 2, np.nan, 4, 5],
    'col2': [6, np.nan, 8, 9, 10],
    'col3': [11, 12, 13, np.nan, 15]
})

# display the dataframe
print("Original data:\n", df)

# NOTE on the three imputations below: the filled column is assigned back to
# new_df instead of calling new_df['col1'].fillna(..., inplace=True). That
# chained in-place call operates on an intermediate Series; it triggers a
# warning on recent pandas and does not update new_df under copy-on-write.

# Mean Imputation: fill the NaN in col1 with the mean of the observed values
new_df = df.copy()
mean_value = new_df['col1'].mean()
new_df['col1'] = new_df['col1'].fillna(mean_value)
print("\nMean Imputation:\n", new_df)

# Median Imputation: fill the NaN in col1 with the median of the observed values
new_df = df.copy()
median_value = new_df['col1'].median()
new_df['col1'] = new_df['col1'].fillna(median_value)
print("\nMedian Imputation:\n", new_df)

# Mode Imputation: fill the NaN in col1 with the most frequent observed value.
# mode() returns a Series because several values can tie; [0] picks the first.
new_df = df.copy()
mode_value = new_df['col1'].mode()[0]
new_df['col1'] = new_df['col1'].fillna(mode_value)
print("\nMode Imputation:\n", new_df)

# Regression Imputation: predict the missing col1 values from col2 and col3
new_df = df.copy()
predictor_cols = ['col2', 'col3']
complete_rows = new_df.dropna()        # rows without any missing value
col1_missing = new_df['col1'].isna()   # boolean mask of rows whose col1 must be imputed

# Fit on the complete rows only: predictors -> col1
x_train = complete_rows[predictor_cols]
y_train = complete_rows['col1']
model = LinearRegression()
model.fit(x_train, y_train)

# Predict col1 for the rows where it is missing and write the values back
x_test = new_df.loc[col1_missing, predictor_cols]
new_df.loc[col1_missing, 'col1'] = model.predict(x_test)
print("\nRegression Imputation:\n", new_df)

# Hot-Deck Imputation: fill each missing col1 value with the value of the
# previous observation (last observation carried forward).
# ffill() replaces a manual iloc[i-1] loop, which wrapped around to the LAST
# row (index -1) whenever the very first row was missing. A leading NaN has no
# previous observation and is therefore left as NaN.
new_df = df.copy()
new_df['col1'] = new_df['col1'].ffill()
print("\nHot-Deck Imputation:\n", new_df)

# K-Nearest Neighbors (KNN) imputation, using 2 neighbours per missing value
imputer = KNNImputer(n_neighbors=2)
# Fit on the whole frame and impute in one step; the column labels are
# re-attached below when the result is wrapped in a DataFrame again.
imputed_values = imputer.fit_transform(df)
df_impute_knn = pd.DataFrame(data=imputed_values, columns=df.columns)
print("\nK-Nearest Neighbors (KNN) imputation:\n", df_impute_knn)

ModuleNotFoundError: No module named 'sklearn'

In [7]:
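# Disabled on purpose: "sklearn" is not the right PyPI name; the package to install is "scikit-learn".
#!pip install sklearn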

In [6]:
!pip install scikit-learn

