## Drop Features

In [None]:
# Remove the columns that are not needed for the analysis.
# `columns=` already selects the axis, so the extra `axis=1` is dropped, and the
# result is reassigned instead of mutating the frame with inplace=True.
yourData = yourData.drop(columns=['feature1', '...'])

## Missing Values

**Distribution of the feature (numeric)**

In [None]:
# Density plot of a single numeric feature (sns.histplot or sns.boxplot work here too).
fig, ax = plt.subplots(figsize=(10, 5))  # adjust the figure size here

sns.kdeplot(x=yourData['featureNumeric'], ax=ax)

fig.tight_layout()
plt.show()

**Distribution of the feature (categorical)**

In [None]:
# Bar chart of how often each category occurs in a single categorical feature.
plt.figure(figsize=(10, 5)) # You can adjust size of your plot here

# This cell is for a categorical column; the numeric placeholder was a copy-paste
# slip (countplot draws one bar per distinct value, which is unreadable for a
# continuous feature).
sns.countplot(x=yourData['featureCategoric'])

plt.tight_layout()
plt.show()

**See aggregation**

In [None]:
# Summarise the feature that has missing values per segment, to help choose a
# sensible fill value. The last expression is displayed as the cell output.
(
    yourData
    .groupby(['feature1', '...'])
    .agg({'featureMissVal': ['agg1', '...']})
    .reset_index()
)

**Fill without segmentation**

In [None]:
# Fill every missing value of the column with one constant (or a statistic).
# The filled column is assigned back explicitly: fillna(inplace=True) on a
# selected column is chained assignment, which raises a warning in pandas 2.x
# and leaves yourData unchanged once Copy-on-Write is enabled.
yourData['featureMissVal'] = yourData['featureMissVal'].fillna('usingWhat')

**Fill with segmentation**

In [None]:
# Fill each missing value with the median of its own segment.
# transform() returns one value per original row (aligned to yourData's index),
# so only this column is replaced. Assigning the groupby result to `yourData`
# itself would overwrite the whole DataFrame with a single Series.
segmentFill = yourData.groupby(['feature1', '...'])['featureMissVal'].transform('median') # You can adjust to use median, mean, or anything you want
# fillna only touches the missing entries; rows whose segment has no median keep their value.
yourData['featureMissVal'] = yourData['featureMissVal'].fillna(segmentFill)

## Duplicated Data

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each.
# The result is reassigned rather than using inplace=True, matching how the
# outlier filter rebinds `yourData`.
yourData = yourData.drop_duplicates(keep='first')

## Outliers

**Checking outliers**

In [None]:
# Draw one box plot per numeric column to spot outliers visually.
plt.figure(figsize=(10, 5))

# `row` and `cols` define the subplot grid; adjust them to the number of columns plotted.
for position, column in enumerate(featureNumeric.columns, start=1):
    plt.subplot(row, cols, position)
    plt.title('{}'.format(column))
    sns.boxplot(x=yourData[column])

plt.tight_layout()
plt.show()

**Handle it with IQR**

In [None]:
# Remove rows that fall outside the 1.5 * IQR fences of any listed feature.
outliers = ['feature1', '...']

# Compute the fences of every listed feature on the same, unfiltered data.
# Filtering inside a per-feature loop made each later feature's quartiles depend
# on the rows already removed for earlier features, so the outcome changed with
# the order of the list.
Q1 = yourData[outliers].quantile(0.25)
Q3 = yourData[outliers].quantile(0.75)
IQR = Q3 - Q1
low_limit = Q1 - (1.5 * IQR)
high_limit = Q3 + (1.5 * IQR)

# Keep a row only when all listed features lie within their fences (bounds are
# inclusive). A missing value fails both comparisons, so such rows are dropped,
# exactly as the original comparison did.
inRange = yourData[outliers].ge(low_limit, axis='columns') & yourData[outliers].le(high_limit, axis='columns')
yourData = yourData[inRange.all(axis=1)]

yourData.head()

## Skewness & Normalization

**Checking skew**

In [None]:
# Draw one histogram per numeric column to inspect skewness (sns.kdeplot works too).
plt.figure(figsize=(10, 5))

# `row` and `cols` define the subplot grid; adjust them to the number of columns plotted.
for position, column in enumerate(featureNumeric.columns, start=1):
    plt.subplot(row, cols, position)
    plt.title('{}'.format(column))
    sns.histplot(x=yourData[column])

plt.tight_layout()
plt.show()

**Handle skewness (transform and scale)**

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Pick the transformation that suits your data (log, cbrt, sqrt, standardisation, ...).
data_cube = ['feature1', '...']        # columns to cube-root transform
data_normalize = ['feature1', '...']   # columns to min-max scale

# Cube-root transform, one column at a time.
for column in data_cube:
    yourData[column] = np.cbrt(yourData[column])

# Min-max scaling with a fresh scaler per column; the scaler expects a 2-D
# array, so each column is reshaped to a single-column matrix first.
for column in data_normalize:
    columnValues = yourData[column].values.reshape(-1, 1)
    yourData[column] = MinMaxScaler().fit_transform(columnValues)

yourData.head()

## Encode

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns to turn into integer codes (replace/map can be used instead for a manual mapping).
dataLabel = ['feature1', '...']
for column in dataLabel:
    encoder = LabelEncoder()  # a new encoder is fitted for every column
    yourData[column] = encoder.fit_transform(yourData[column])

# One-hot encode the remaining categorical columns.
yourData = pd.get_dummies(
    data=yourData,
    columns=['feature1', '...'],
)

yourData.head()

## Balancing Data

In [None]:
from imblearn import over_sampling # You can choose for over or under sampling based on your data

# Every column except the target is a feature. The same list names the columns
# of the resampled frame, so features and headers cannot get out of sync.
featureColumns = [column for column in yourData.columns if column != 'Label']

X = yourData[featureColumns].values
y = yourData['Label'].values

# sampling_strategy has to be passed by keyword. A float is the desired
# minority/majority ratio after resampling and is only valid for a binary target.
# random_state makes the resampling reproducible across runs.
sampler = over_sampling.RandomOverSampler(sampling_strategy=0.8, random_state=42)
X_over, y_over = sampler.fit_resample(X, y)

# Rebuild the frame from the resampled arrays and put the target back.
yourData = pd.DataFrame(X_over, columns=featureColumns)
yourData['Label'] = y_over

yourData.head()