---

Load libraries

---

In [None]:
## Load libraries
import pandas as pd
import numpy as np
import sys
import math
import matplotlib.pyplot as plt
import matplotlib.cm as cm
plt.style.use('dark_background')
%matplotlib inline

# Pipeline module
from sklearn.pipeline import Pipeline

# Scaling, encoding, and imputation modules
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

# Column transformation modules
from sklearn.compose import ColumnTransformer

# Module for categorical variables
from pandas.api.types import CategoricalDtype

# Modules for building custom encoders and transformers
from sklearn.base import BaseEstimator, TransformerMixin

# Regression and classification modules
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

---

Mount Google Drive if running in Colab

---

In [None]:
## Resolve DATA_DIR, mounting Google Drive first when running in Colab
running_in_colab = 'google.colab' in sys.modules
if not running_in_colab:
    # Local run: data is read from a folder relative to the notebook
    DATA_DIR = 'Data/'
else:
    from google.colab import drive
    drive.mount('/content/drive', force_remount = True)
    # Edit the part of the path after /content/drive/MyDrive/Colab Notebooks/
    # so that it matches how the data is organized inside your Colab Notebooks
    # folder in Google Drive
    DIR = '/content/drive/MyDrive/Colab Notebooks/MAHE/MSIS Coursework/EvenSem2024MAHE'
    DATA_DIR = DIR + '/Data/'

---

Create lists of ordinal, categorical, and continuous features

---

In [None]:
## Group the dataset's column names by feature type
continuous_features = ['Age']       # numeric column
categorical_features = ['Gender']   # unordered labels
ordinal_features = ['Rating']       # ordered labels

---

User-defined function to load the movie ratings dataset and assign 'category' datatype to ordinal and categorical columns

---

In [None]:
def load_data():
  """Read the movie ratings CSV and return it as a DataFrame.

  The file 'ratings.csv' is read from DATA_DIR, using the first row as the
  header and the first column as the index. Every column named in
  ordinal_features or categorical_features is cast to the pandas 'category'
  dtype; the remaining columns keep the dtype inferred by read_csv.

  Relies on the notebook-level names DATA_DIR, ordinal_features and
  categorical_features being defined before it is called.
  """
  category_columns = ordinal_features + categorical_features
  ratings = pd.read_csv(DATA_DIR + 'ratings.csv', sep = ',', header = 0, index_col = 0)
  return ratings.astype({column: 'category' for column in category_columns})

In [None]:
## Load the movie ratings data and report its size
df = load_data()
n_samples, n_features = df.shape
print('Movie ratings dataset')
print('-----------')
print('Initial number of samples = %d' % n_samples)
print('Initial number of features = %d\n' % n_features)
# Preview the first five rows as the cell's output
df.head()

---

Plot percentage of missing values (NaNs) for each feature

---

In [None]:
## Plot percentage of missing values (NaNs) for each feature
# Threshold in percent: features missing in more than `cutoff`% of the samples
# are the ones we will remove.
# NOTE(review): the comment here used to say 20% while the value is 10 --
# confirm which threshold is intended.
cutoff = 10
# Share of NaN entries per column, expressed as a percentage
percent_missing = df.isna().mean() * 100
# Two colours taken from the ends of the rainbow colormap:
# index 0 -> feature is above the cutoff, index 1 -> feature is at or below it
palette = cm.rainbow(np.linspace(0, 1, 2))
within_cutoff = (percent_missing <= cutoff).values.astype(int)
# Draw on an explicit Axes instead of relying on pyplot's "current figure" state
fig, ax = plt.subplots(figsize=(6, 4))
percent_missing.plot(kind = 'bar', color = palette[within_cutoff], ax = ax)
fig.suptitle('Percentage Missing Values Across All Features', fontsize = 12)
ax.set_xlabel('Feature', fontsize = 12)
ax.set_ylabel('% Missing Values', fontsize = 12);

---

Print the unique values in each ordinal and categorical feature

---

In [None]:
## Print the unique values in each ordinal and categorical feature
category_columns = ordinal_features + categorical_features
# Number of distinct (non-NaN) values per column
print(df[category_columns].nunique())
print('\nUnique values in ordinal and categorical features')
print('---------------------------------------------------')
# Collect the distinct values per column and print them as we go
unique_values = {}
for column in category_columns:
  unique_values[column] = list(df[column].unique())
  print(column, unique_values[column])

---

Impute ordinal and categorical columns using SimpleImputer()

---

In [None]:
## Impute ordinal and categorical columns using SimpleImputer()
# Each NaN is replaced by the most frequent value of its own column.
# The imputed array is only displayed; df itself is left unchanged.
category_columns = ordinal_features + categorical_features
imputer = SimpleImputer(strategy = 'most_frequent', missing_values = np.nan)
imputer.fit_transform(df[category_columns])

Impute ordinal and categorical columns using KNNImputer().

Is there any issue in this approach?

In [None]:
## Impute ordinal and categorical columns using KNNImputer()
# This cell is a demonstration: KNNImputer works on numeric input, so it is
# expected to reject the raw label columns. The failure is caught and printed
# so that it is visible as the cell's output without aborting "Run All".
imputer = KNNImputer(n_neighbors = 2)
try:
    knn_imputed = imputer.fit_transform(df.loc[:, ordinal_features + categorical_features])
except (ValueError, TypeError) as err:
    knn_imputed = None
    print('KNNImputer failed: %s: %s' % (type(err).__name__, err))
# Displays the imputed array when imputation succeeded, nothing otherwise
knn_imputed

---

Impute ordinal and categorical columns using IterativeImputer().

Is there any issue in this approach?

---

In [None]:
## Impute ordinal and categorical columns using IterativeImputer()
# This cell is a demonstration: the label columns are passed in un-encoded,
# which IterativeImputer is expected to reject. The failure is caught and
# printed so that it is visible as the cell's output without aborting "Run All".
imputer = IterativeImputer(estimator=RandomForestClassifier(random_state=0), max_iter=20)
try:
    iterative_imputed = imputer.fit_transform(df.loc[:, ordinal_features + categorical_features])
except (ValueError, TypeError) as err:
    iterative_imputed = None
    print('IterativeImputer failed: %s: %s' % (type(err).__name__, err))
# Displays the imputed array when imputation succeeded, nothing otherwise
iterative_imputed

---

Impute continuous column using KNNImputer()

---

In [None]:
## Impute continuous column using KNNImputer()
imputer = KNNImputer(n_neighbors = 2)
imputer.fit_transform(df.loc[:, continuous_features])

---

Impute continuous column using IterativeImputer()

---

In [None]:
## Impute continuous column using IterativeImputer()
# A random-forest regressor (fixed seed for repeatability) is the per-feature
# estimator; at most 20 imputation rounds are allowed.
# NOTE(review): only the continuous column(s) are passed in, so there is no
# other feature to regress on -- confirm this is intended.
forest = RandomForestRegressor(random_state=0)
imputer = IterativeImputer(estimator=forest, max_iter=20)
imputer.fit_transform(df[continuous_features])

---

One-hot encode the categorical column using OneHotEncoder().

Is there any issue in this approach?

---

In [None]:
## One-hot encode the categorical column using OneHotEncoder()
# The column is passed exactly as loaded: none of the imputation results in the
# cells above were written back to df, so any NaN entries reach the encoder as-is.
# The encoder returns a sparse matrix; .toarray() converts it to a regular
# ndarray for display (.todense() would give np.matrix, which NumPy discourages).
OneHotEncoder().fit_transform(df.loc[:, categorical_features]).toarray()

---

Ordinal encode the ordinal column using OrdinalEncoder().

Is there any issue in this approach?

---

In [None]:
## Ordinal encode the ordinal column using OrdinalEncoder()
# Rating levels listed from lowest to highest; this explicit order is handed to
# the encoder instead of letting it infer one from the data
ratings_order = ['Bad', 'Neutral', 'Good']
rating_encoder = OrdinalEncoder(categories = [ratings_order])
# The column is passed exactly as loaded (no imputation has been applied to df)
rating_encoder.fit_transform(df[ordinal_features])

---

ChatGPT prompt and resulting code: I have a movie ratings dataset in a file called ratings.csv. There are 3 features: (1) Rating, an ordinal column with 3 levels Good, Bad, Average; (2) Age, a continuous column; and (3) Gender, a categorical column with 3 levels M, F, U. All columns have missing values. Can you write Python code that implements a pipeline that will (a) impute and encode the categorical and ordinal features and (b) impute and scale the continuous feature? Note that you have to use either KNNImputer() or IterativeImputer() for imputation, and not SimpleImputer().

---