# 1. Introduction

In [1]:
# Opening banner for the walkthrough.
print(
    "Welcome to the Machine Learning Preprocessing and Modeling Demonstration!"
)

Welcome to the Machine Learning Preprocessing and Modeling Demonstration!


# 2. Data Loading and Imputation of Missing Values

In [8]:
from src.loader import load_data

# Load the Wisconsin breast-cancer CSV; the path is relative to the
# notebook's working directory. The last expression previews the result.
df_breast_cancer = load_data(
    "../assets/breast_cancer_wisconsin_original/breast-cancer-wisconsin.csv"
)
df_breast_cancer.head()

Unnamed: 0,sample_code_number,clump_thickness,uni_cell_size,uni_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [12]:
import pandas as pd

from src.preprocessing import DataPipeline

# Toy frame for the imputation demo: column "A" holds a "?" placeholder in
# its fourth row (the recorded output shows 2.75 in that slot, which is the
# mean of the remaining values 1, 2, 3 and 5).
df = pd.DataFrame(
    {
        "A": [1, 2, 3, "?", 5],
        "B": ["Low", "Medium", "High", "Low", "Medium"],
        "C": ["Red", "Green", "Red", "Green", "Red"],
        "D": [0.5, 0.7, 0.2, 0.4, 0.9],
    }
)

# Pipeline with a single "fill_missing_values" step and no extra params.
pipeline_imputation = DataPipeline([{"operation": "fill_missing_values"}])
df = pipeline_imputation.run(df)
df.head()

Unnamed: 0,A,B,C,D
0,1.0,Low,Red,0.5
1,2.0,Medium,Green,0.7
2,3.0,High,Red,0.2
3,2.75,Low,Green,0.4
4,5.0,Medium,Red,0.9


# 3. One-Hot Encoding for Categorical Variables

In [13]:
# Reload the raw forest-fires table so this cell always starts from the CSV.
df_forest_fires = load_data("../assets/forest_fires/forestfires.csv")

# One "one_hot_encode" step per categorical column, in this order.
steps_encoding = [
    {"operation": "one_hot_encode", "params": {"column": column_name}}
    for column_name in ("month", "day")
]

pipeline_encoding = DataPipeline(steps_encoding)
df_forest_fires = pipeline_encoding.run(df_forest_fires)
df_forest_fires.head()

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,...,month_may,month_nov,month_oct,month_sep,day_mon,day_sat,day_sun,day_thu,day_tue,day_wed
0,7,5,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,...,False,False,False,False,False,False,False,False,False,False
1,7,4,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,...,False,False,True,False,False,False,False,False,True,False
2,7,4,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,...,False,False,True,False,False,True,False,False,False,False
3,8,6,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,...,False,False,False,False,False,False,False,False,False,False
4,8,6,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,...,False,False,False,False,False,False,True,False,False,False


# 4. Discretization & Standardization

In [14]:
# Run a one-step pipeline that applies "standardize" to the FFMC column of
# the (already one-hot-encoded) forest-fires frame, then preview that column.
steps_standardization = [
    dict(operation="standardize", params=dict(column="FFMC")),
]
pipeline_standardization = DataPipeline(steps_standardization)
df_forest_fires = pipeline_standardization.run(df_forest_fires)
df_forest_fires["FFMC"].head()

0   -0.805180
1   -0.008094
2   -0.008094
3    0.191177
4   -0.243597
Name: FFMC, dtype: float64

# 5. Cross-Validation with k=10

In [20]:
from src.cross_validation import k_fold_split, train_test_split
from src.models import NullModel
from src.evaluation_metrics import accuracy

# Baseline classifier; the label column "class" is renamed to "target" so the
# loop below can address it uniformly.
model_classification = NullModel(task="classification")
df_breast_cancer = df_breast_cancer.rename(columns={"class": "target"})

# One accuracy value per fold is collected here.
scores = []
# alter stratified here
folds = k_fold_split(df_breast_cancer, n_splits=10, stratified=False)
for train_fold, dev_fold in folds:
    # Separate the feature columns from the label for both partitions.
    train_features = train_fold.drop(columns="target")
    dev_features = dev_fold.drop(columns="target")

    model_classification.fit(train_features, train_fold["target"])
    preds = model_classification.predict(dev_features)
    scores.append(accuracy(dev_fold["target"], preds))

scores


[0.6521739130434783,
 0.7101449275362319,
 0.6666666666666666,
 0.7101449275362319,
 0.5652173913043478,
 0.6376811594202898,
 0.6956521739130435,
 0.6231884057971014,
 0.6376811594202898,
 0.6956521739130435]

# 6. K x 2 Cross-Validation with k=5

In [21]:
# Five repetitions, each contributing two dev-fold accuracies (10 in total).
scores_kx2 = []
for _ in range(5):
    # Hold out 20% as a test set; test_df is not used further in this cell.
    train_dev_df, test_df = train_test_split(
        df_breast_cancer, test_size=0.2, stratified=True
    )

    # Two-fold split of the remaining (training + dev) rows.
    halves = k_fold_split(train_dev_df, n_splits=2, stratified=True)
    for train_fold, dev_fold in halves:
        train_features = train_fold.drop(columns="target")
        dev_features = dev_fold.drop(columns="target")

        model_classification.fit(train_features, train_fold["target"])
        preds = model_classification.predict(dev_features)
        scores_kx2.append(accuracy(dev_fold["target"], preds))

scores_kx2


[0.6726618705035972,
 0.6726618705035972,
 0.6402877697841727,
 0.6402877697841727,
 0.6546762589928058,
 0.6546762589928058,
 0.658273381294964,
 0.658273381294964,
 0.6618705035971223,
 0.6618705035971223]

# 7. Evaluation Metrics

In [22]:
from src.evaluation_metrics import mean_squared_error

# Baseline regressor; the "area" column becomes the regression target.
model_regression = NullModel(task="regression")
df_forest_fires = df_forest_fires.rename(columns={"area": "target"})

# Per-fold MSE values, using k_fold_split with its default arguments.
scores_mse = []
for train_fold, dev_fold in k_fold_split(df_forest_fires):
    train_features = train_fold.drop(columns="target")
    dev_features = dev_fold.drop(columns="target")

    model_regression.fit(train_features, train_fold["target"])
    preds = model_regression.predict(dev_features)
    scores_mse.append(mean_squared_error(dev_fold["target"], preds))

# Unweighted mean of the fold scores.
avg_mse = sum(scores_mse) / len(scores_mse)
avg_mse

4077.112420824459

# 8. Conclusion

In [23]:
# Closing message for the walkthrough.
print(
    "Thank you for watching the demonstration!"
)

Thank you for watching the demonstration!
