# 1. Introduction

In [None]:
# Opening banner for the demonstration.
welcome_message = "Welcome to the Machine Learning Preprocessing and Modeling Demonstration!"
print(welcome_message)

# 2. Data Loading and Imputation of Missing Values

In [None]:
from src.loader import load_data
# Read the breast-cancer CSV through the project loader, then preview the
# first rows as the cell output.
breast_cancer_csv = "../assets/breast_cancer_wisconsin_original/breast-cancer-wisconsin.csv"
df_breast_cancer = load_data(breast_cancer_csv)
df_breast_cancer.head()

In [None]:
from src.preprocessing import DataPipeline
import pandas as pd

# Small hand-made frame for the imputation demo. Column 'A' carries a '?'
# entry among its numbers (presumably the "missing" marker that the
# fill_missing_values operation targets; confirm in src.preprocessing).
toy_columns = {
    'A': [1, 2, 3, '?', 5],
    'B': ['Low', 'Medium', 'High', 'Low', 'Medium'],
    'C': ['Red', 'Green', 'Red', 'Green', 'Red'],
    'D': [0.5, 0.7, 0.2, 0.4, 0.9],
}
df = pd.DataFrame(toy_columns)

# Single-step pipeline: only the fill_missing_values operation is applied.
imputation_steps = [{"operation": "fill_missing_values"}]
pipeline_imputation = DataPipeline(imputation_steps)
df = pipeline_imputation.run(df)
df.head()

# 3. One-Hot Encoding for Categorical Variables

In [None]:
# Load the forest-fires data and run one one_hot_encode step per listed
# column, in the order month -> day.
forest_fires_csv = "../assets/forest_fires/forestfires.csv"
df_forest_fires = load_data(forest_fires_csv)

steps_encoding = [
    {"operation": "one_hot_encode", "params": {"column": column_name}}
    for column_name in ("month", "day")
]
pipeline_encoding = DataPipeline(steps_encoding)
df_forest_fires = pipeline_encoding.run(df_forest_fires)
df_forest_fires.head()

# 4. Standardization & Discretization

In [None]:
# Apply the "standardize" operation to the FFMC column only, then preview
# that column.
ffmc_step = {"operation": "standardize", "params": {"column": "FFMC"}}
steps_standardization = [ffmc_step]
pipeline_standardization = DataPipeline(steps_standardization)
# NOTE: the result is written back to df_forest_fires, so later cells see
# the processed frame.
df_forest_fires = pipeline_standardization.run(df_forest_fires)
df_forest_fires["FFMC"].head()

In [None]:
# Discretize two forest-fire features: `temp` with the equal-width operation
# and `ISI` with the equal-frequency operation, both using the same bin count.
# The step list and pipeline are named for what they do (discretization)
# instead of reusing the standardization names for an unrelated pipeline.
n_bins = 5
steps_discretization = [
    {"operation": "discretize_equal_width", "params": {"column": "temp", "bins": n_bins}},
    {"operation": "discretize_equal_frequency", "params": {"column": "ISI", "bins": n_bins}},
]
pipeline_discretization = DataPipeline(steps_discretization)
# NOTE: the result is written back to df_forest_fires, so re-running this
# cell on its own applies both operations again to the already-processed frame.
df_forest_fires = pipeline_discretization.run(df_forest_fires)
df_forest_fires[["temp", "ISI"]].head()

# 5. Cross-Validation with k=10

In [None]:
from src.cross_validation import k_fold_split, train_test_split
from src.models import NullModel
from src.evaluation_metrics import accuracy

# The label column "class" is exposed under the name "target", which the
# loop below uses to separate features from labels.
df_breast_cancer = df_breast_cancer.rename(columns={"class": "target"})
model_classification = NullModel(task="classification")

# One accuracy value per fold of the 10-fold split.
scores = []
# Flip `stratified` on this call to compare stratified and non-stratified folds.
for fit_part, eval_part in k_fold_split(df_breast_cancer, n_splits=10, stratified=True):
    fit_features = fit_part.drop("target", axis=1)
    fit_labels = fit_part["target"]
    model_classification.fit(fit_features, fit_labels)

    eval_features = eval_part.drop("target", axis=1)
    predicted = model_classification.predict(eval_features)
    scores.append(accuracy(eval_part["target"], predicted))

scores


# 6. K x 2 Cross-Validation with k=5

In [None]:
# 5 repetitions x 2 folds: every repetition makes a stratified split with
# test_size=0.2 and then runs 2-fold cross-validation on the train+dev part,
# giving 10 accuracy values in total.
scores_kx2 = []
for repetition in range(5):
    # test_df is the held-out part; it is not used for scoring in this cell.
    train_dev_df, test_df = train_test_split(
        df_breast_cancer, test_size=0.2, stratified=True
    )
    two_folds = k_fold_split(train_dev_df, n_splits=2, stratified=True)
    for fit_part, eval_part in two_folds:
        fit_features = fit_part.drop("target", axis=1)
        fit_labels = fit_part["target"]
        model_classification.fit(fit_features, fit_labels)

        eval_features = eval_part.drop("target", axis=1)
        predicted = model_classification.predict(eval_features)
        scores_kx2.append(accuracy(eval_part["target"], predicted))

scores_kx2


# 7. Evaluation Metrics

In [None]:
from src.evaluation_metrics import mean_squared_error

# Regression run on the forest-fires frame: "area" is exposed as "target"
# and a NullModel configured for regression is scored with mean squared error.
df_forest_fires = df_forest_fires.rename(columns={"area": "target"})
model_regression = NullModel(task="regression")

scores_mse = []
# k_fold_split is called with its default arguments here.
for fit_part, eval_part in k_fold_split(df_forest_fires):
    fit_features = fit_part.drop("target", axis=1)
    fit_labels = fit_part["target"]
    model_regression.fit(fit_features, fit_labels)

    eval_features = eval_part.drop("target", axis=1)
    predicted = model_regression.predict(eval_features)
    scores_mse.append(mean_squared_error(eval_part["target"], predicted))

# Mean of the per-fold MSE values, shown as the cell output.
avg_mse = sum(scores_mse) / len(scores_mse)
avg_mse

# 8. Conclusion

In [None]:
# Closing message for the demonstration.
farewell_message = "Thank you for watching the demonstration!"
print(farewell_message)