<a href="https://colab.research.google.com/github/Setudo/ML-CWK2/blob/main/MLCWK2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing Libraries

In [75]:
# Data and Datasets
import pandas as pd
from sklearn.datasets import load_wine

# Validation methods
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

# Metrics
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Plotting & utils
import pprint
import matplotlib.pyplot as plt
import numpy as np
from time import time

# For visualising decision trees
from sklearn.tree import plot_tree
from IPython.display import display
from IPython.display import SVG
from graphviz import Source
from sklearn.tree import export_graphviz

# Stats
from scipy.stats import randint as sp_randint
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import shapiro     # Shapiro Wilk
from scipy.stats import normaltest  # D’Agostino’s K^2
from scipy.stats import anderson    # Anderson-Darling
from scipy.stats import ttest_ind    # independent student t-test; assumes normality
from scipy.stats import mannwhitneyu # non-parametric; doesn't assume normality

# Missing Data and Scaling
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler

In [76]:
# Loading datasets
# Reads the selected CSV into `df`, the frame that the later cells transform
# and finally split into features and target.

import pandas as pd  # pandas is also imported in the imports cell above

# Candidate input files (absolute paths).
weather_dataset = "/data/weather_dataset.csv"
diabetes_dataset = "/data/diabetes_dataset_100k.csv"

# Switch: point `dataset` at one of the two paths above. Later cells compare
# `dataset` against these names to choose dataset-specific columns.
dataset = weather_dataset

df = pd.read_csv(dataset) #Loads the dataset

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,temperature,humidity,wind_speed,precipitation,cloud_cover,atmospheric_pressure,season,visibility,location,weather
0,14.0,73.0,9.5,82.0,partly cloudy,1010.82,Winter,3.5,inland,Rainy
1,39.0,96.0,8.5,71.0,partly cloudy,1011.43,Spring,10.0,inland,Cloudy
2,,0.0,7.0,,clear,1018.72,Spring,5.5,mountain,Sunny
3,38.0,83.0,1.5,82.0,clear,1026.25,Spring,1.0,coastal,Sunny
4,,74.0,17.0,,overcast,990.67,Winter,2.5,mountain,Rainy


In [78]:
# Pre-preprocessing (converting non-numerical data)
#
# 1. Label-encode every object-dtype column; missing values are first replaced
#    by the literal category "Unknown" so they receive their own code.
# 2. Collect the numeric columns that will be scaled, imputed and capped.
# 3. Robust-scale those columns into `scaled_interval_data` (df itself is only
#    overwritten with the scaled values by the imputation cell).

nominal_cols = df.select_dtypes(include=["object"]).columns

# Fitted encoder per column, kept so integer codes can be mapped back to the
# original labels later via label_encoders[col].inverse_transform(codes).
label_encoders = {}
for col in nominal_cols:
  le = LabelEncoder()
  df[col] = df[col].fillna("Unknown")
  df[col] = le.fit_transform(df[col])
  label_encoders[col] = le

# NOTE(review): this selection runs after encoding, so the label-encoded
# nominal columns (integer codes, typically int64) are selected here too —
# presumably why the encoded "weather" target has to be dropped below.
# Confirm that scaling / capping category codes is intended.
interval_cols = df.select_dtypes(include=["float64", "int64"]).columns

# Dataset-specific exclusions from scaling / imputation / capping.
if dataset == diabetes_dataset:
  interval_cols = interval_cols.drop("year")
if dataset == weather_dataset:
  interval_cols = interval_cols.drop("weather")

# RobustScaler centres on the median and scales by the IQR; per the
# scikit-learn docs NaNs are ignored when fitting and preserved on transform,
# so missing values are still present for the imputation step.
scaler = RobustScaler()
scaled_interval_data = scaler.fit_transform(df[interval_cols])

In [79]:
# Missing Data Handling
# Fills the NaNs in the scaled numeric columns with KNN imputation, then writes
# the scaled + imputed values back into df (replacing the raw values).

neighbors = 5  # TODO: tune — try other values of k and compare the CV scores

imputer = KNNImputer(n_neighbors=neighbors)
imputed_data = imputer.fit_transform(scaled_interval_data)
df[interval_cols] = imputed_data

In [81]:
# Handling Noise (capping outliers)

def cap_outliers(df, interval_cols, lower_percentile=1, upper_percentile=95):
  """Clip each listed column to its own lower/upper percentile values.

  Values below a column's `lower_percentile` are raised to that percentile's
  value and values above its `upper_percentile` are lowered to that
  percentile's value (winsorising). Columns not listed are left untouched.

  Args:
    df: DataFrame to cap. It is modified in place.
    interval_cols: iterable of column labels in `df` to cap; each column must
      support `.quantile()` (i.e. be numeric).
    lower_percentile: lower bound as a percentile in [0, 100]. Defaults to 1.
    upper_percentile: upper bound as a percentile in [0, 100]. Defaults to 95.

  Returns:
    The same `df` object that was passed in, with the listed columns clipped.

  Raises:
    ValueError: if the percentiles are outside [0, 100] or the lower bound
      exceeds the upper bound.
  """
  if not 0 <= lower_percentile <= upper_percentile <= 100:
    raise ValueError(
        "percentiles must satisfy 0 <= lower_percentile <= upper_percentile <= 100, "
        f"got lower_percentile={lower_percentile}, upper_percentile={upper_percentile}"
    )

  # pandas expects quantiles as fractions in [0, 1]; convert once up front.
  lower_q = lower_percentile / 100
  upper_q = upper_percentile / 100

  for col in interval_cols:
    # Caps are computed per column from that column's own distribution.
    lower_cap = df[col].quantile(lower_q)
    upper_cap = df[col].quantile(upper_q)
    df[col] = df[col].clip(lower=lower_cap, upper=upper_cap)
  return df

# Cap extreme values in the numeric columns; cap_outliers clips df's columns
# in place and returns the same frame.
df = cap_outliers(df, interval_cols)

In [82]:
# Implementing the random forest model
# Splits df into the feature matrix X and target vector y for the selected
# dataset, then configures (but does not yet fit) the classifier.

if dataset == diabetes_dataset:
  target_col = "diabetes"
elif dataset == weather_dataset:
  target_col = "weather"
else:
  # Fail here instead of letting later cells hit a NameError or, worse,
  # silently reuse X / y left in the kernel by an earlier run.
  raise ValueError(f"Unrecognised dataset: {dataset!r}")

X = df.drop(columns=[target_col])
y = df[target_col]

model = RandomForestClassifier(
    n_estimators = 100, #Number of trees the model generates
    max_depth = None, #Can cap the extent to which trees grow
    random_state = 42, #Fixed seed so repeated runs build the same forest
    class_weight = "balanced" #Reweights classes by inverse frequency (see sklearn docs)
)


In [83]:
# Testing Accuracy of model with stratified k-fold
# Uses 5 shuffled, stratified folds with a fixed seed and scores each held-out
# fold by accuracy.
# NOTE(review): scaling, KNN imputation and outlier capping were all fitted on
# the full dataset in earlier cells, i.e. before this split, so the held-out
# folds influenced the preprocessing and these scores may be optimistic.
# Consider moving those steps into a Pipeline that is fitted per fold.

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X, y, cv=stratified_kfold, scoring="accuracy")

# Report the per-fold scores plus their mean and spread across folds.
print(f"Cross-Validated Accuracy Scores: {cv_scores}")
print(f"Mean Accuracy: {cv_scores.mean():.4f}")
print(f"Standard Deviation: {cv_scores.std():.4f}")

Cross-Validated Accuracy Scores: [0.8875     0.90075758 0.8875     0.89583333 0.88787879]
Mean Accuracy: 0.8919
Standard Deviation: 0.0055
