<h1>1. Exploratory Data Analysis</h1>

In [31]:
# set matplotlib backend to inline
%matplotlib inline 

# import modules
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.datasets import load_wine
import matplotlib.pyplot as plt

# load data
wine= load_wine()
#print(wine.DESCR)
# this dataset has 13 features, we will only choose a subset of these
df_wine = pd.DataFrame(wine.data, columns = wine.feature_names )

# extract the data as arrays of features, X, and target, y
X = df_wine.values
y = pd.DataFrame(wine, columns=["target"])
X_scaled = None

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
print(wine.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0.98  3.88    2.29  0.63
    Fl

<h2>1.1 Scaling features</h2>

In [13]:
# 1. Summary Statistics of Each Feature
df = pd.DataFrame(X, columns=wine.feature_names)
print("Summary Statistics of Each Feature:")
summary_statistics = df.describe()
summary_statistics

Summary Statistics of Each Feature:


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


In [28]:
# Check if features need scaling based on the range of values
feature_ranges = summary_statistics.loc[['max']].T
features_to_scale = feature_ranges[feature_ranges['max'] > 1].index.tolist()


if features_to_scale:
    print(f"\nFeatures that need scaling: {features_to_scale}")
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    print("\nFeatures after scaling:")
else:
    print("\nFeatures do not need scaling.")


Features that need scaling: ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']

Features after scaling:


<h2>1.2 Balancing</h2>

In [33]:
class_distribution = y.value_counts()
print(class_distribution)

target
1         71
0         59
2         48
Name: count, dtype: int64


<h2>1.3 Choosing validation method</h2>

In [73]:
class KNNClassifier:
    def __init__(self, k=3, p=2, weighting="uniform"):
        """
        KNNClassifier initialization.
        
        Parameters:
        - k: Number of neighbors to consider (default is 3).
        - p: Minkowski distance parameter (default is 2 for Euclidean distance).
        - weighting: Method for weighting votes ('uniform' or 'distance', default is 'uniform').
        """
        self.k = k
        self.p = p
        self.X_train = None
        self.y_train = None
        if not (weighting in ["uniform", "distance"]):
            raise ValueError("Unsupported weighting method.")
        self.weighting = weighting

    def fit(self, X_train, y_train):
        """
        Fit the model with training data.
        
        Parameters:
        - X_train: Training features.
        - y_train: Training labels.
        """
        self.X_train = np.array(X_train)
        self.y_train = np.array(y_train)

    def predict(self, X_test):
        """
        Predict labels for test data.
        
        Parameters:
        - X_test: Test features.
        
        Returns:
        - predictions: Predicted labels.
        """
        predictions = []
        for x_test in X_test:
            distances = self.calculate_distances(x_test)
            sorted_indices = np.argsort(distances)

            k_nearest_labels = self.y_train[sorted_indices[:self.k]]

            if self.weighting == 'uniform':
                # Uniform weighting
                most_common_label = np.argmax(np.bincount(k_nearest_labels))
            else:
                # Weighted by inverse distances
                weighted_votes = 1 / (distances[sorted_indices[:self.k]] + 1e-10)
                weighted_counts = np.bincount(k_nearest_labels, weights=weighted_votes)
                most_common_label = np.argmax(weighted_counts)

            predictions.append(most_common_label)
        return predictions

    def calculate_distances(self, x):
        """
        Calculate distances between a test point and all training points.
        
        Parameters:
        - x: Test point.
        
        Returns:
        - distances: Array of distances.
        """
        distances = np.linalg.norm(self.X_train - x, ord=self.p, axis=1)
        return distances


In [81]:
# KNN Classifier
knn_model = KNNClassifier(k=5, weighting="distance")  # You can experiment with different k values
knn_model.fit(X_train, y_train)
knn_predictions = knn_model.predict(X_test)

# Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
dt_predictions = dt_model.predict(X_test)

In [82]:
# Model Evaluation
def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')
    conf_matrix = confusion_matrix(y_test, predictions)
    
    return accuracy, precision, recall, conf_matrix

In [83]:
# Evaluate KNN Classifier
knn_accuracy, knn_precision, knn_recall, knn_conf_matrix = evaluate_model(knn_model, X_test, y_test)

# Evaluate Decision Tree Classifier
dt_accuracy, dt_precision, dt_recall, dt_conf_matrix = evaluate_model(dt_model, X_test, y_test)

In [84]:
# Print results
print("KNN Classifier:")
print(f"Accuracy: {knn_accuracy}")
print(f"Precision: {knn_precision}")
print(f"Recall: {knn_recall}")
print("Confusion Matrix:")
print(knn_conf_matrix)

print("\nDecision Tree Classifier:")
print(f"Accuracy: {dt_accuracy}")
print(f"Precision: {dt_precision}")
print(f"Recall: {dt_recall}")
print("Confusion Matrix:")
print(dt_conf_matrix)

KNN Classifier:
Accuracy: 0.8055555555555556
Precision: 0.8157644824311492
Recall: 0.8055555555555556
Confusion Matrix:
[[12  0  2]
 [ 0 12  2]
 [ 1  2  5]]

Decision Tree Classifier:
Accuracy: 0.9444444444444444
Precision: 0.9462962962962962
Recall: 0.9444444444444444
Confusion Matrix:
[[13  1  0]
 [ 0 14  0]
 [ 1  0  7]]
