In [1]:
# Running 'main.py' produces the same output as this notebook.
# Install dependencies using 'pip install -r requirements.txt'

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

In [3]:
# Load the dataset.
# The path is relative rather than an absolute path on one user's machine, so
# the notebook can run elsewhere. This assumes the notebook is started from the
# directory that contains 'data.csv' — TODO confirm; adjust DATA_PATH if not.
DATA_PATH = 'data.csv'
data = pd.read_csv(DATA_PATH)
data

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185
...,...,...,...,...,...,...,...,...
435,1,3,29703,12051,16027,13135,182,2204
436,1,3,39228,1431,764,4510,93,2346
437,2,3,14531,15488,30243,437,14841,1867
438,1,3,10290,1981,2232,1038,168,2125


In [4]:
# Separate the predictors (X) from the label to be predicted (y).
TARGET_COLUMN = 'Channel'
FEATURE_COLUMNS = [
    'Region',
    'Fresh',
    'Milk',
    'Grocery',
    'Frozen',
    'Detergents_Paper',
    'Delicassen',
]
X = data[FEATURE_COLUMNS]
y = data[TARGET_COLUMN]

In [5]:
# Display the feature matrix X (the selected predictor columns of `data`).
X

Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,3,12669,9656,7561,214,2674,1338
1,3,7057,9810,9568,1762,3293,1776
2,3,6353,8808,7684,2405,3516,7844
3,3,13265,1196,4221,6404,507,1788
4,3,22615,5410,7198,3915,1777,5185
...,...,...,...,...,...,...,...
435,3,29703,12051,16027,13135,182,2204
436,3,39228,1431,764,4510,93,2346
437,3,14531,15488,30243,437,14841,1867
438,3,10290,1981,2232,1038,168,2125


In [6]:
# Display the target series y (the 'Channel' column of `data`).
y

0      2
1      2
2      2
3      1
4      2
      ..
435    1
436    1
437    2
438    1
439    1
Name: Channel, Length: 440, dtype: int64

In [7]:
# Hold out half of the samples for evaluation; the fixed seed makes the
# split repeatable from run to run.
TEST_FRACTION = 0.5
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [8]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted transformation to both splits so no test-set information leaks
# into the scaler.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
# Initialize classifiers, keyed by the display name used in the reports.
# The decision tree shuffles candidate features internally, so its
# random_state is pinned to make the reported metrics reproducible.
# Logistic regression and the SVM use class_weight='balanced' to re-weight
# the two classes by their inverse frequency.
classifiers = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=1000, class_weight='balanced'),
    'Support Vector Machine': SVC(class_weight='balanced')
}

In [10]:
# Fit every classifier on the training split and print its per-class
# precision/recall/F1 report for the held-out test split.
for name, model in classifiers.items():
    predictions = model.fit(X_train, y_train).predict(X_test)
    print(f"Classifier: {name}")
    print(classification_report(y_test, predictions))

Classifier: Decision Tree
              precision    recall  f1-score   support

           1       0.96      0.87      0.92       151
           2       0.77      0.93      0.84        69

    accuracy                           0.89       220
   macro avg       0.87      0.90      0.88       220
weighted avg       0.90      0.89      0.89       220

Classifier: K-Nearest Neighbors
              precision    recall  f1-score   support

           1       0.95      0.93      0.94       151
           2       0.86      0.88      0.87        69

    accuracy                           0.92       220
   macro avg       0.90      0.91      0.91       220
weighted avg       0.92      0.92      0.92       220

Classifier: Logistic Regression
              precision    recall  f1-score   support

           1       0.96      0.89      0.93       151
           2       0.80      0.93      0.86        69

    accuracy                           0.90       220
   macro avg       0.88      0.91     

In [11]:
# It can be seen that all 4 algorithms achieve similar (though not identical) performance.

In [12]:
# Initialize clustering algorithms, keyed by the display name used in the output.
# Both models start from a random initialisation, so random_state is pinned
# to keep the cluster assignments and silhouette scores reproducible.
clustering_algorithms = {
    'K-Means': KMeans(n_clusters=2, random_state=42),
    'Gaussian Mixture': GaussianMixture(n_components=2, random_state=42)
}

In [13]:
# Fit each clustering model on the feature matrix, assign every sample to a
# cluster, and report the silhouette score of the resulting partition.
# NOTE(review): X is the feature frame as selected from `data`; the
# StandardScaler was only applied to X_train/X_test. Large-valued columns may
# therefore dominate the distances — consider standardizing before clustering.
for name, model in clustering_algorithms.items():
    cluster_labels = model.fit(X).predict(X)
    print(f"Clustering Algorithm: {name}")
    print(f"Silhouette Score: {silhouette_score(X, cluster_labels)}")

Clustering Algorithm: K-Means
Silhouette Score: 0.450871622249804
Clustering Algorithm: Gaussian Mixture
Silhouette Score: 0.4017942090277863


In [14]:
# K-Means works better here: it achieves the higher silhouette score (about 0.45 vs. 0.40 for the Gaussian mixture).