<a href="https://colab.research.google.com/github/Qu1nnD/CS290/blob/main/notebooks/Neighbors%20Classifier_penguins.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Source of the Palmer penguins dataset (CSV hosted on GitHub).
PENGUINS_URL = "https://github.com/mbrudd/csci290/raw/refs/heads/main/data/penguins.csv"
penguins = pd.read_csv(PENGUINS_URL)

In [3]:
# Inspect column dtypes and non-null counts to see which columns contain missing values.
penguins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
 7   year               344 non-null    int64  
dtypes: float64(4), int64(1), object(3)
memory usage: 21.6+ KB


In [4]:
# Keep only complete records: dropna() removes every row that has a missing
# value in any column. Rebinding the name (instead of inplace=True) leaves the
# originally loaded frame object untouched.
penguins = penguins.dropna()

# Features: survey year plus the four numeric body measurements.
X = penguins[['year', 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']]
# Target: the species name of each penguin.
y = penguins['species']

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both the training and the held-out rows.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
def KNN(X_train, y_train, X_test, k=3):
  """Classify each test point by majority vote among its k nearest training points.

  Distances are Euclidean. When several labels tie for the majority, the label
  of the closest neighbour among the tied labels wins, because votes are
  counted in order of increasing distance.

  Args:
    X_train: array-like of shape (n_train, n_features) with the training points.
    y_train: array-like of length n_train with the training labels. A pandas
      Series with an arbitrary index is accepted; labels are matched to rows
      by position, not by index label.
    X_test: array-like of shape (n_test, n_features) with the points to classify.
    k: number of neighbours that vote; must be at least 1. Values larger than
      n_train use every training point.

  Returns:
    np.ndarray holding one predicted label per row of X_test.

  Raises:
    ValueError: if k < 1, if the training set is empty, or if X_train and
      y_train have different lengths.
  """
  # Work on plain arrays so every lookup below is positional. Indexing a
  # pandas Series with integer positions would be interpreted as index labels,
  # and iterating a DataFrame would yield column names rather than rows.
  X_train = np.asarray(X_train)
  y_train = np.asarray(y_train)
  X_test = np.asarray(X_test)

  if k < 1:
    raise ValueError(f"k must be at least 1, got {k}")
  if len(X_train) == 0:
    raise ValueError("X_train must contain at least one sample")
  if len(X_train) != len(y_train):
    raise ValueError(
      f"X_train has {len(X_train)} rows but y_train has {len(y_train)} labels"
    )

  predictions = []
  for x in X_test:
    distances = np.linalg.norm(X_train - x, axis=1)
    # A stable sort keeps equidistant neighbours in training order, so the
    # result does not depend on the sorting algorithm's internals.
    k_indices = np.argsort(distances, kind="stable")[:k]
    k_nearest_labels = y_train[k_indices]
    most_common = Counter(k_nearest_labels).most_common(1)
    predictions.append(most_common[0][0])
  return np.array(predictions)

In [7]:
# Convert the labels to a plain NumPy array so they can be indexed by position.
# np.asarray leaves an existing array unchanged, so this cell is safe to re-run
# (calling `.to_numpy()` here would fail once y_train is already an array).
y_train = np.asarray(y_train)
k = 3  # number of neighbours that vote on each prediction
knn_predictions = KNN(X_train, y_train, X_test, k)
print("KNN Classifier Performance:")
print(classification_report(y_test, knn_predictions))

KNN Classifier Performance:
              precision    recall  f1-score   support

      Adelie       0.94      1.00      0.97        31
   Chinstrap       1.00      0.89      0.94        18
      Gentoo       1.00      1.00      1.00        18

    accuracy                           0.97        67
   macro avg       0.98      0.96      0.97        67
weighted avg       0.97      0.97      0.97        67



In [8]:
# Logistic regression with scikit-learn's default settings, fitted on the scaled training data.
log_reg = LogisticRegression().fit(X_train, y_train)
log_reg_predictions = log_reg.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows.
print("Logistic Regression Performance:")
log_reg_report = classification_report(y_test, log_reg_predictions)
print(log_reg_report)

Logistic Regression Performance:
              precision    recall  f1-score   support

      Adelie       0.97      1.00      0.98        31
   Chinstrap       1.00      0.94      0.97        18
      Gentoo       1.00      1.00      1.00        18

    accuracy                           0.99        67
   macro avg       0.99      0.98      0.99        67
weighted avg       0.99      0.99      0.98        67



In [9]:
# Support vector classifier with scikit-learn's default settings, fitted on the scaled training data.
svm = SVC().fit(X_train, y_train)
svm_predictions = svm.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows.
print("SVM Performance:")
svm_report = classification_report(y_test, svm_predictions)
print(svm_report)

SVM Performance:
              precision    recall  f1-score   support

      Adelie       0.97      1.00      0.98        31
   Chinstrap       1.00      0.94      0.97        18
      Gentoo       1.00      1.00      1.00        18

    accuracy                           0.99        67
   macro avg       0.99      0.98      0.99        67
weighted avg       0.99      0.99      0.98        67



In [10]:
# Softmax (multinomial) logistic regression. With the lbfgs solver scikit-learn
# already fits a multinomial model when the target has more than two classes,
# so the deprecated `multi_class` argument is not passed.
# NOTE(review): lbfgs is also the documented default solver, which would make
# this the same configuration as a plain LogisticRegression() - confirm
# against the installed scikit-learn version.
softmax_reg = LogisticRegression(solver='lbfgs')
softmax_reg.fit(X_train, y_train)
softmax_predictions = softmax_reg.predict(X_test)
print("Softmax Regression Performance:")
print(classification_report(y_test, softmax_predictions))

Softmax Regression Performance:
              precision    recall  f1-score   support

      Adelie       0.97      1.00      0.98        31
   Chinstrap       1.00      0.94      0.97        18
      Gentoo       1.00      1.00      1.00        18

    accuracy                           0.99        67
   macro avg       0.99      0.98      0.99        67
weighted avg       0.99      0.99      0.98        67





In [11]:
# Map the species names to integer codes; the encoder is fitted on the training
# labels and then reused for the test labels so both share the same mapping.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
print("Unique classes in y_train_encoded:", np.unique(y_train_encoded))

# Linear classifier with hinge loss trained by stochastic gradient descent.
# random_state fixes the shuffling of the training data, so the reported
# accuracy is the same on every run (matching the seeded train/test split).
sgd_classifier = SGDClassifier(loss='hinge', max_iter=1000, tol=1e-3, random_state=42)
sgd_classifier.fit(X_train, y_train_encoded)
sgd_predictions = sgd_classifier.predict(X_test)
print("SGD Classifier Performance:")
print("Accuracy score:" , accuracy_score( y_test_encoded, sgd_predictions))

Unique classes in y_train_encoded: [0 1 2]
SGD Classifier Performance:
Accuracy score: 0.9850746268656716
