In [1]:
import pandas as pd
import numpy as np
import sklearn.metrics as skm
from sklearn.model_selection import train_test_split

In [3]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
wine = fetch_ucirepo(id=109) 
  
# data (as pandas dataframes) 
X = wine.data.features 
y = wine.data.targets 
  


{'uci_id': 109, 'name': 'Wine', 'repository_url': 'https://archive.ics.uci.edu/dataset/109/wine', 'data_url': 'https://archive.ics.uci.edu/static/public/109/data.csv', 'abstract': 'Using chemical analysis to determine the origin of wines', 'area': 'Physical Science', 'tasks': ['Classification'], 'characteristics': ['Tabular'], 'num_instances': 178, 'num_features': 13, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1992, 'last_updated': 'Mon Aug 28 2023', 'dataset_doi': '10.24432/C5PC7J', 'creators': ['Stefan Aeberhard', 'M. Forina'], 'intro_paper': {'title': 'Comparative analysis of statistical pattern recognition methods in high dimensional settings', 'authors': 'S. Aeberhard, D. Coomans, O. Vel', 'published_in': 'Pattern Recognition', 'year': 1994, 'url': 'https://www.semanticscholar.org/paper/83dc3e4030d7b9fbdbb4bde03ce12ab70ca10528', 'doi': '

In [5]:
X_train,X_test,y_train,y_test= train_test_split(X,y, test_size=0.3, stratify=y )

In [6]:
print(type(X_train))
print(type(X_test))
print(type(y_train))
print(type(y_test))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [7]:
y_train

Unnamed: 0,class
172,3
24,1
116,2
148,3
60,2
...,...
30,1
99,2
146,3
66,2


In [12]:
len(X_train)

124

In [16]:
def euclidean_distance(p:np.ndarray,q:np.ndarray):
    return np.sqrt(np.sum((p-q)**2))

def pred(x, x_train, y_train, k):
    dist = []
    for i, row in x_train.iterrows():
        dist.append(euclidean_distance(row, x))
    dist2 = np.argsort(dist)[:k]
    k_nearest_labels = []
    for i in dist2:
        k_nearest_labels.append(y_train.iloc[i])

    # Convert the k_nearest_labels list to a flat NumPy array
    k_nearest_labels = np.array(k_nearest_labels).flatten()
    return np.bincount(k_nearest_labels).argmax()


    
def pred_all(test,train,y_train,k):
    pred_todos=[]
    for i, row in test.iterrows():
        pred_todos.append(pred(row,train,y_train,k))
    return pred_todos
#    return np.array(y_pred)
        

In [17]:
ans1=pred_all(X_test,X_train,y_train,5)
ans2=pred_all(X_test,X_train,y_train,3)
ans3=pred_all(X_test,X_train,y_train,1)

In [18]:
y_test=y_test.astype(int)

In [19]:
precision1= skm.precision_score(ans1,y_test,average='weighted')
precision2= skm.precision_score(ans2,y_test,average='weighted')
precision3= skm.precision_score(ans3,y_test,average='weighted')


In [20]:
print("Precision score with k=5:",round(precision1,2)*100,"%")
print("Precision score with k=3:",round(precision2,2)*100,"%")
print("Precision score with k=1:",round(precision3,2)*100,"%")

Precision score with k=5: 81.0 %
Precision score with k=3: 74.0 %
Precision score with k=1: 80.0 %


In [21]:
exactitud=skm.accuracy_score(ans1,y_test)
exactitud2=skm.accuracy_score(ans2,y_test)
exactitud3=skm.accuracy_score(ans3,y_test)

print("Exactitud para k=5",round(exactitud,2)*100,"%")
print("Exactitud para k=3",round(exactitud2,2)*100,"%")
print("Exactitud para k=1",round(exactitud3,2)*100,"%")

Exactitud para k=5 81.0 %
Exactitud para k=3 70.0 %
Exactitud para k=1 78.0 %


In [22]:
sensibilidad=skm.recall_score(ans1,y_test,average='weighted')
sensibilidad2=skm.recall_score(ans2,y_test,average='weighted')
sensibilidad3=skm.recall_score(ans3,y_test,average='weighted')
print("Sensibilidad para k=5",round(sensibilidad,2)*100,"%")
print("Sensibilidad para k=3",round(sensibilidad2,2)*100,"%")
print("Sensibilidad para k=1",round(sensibilidad3,2)*100,"%")

Sensibilidad para k=5 81.0 %
Sensibilidad para k=3 70.0 %
Sensibilidad para k=1 78.0 %


In [23]:
y_train=y_train.astype(int)

In [24]:
from sklearn.neighbors import KNeighborsClassifier as KNN
neigh= KNN(n_neighbors=5)
neigh.fit(X=X_train,y=y_train)

  return self._fit(X, y)


In [25]:
predction=neigh.predict(X_test)

In [26]:
precision_neigh= skm.precision_score(predction,y_test,average='weighted')
print("Precision score with k=5:",round(precision_neigh,4)*100,"%")

Precision score with k=5: 81.33 %


In [27]:
neigh2= KNN(n_neighbors=3)
neigh2.fit(X=X_train,y=y_train)

  return self._fit(X, y)


In [28]:
predction2=neigh.predict(X_test)

In [29]:
precision_neigh2= skm.precision_score(predction2,y_test,average='weighted')
print("Precision score with k=3:",round(precision_neigh2,4)*100,"%")

Precision score with k=3: 81.33 %


In [30]:
neigh3= KNN(n_neighbors=1)
neigh3.fit(X=X_train,y=y_train)

  return self._fit(X, y)


In [31]:
predction3=neigh3.predict(X_test)

In [32]:
precision_neigh3= skm.precision_score(predction3,y_test,average='weighted')
print("Precision score with k=3:",round(precision_neigh3,4)*100,"%")

Precision score with k=3: 79.57 %
