# KNN
## Lectura de datos

In [91]:
import pandas as pd
from pandas import Series,DataFrame

# Load the wine dataset from the CSV file
setInicial = pd.read_csv('Vinos.csv')

# Header split: every column but the last is an attribute, the last is the class
columnas = setInicial.columns
atributosName, atributoClase = columnas[:-1], columnas[-1]
for nombre in (atributoClase, atributosName):
    print(nombre)

# Attribute values
dataGral = setInicial[atributosName]

# Class values (one label per row)
claseGral = setInicial[atributoClase]

# Number of instances per class
print(claseGral.value_counts())


Clase
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol'],
      dtype='object')
Clase
Calidad_5    681
Calidad_7    199
Name: count, dtype: int64


## Generar un solo modelo

Implementación del algoritmo

### Funciones de distancia


In [92]:
import math

# Euclidean distance
def euclidiana(vector_i, vector_j):
  """Return the Euclidean distance between two numeric sequences.

  The sequences are paired with ``zip``, so when their lengths differ the
  trailing elements of the longer one are ignored.

  Args:
    vector_i: Iterable of numbers.
    vector_j: Iterable of numbers.

  Returns:
    float: Square root of the sum of the squared coordinate differences.
  """
  suma_cuadrados = 0
  for a, b in zip(vector_i, vector_j):
    suma_cuadrados = suma_cuadrados + math.pow(a - b, 2)
  return math.sqrt(suma_cuadrados)



# Manhattan distance
def manhattan(vector_i, vector_j):
  """Return the Manhattan (city-block) distance between two numeric sequences.

  The sequences are paired with ``zip``, so when their lengths differ the
  trailing elements of the longer one are ignored.

  Args:
    vector_i: Iterable of numbers.
    vector_j: Iterable of numbers.

  Returns:
    The sum of the absolute coordinate differences (0 when nothing is paired).
  """
  distancia = 0
  pares = zip(vector_i, vector_j)
  for a, b in pares:
    distancia = distancia + math.fabs(a - b)
  return distancia

# Cosine
def coseno(vector_i, vector_j):
  """Return the cosine of the angle between two numeric sequences.

  This is a similarity, not a distance: it is 1.0 for vectors that point in
  the same direction. The sequences are paired with ``zip``, so trailing
  elements of the longer one are ignored.

  Args:
    vector_i: Iterable of numbers.
    vector_j: Iterable of numbers.

  Returns:
    float: Dot product divided by the product of the two Euclidean norms.

  Raises:
    ZeroDivisionError: If either paired vector has zero norm.
  """
  producto_punto = 0
  norma2_i = 0
  norma2_j = 0
  for a, b in zip(vector_i, vector_j):
    producto_punto += a * b
    norma2_i += math.pow(a, 2)
    norma2_j += math.pow(b, 2)

  producto_normas = math.sqrt(norma2_i) * math.sqrt(norma2_j)
  return producto_punto / producto_normas




In [93]:
import random

def KNN(datosEntrenamiento, datoPredecir, metricaDistancia, cantidadVecinos):
  """Classify every row of ``datoPredecir`` by majority vote of its nearest neighbours.

  Args:
    datosEntrenamiento: DataFrame holding the attribute columns plus a
      'Clase' column with the known label of each row. It is not modified.
    datoPredecir: DataFrame with the attribute columns of the rows to
      classify; any index is accepted. Every column except 'Clase' is used
      as an attribute. An existing 'Clase' column is ignored as input and
      overwritten with the new predictions.
    metricaDistancia: 'euclidiana' or 'manhattan'.
    cantidadVecinos: Number of nearest neighbours that vote (at least 1).

  Returns:
    The 'Clase' column of ``datoPredecir``: one predicted label per row.
    Ties between the most voted classes are broken with ``random.choice``.

  Raises:
    ValueError: Unknown metric, ``cantidadVecinos`` below 1, or no training rows.
    KeyError: An attribute column of ``datoPredecir`` is missing from
      ``datosEntrenamiento``.
  """
  columnaClase = 'Clase'

  # Vectorised forms of the same formulas as euclidiana() / manhattan(): each
  # receives the frame of coordinate differences and yields one distance per row.
  metricas = {
      'euclidiana': lambda diferencias: diferencias.pow(2).sum(axis=1).pow(0.5),
      'manhattan': lambda diferencias: diferencias.abs().sum(axis=1),
  }
  if metricaDistancia not in metricas:
    raise ValueError(
        "metricaDistancia must be one of %s, got %r"
        % (sorted(metricas), metricaDistancia))
  if cantidadVecinos < 1:
    raise ValueError("cantidadVecinos must be at least 1")
  if len(datosEntrenamiento) == 0:
    raise ValueError("datosEntrenamiento has no rows")
  calcularDistancias = metricas[metricaDistancia]

  # Attributes are taken from the frame to classify, so a 'Clase' column left
  # by a previous call never takes part in the distance.
  atributos = [col for col in datoPredecir.columns if col != columnaClase]
  faltantes = [col for col in atributos if col not in datosEntrenamiento.columns]
  if faltantes:
    raise KeyError("columns missing from datosEntrenamiento: %s" % faltantes)

  valoresEntrenamiento = datosEntrenamiento[atributos].astype(float)
  clasesEntrenamiento = datosEntrenamiento[columnaClase]
  puntos = datoPredecir[atributos].astype(float)

  predicciones = []
  for _, punto in puntos.iterrows():
    # DataFrame - Series aligns the point's labels with the attribute columns.
    distancias = calcularDistancias(valoresEntrenamiento - punto)

    # Positional, stable ordering: equidistant rows keep their training order
    # and a non-default training index cannot select extra rows.
    masCercanos = distancias.to_numpy().argsort(kind='stable')[:cantidadVecinos]
    votos = clasesEntrenamiento.iloc[masCercanos].value_counts()

    # Sorting the tied labels keeps the random tie-break independent of the
    # order in which value_counts happens to list them.
    empatadas = sorted(votos[votos == votos.max()].index, key=str)
    if len(empatadas) == 1:
      predicciones.append(empatadas[0])
    else:
      predicciones.append(random.choice(empatadas))

  # Assigned by position, so the caller's index is preserved whatever it is.
  # Callers rely on the predictions also being stored in datoPredecir.
  datoPredecir[columnaClase] = predicciones
  return datoPredecir[columnaClase]

Entrenamiento

Clasificación (predicción)

In [94]:
# Attribute names, in the same order as the values of each row below
nombresAtributos = ['fixed acidity', 'volatile acidity', 'citric acid',
                    'residual sugar', 'chlorides', 'free sulfur dioxide',
                    'total sulfur dioxide', 'density', 'pH', 'sulphates',
                    'alcohol']

# One tuple per wine to classify
filasPredic = [
    (7.4, 0.5, 0, 1.9, 0.17, 7, 145, 0.9966, 3.11, 1.28, 9.2),
    (7.5, 0.49, 0.11, 1.8, 0.066, 10, 71, 0.9964, 3.17, 0.83, 8),
    (7.9, 0.6, 0.21, 2.2, 0.106, 25, 37, 0.9978, 3.28, 1.2, 10),
]

# Transpose the rows so the frame is still built from one list per column
setPredic = pd.DataFrame({nombre: list(valores)
                          for nombre, valores in zip(nombresAtributos, zip(*filasPredic))})

# Show the rows to classify
print(setPredic)

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.50         0.00             1.9      0.170   
1            7.5              0.49         0.11             1.8      0.066   
2            7.9              0.60         0.21             2.2      0.106   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                    7                   145   0.9966  3.11       1.28   
1                   10                    71   0.9964  3.17       0.83   
2                   25                    37   0.9978  3.28       1.20   

   alcohol  
0      9.2  
1      8.0  
2     10.0  


In [95]:
# Assign a class to each row of setPredic
# Training table: the attribute columns followed by the class column, re-indexed from 0
atributosEntrenamiento = dataGral.reset_index(drop=True)
etiquetasEntrenamiento = claseGral.reset_index(drop=True)
combined_df = pd.concat([atributosEntrenamiento, etiquetasEntrenamiento], axis='columns')

# 5 nearest neighbours with Euclidean distance
Clases = KNN(combined_df, setPredic, metricaDistancia='euclidiana', cantidadVecinos=5)
print(Clases)

0    Calidad_5
1    Calidad_5
2    Calidad_7
Name: Clase, dtype: object


In [96]:
# Rebuild the training table (attributes + class) and classify the training
# rows themselves with 5 neighbours.
combined_df = pd.concat(
    [dataGral.reset_index(drop=True), claseGral.reset_index(drop=True)],
    axis=1,
)

# KNN stores its predictions in a 'Clase' column of the frame it classifies,
# so dataGral gains that column here.
predictExp2 = KNN(combined_df, dataGral, 'euclidiana', 5)

print(len(claseGral), len(predictExp2))

# Count the rows whose predicted label matches the real one
Aciertos = sum(1 for x in range(len(claseGral)) if claseGral[x] == predictExp2[x])

# Hits and misses
print(Aciertos, len(claseGral) - Aciertos)

880 880
785 95


## Separación de conjuntos
Entrenamiento

In [97]:
from sklearn.model_selection import train_test_split

# Split the data: 70 % for training, 30 % for evaluation.
# The attribute columns are selected by name. Dropping "the last column" of
# dataGral is only right when an earlier KNN call has appended 'Clase' to it;
# otherwise it would silently discard the 'alcohol' attribute.
# random_state fixes the shuffle so the split is the same on every run.
valoresTrain, valoresTest, clasesTrain, clasesTest = train_test_split(
    dataGral[atributosName], claseGral, test_size=0.30, random_state=42)

print(len(valoresTrain), len(valoresTest))

valoresTrain.head()


616 264
     fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
257           15.5             0.645         0.49             4.2      0.095   
259           15.6             0.645         0.49             4.2      0.095   
688            9.6             0.320         0.47             1.4      0.056   
233           10.6             0.280         0.39            15.5      0.069   
653            6.1             0.705         0.10             2.8      0.081   
..             ...               ...          ...             ...        ...   
648            7.4             0.470         0.46             2.2      0.114   
817            7.6             0.310         0.34             2.5      0.082   
400            7.3             0.490         0.10             2.6      0.068   
532            6.1             0.640         0.02             2.4      0.069   
608            7.5             0.400         0.18             1.6      0.079   

     free sulfur dioxide  total

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
257,15.5,0.645,0.49,4.2,0.095,10.0,23.0,1.00315,2.92,0.74,11.1
259,15.6,0.645,0.49,4.2,0.095,10.0,23.0,1.00315,2.92,0.74,11.1
688,9.6,0.32,0.47,1.4,0.056,9.0,24.0,0.99695,3.22,0.82,10.3
233,10.6,0.28,0.39,15.5,0.069,6.0,23.0,1.0026,3.12,0.66,9.2
653,6.1,0.705,0.1,2.8,0.081,13.0,28.0,0.99631,3.6,0.66,10.2


Clasificación (evaluación)

In [98]:
from sklearn.metrics import accuracy_score

# Re-index every split from 0 so the frames handed to KNN and the label
# vectors share the same 0..n-1 row labels. Rebinding instead of
# inplace=True leaves the objects returned by train_test_split untouched.
# Selecting atributosName keeps only attribute columns, which makes the cell
# re-runnable after KNN has stored a 'Clase' column in these frames.
valoresTrain = valoresTrain[atributosName].reset_index(drop=True)
valoresTest = valoresTest[atributosName].reset_index(drop=True)
clasesTrain = clasesTrain.reset_index(drop=True)
clasesTest = clasesTest.reset_index(drop=True)

# Training table: attributes plus the known class of each training row
combined_train_df = pd.concat([valoresTrain, clasesTrain], axis=1)

# Predictions for the held-out rows and for the training rows themselves
predict = KNN(combined_train_df, valoresTest, 'euclidiana', 5)
predictTrain = KNN(combined_train_df, valoresTrain, 'euclidiana', 5)

print("Conjunto de evaluación: ", accuracy_score(clasesTest, predict))
print("Conjunto de entrenamiento: ", accuracy_score(clasesTrain, predictTrain))


Conjunto de evaluación:  0.8371212121212122
Conjunto de entrenamiento:  0.8831168831168831


## Overfitting y underfitting

In [99]:
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier

# Numbers of neighbours (k) to compare
prof = [5, 6, 7, 8, 9]

# Accuracy obtained with each k on the training and evaluation sets
Train = []
Test = []

for x in prof:
    predict = KNN(combined_train_df, valoresTest, 'euclidiana', x)
    predictTrain = KNN(combined_train_df, valoresTrain, 'euclidiana', x)

    Test.append(accuracy_score(clasesTest, predict))
    Train.append(accuracy_score(clasesTrain, predictTrain))

print(Test)
print(Train)

# Explicit figure/axes with labelled axes, so the plot can be read on its own
fig, ax = plt.subplots()
ax.plot(prof, Train, label='Train', color='red', linestyle='dotted')
ax.plot(prof, Test, label='Test', color='green', linestyle='dotted')
ax.set_xticks(prof)
ax.set_xlabel('Número de vecinos (k)')
ax.set_ylabel('Exactitud')
ax.set_title('KNN: exactitud en entrenamiento y evaluación')
ax.legend()
plt.show()


KeyboardInterrupt: 

In [None]:
%%shell
# Export a notebook to HTML with nbconvert.
# NOTE(review): the target is 2_1_Arbol.ipynb while this notebook is about KNN;
# confirm that this is the file meant to be exported.
jupyter nbconvert --to html 2_1_Arbol.ipynb