# 1. Import the libraries

In [66]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# 2. Load the Letter Recognition data.
## Pull data online and rename the columns.

In [67]:
raw_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data',header=None, sep = ',')
columns = ['lettr', 'x-box', 'y-box', 'width', 'high', 'onpix', 'x-bar', 'y-bar', 'x2bar', 'y2bar', 'xybar', 'x2ybr', 'xy2br', 'x-ege', 'xegvy', 'y-ege', 'yegvx']
raw_data.columns = columns

# 3. Find the pairs and make them a new data set.
## 3.1 Pair 1: H and K; 

In [68]:
HK = raw_data[raw_data['lettr'].isin(['H', 'K'])]
le = LabelEncoder()
le.fit(HK['lettr'])

# Transform the target column using the LabelEncoder
HK['lettr'] = le.transform(HK['lettr'])
HK = HK.reset_index(drop=True)
HK_X_train, HK_X_test, HK_y_train, HK_y_test = train_test_split(HK.drop('lettr', axis=1), HK['lettr'], test_size=0.1, random_state=514)
# Reset index of HK_X_train
HK_X_train = HK_X_train.reset_index(drop=True)

# Reset index of HK_X_test
HK_X_test = HK_X_test.reset_index(drop=True)

# Reset index of HK_y_train
HK_y_train = HK_y_train.reset_index(drop=True)

# Reset index of HK_y_test
HK_y_test = HK_y_test.reset_index(drop=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  HK['lettr'] = le.transform(HK['lettr'])


## 3.2 Pair 2: M and Y;

In [69]:
MY = raw_data[raw_data['lettr'].isin(['M', 'Y'])]
MY = MY.reset_index(drop=True)
MY_X_train, MY_X_test, MY_y_train, MY_y_test = train_test_split(MY.drop('lettr', axis=1), MY['lettr'], test_size=0.1, random_state=514)

## 3.3 Pair 3: X and T;

In [70]:
XT = raw_data[raw_data['lettr'].isin(['X','T'])]
XT = XT.reset_index(drop=True)
XT_X_train, XT_X_test, XT_y_train, XT_y_test = train_test_split(XT.drop('lettr', axis=1), XT['lettr'], test_size=0.1, random_state=514)

# 4. Model Fitting

## 4.1 K-nearest neighbors

### 4.1.1 Preparation

In [75]:
class KNearestNeighbor:

    def __init__(self, k=3):
        self.n_neighbors = k

    def train(self, X, y):
        self.X_train = X
        self.y_train = y

    def euclidean_distance(self, row1, row2):
        return np.sqrt(np.sum((row1 - row2)**2))

    def predict(self, X_test):
        y_pred = []
        
        for i in range(1,len(X_test)):
            distances = []
            
            for j in range(1,len(self.X_train)):
                dist = self.euclidean_distance(X_test.iloc[i, :], self.X_train.iloc[j, :])
                
                distances.append((j, dist))
            
            sorted_distances = sorted(distances, key=lambda x: x[1])
            k_nearest_neighbors = [self.y_train[x[0]] for x in sorted_distances[:self.n_neighbors]]
            most_common_label = max(set(k_nearest_neighbors), key=k_nearest_neighbors.count)
            y_pred.append(most_common_label)
            
        return y_pred
            
KNN = KNearestNeighbor(k=3)
KNN.train(HK_X_train.reset_index(drop=True), HK_y_train.reset_index(drop=True))
HK_y_pred = KNN.predict(HK_X_test)
acc = np.mean(HK_y_pred == HK_y_test)
print('测试集预测准确率：%f' % acc)

## 4.2 Naïve Bayes Classifier

Unnamed: 0,x-box,y-box,width,high,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx
count,1325.0,1325.0,1325.0,1325.0,1325.0,1325.0,1325.0,1325.0,1325.0,1325.0,1325.0,1325.0,1325.0,1325.0,1325.0,1325.0
mean,4.407547,7.084528,5.841509,5.286792,4.092075,6.487547,7.186415,5.281509,4.767547,8.138868,6.038491,8.796981,3.950943,7.869434,3.591698,8.346415
std,1.890093,3.312128,2.19253,2.204606,2.181834,1.802785,1.051167,3.784409,2.238499,1.59077,1.8803,1.59218,1.45989,1.028452,2.22593,1.598714
min,0.0,0.0,1.0,0.0,0.0,2.0,4.0,1.0,0.0,5.0,1.0,4.0,2.0,2.0,0.0,3.0
25%,3.0,5.0,4.0,4.0,2.0,5.0,7.0,3.0,3.0,7.0,5.0,8.0,3.0,8.0,3.0,7.0
50%,4.0,7.0,6.0,6.0,4.0,7.0,7.0,4.0,6.0,7.0,6.0,8.0,3.0,8.0,3.0,8.0
75%,5.0,9.0,7.0,7.0,5.0,8.0,8.0,7.0,7.0,10.0,7.0,10.0,5.0,8.0,5.0,9.0
max,12.0,15.0,12.0,10.0,11.0,11.0,12.0,15.0,9.0,12.0,12.0,13.0,10.0,12.0,11.0,14.0
