In [43]:
import pandas as pd
from sklearn import preprocessing, model_selection, neighbors

## Load the dataset

In [44]:
input_file = "data.csv"
df = pd.read_csv(input_file, header = 0)

In [45]:
dataset = df.values
X = dataset[:,1:7]
y = dataset[:,7]
y = y.astype('int')

In [46]:
sc = preprocessing.StandardScaler()
X = sc.fit_transform(X)

## Apply K Nearest Neighbor

In [47]:
score_train = 0
score_cross = 0
for i in range(1000):
    X_train, X_cross, y_train, y_cross = model_selection.train_test_split(X, y, train_size=.8, random_state=i)
    neigh = neighbors.KNeighborsClassifier(n_neighbors=3, p=1)
    neigh.fit(X_train, y_train)
    score_train = score_train + neigh.score(X_train, y_train)
    score_cross = score_cross + neigh.score(X_cross, y_cross)
score_train = score_train / 10
print('Train Accuracy: %.2f' % score_train)
score_cross = score_cross / 10
print('Cross Accuracy: %.2f' % score_cross)

Train Accuracy: 85.00
Cross Accuracy: 69.58


In [48]:
neigh.predict_proba(X_cross)

array([[0.66666667, 0.33333333],
       [0.66666667, 0.33333333],
       [0.33333333, 0.66666667],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [0.66666667, 0.33333333],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.66666667, 0.33333333],
       [0.66666667, 0.33333333],
       [1.        , 0.        ]])

## Analyze errors

In [49]:
X_train, X_cross, y_train, y_cross = model_selection.train_test_split(X, y, train_size=.8, random_state=1)
neigh = neighbors.KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, y_train)
neigh.score(X_cross, y_cross)

0.6363636363636364

In [50]:
print('Actual   :', y_cross)
print('Predicted:', neigh.predict(X_cross))

Actual   : [1 0 0 1 0 0 0 1 0 1 1]
Predicted: [0 0 1 1 0 0 0 1 0 0 0]


In [51]:
def find_video(x_values):
    for v in range(len(X)):
        if (X[v, :] == x_values).all():
            return df.at[v, 'File name']
    print('No such video', x_values)

In [52]:
predict_cross = neigh.predict(X_cross)
failures = []
for i in range(len(X_cross)):
    if predict_cross[i] != y_cross[i]:
        failures.append(find_video(X_cross[i]))
print(failures)

['flame-spray-7.avi', 'flame-spray-45.avi', 'flame-spray-50.avi', 'flame-spray-20.avi']


## Apply Radius Neighbor 

In [53]:
for i in range(1000):
    X_train, X_cross, y_train, y_cross = model_selection.train_test_split(X, y, train_size=.8, random_state=i)
    neigh = neighbors.RadiusNeighborsClassifier(radius=2.4)
    neigh.fit(X_train, y_train)
    score_train = score_train + neigh.score(X_train, y_train)
    score_cross = score_cross + neigh.score(X_cross, y_cross)
score_train = score_train / 10
print('Train Accuracy: %.2f' % score_train)
score_cross = score_cross / 10
print('Cross Accuracy: %.2f' % score_cross)

Train Accuracy: 78.88
Cross Accuracy: 65.22


## KNN with weights based on random forest

In [62]:
fs = [0.15940957, 0.13788721, 0.17424087, 0.16455568, 0.17854944, 0.18535723]
X = X * fs

In [63]:
score_train = 0
score_cross = 0
for i in range(1000):
    X_train, X_cross, y_train, y_cross = model_selection.train_test_split(X, y, train_size=.8, random_state=i)
    neigh = neighbors.KNeighborsClassifier(n_neighbors=3, p=1)
    neigh.fit(X_train, y_train)
    score_train = score_train + neigh.score(X_train, y_train)
    score_cross = score_cross + neigh.score(X_cross, y_cross)
score_train = score_train / 10
print('Train Accuracy: %.2f' % score_train)
score_cross = score_cross / 10
print('Cross Accuracy: %.2f' % score_cross)



Train Accuracy: 86.18
Cross Accuracy: 70.55
