In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Quick look at the raw file before assigning column names.
# - Forward slash: resolves on every OS and avoids the invalid '\m' escape that
#   the backslash form relied on.
# - header=None: the following cell supplies the column names explicitly, i.e. the
#   file carries no header row, so the first record must stay data instead of
#   being promoted to column labels.
df = pd.read_csv('datasets/magic04.data', header=None)
df.head()

In [None]:
# Names assigned to the columns while reading; the last entry is the label column.
columns = ['fLength','fWidth','fSize','fConc','fConc1','fAsym','fM3Long','fM3Trans','fAlpha','fDist','class']
# Forward slash keeps the path portable (a backslash is only a separator on Windows)
# and removes the invalid '\m' escape sequence from the string literal.
df = pd.read_csv('datasets/magic04.data', names=columns)
# Encode the label as an integer: 'g' -> 1, every other value (i.e. 'h') -> 0.
df['class'] = (df['class'] == 'g').astype(int)
df.head()

In [None]:
# Overlay the per-class distribution of every feature (all columns except the label).
# The label was encoded as g -> 1 and h -> 0, so class 0 is the hadron sample and
# class 1 is the gamma sample; the legend entries below follow that encoding.
for label in columns[:-1]:
    plt.hist(df.loc[df['class'] == 0, label], color='green', label='hadron', alpha=0.5, density=True)
    plt.hist(df.loc[df['class'] == 1, label], color='blue', label='gamma', alpha=0.5, density=True)

    plt.title(label)
    # density=True normalizes each histogram to unit area, so the y-axis is a
    # density rather than a raw count.
    plt.ylabel('probability density')
    plt.xlabel(label)
    plt.legend()

    plt.show()

In [None]:
# 60 / 20 / 20 train / validation / test split.
# The shuffle is seeded so that Restart & Run All reproduces the same partition,
# and the frames are cut with iloc instead of np.split, which routes DataFrames
# through the deprecated DataFrame.swapaxes.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(shuffled))
valid_end = int(0.8 * len(shuffled))

train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [None]:
def rescale_dataset(dataset, oversample=False, scaler=None, fit=True, random_state=None):
    """Standardize the features of ``dataset`` and optionally balance its classes.

    The last column of ``dataset`` is treated as the label; every other column
    is treated as a feature.

    To scale validation/test data with the statistics learned on the training
    data, create one scaler, pass it with ``fit=True`` for the training split
    and pass the same object with ``fit=False`` for the other splits.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose last column holds the labels.
    oversample : bool, default False
        When True, resample with ``RandomOverSampler`` after scaling.
    scaler : object, optional
        Object exposing ``fit_transform`` and ``transform``. When omitted a new
        ``StandardScaler`` is created, which reproduces the original behaviour.
    fit : bool, default True
        True fits ``scaler`` on this dataset before transforming it; False only
        applies an already fitted ``scaler``.
    random_state : optional
        Forwarded to ``RandomOverSampler`` so the resampling can be reproduced.

    Returns
    -------
    data : numpy.ndarray
        Scaled features with the labels appended as the last column.
    x : numpy.ndarray
        Scaled (and possibly oversampled) features.
    y : numpy.ndarray
        Labels matching the rows of ``x``.

    Raises
    ------
    ValueError
        If ``fit`` is False but no ``scaler`` was supplied.
    """
    if scaler is None:
        if not fit:
            # A freshly created scaler has no statistics to apply.
            raise ValueError("fit=False requires an already fitted scaler")
        scaler = StandardScaler()

    # Positional slicing: everything but the last column is a feature.
    x = dataset.iloc[:, :-1].to_numpy()
    y = dataset.iloc[:, -1].to_numpy()

    x = scaler.fit_transform(x) if fit else scaler.transform(x)

    if oversample:
        rand = RandomOverSampler(random_state=random_state)
        x, y = rand.fit_resample(x, y)

    # Re-attach the labels as a trailing column.
    data = np.hstack((x, np.reshape(y, (-1, 1))))

    return data, x, y

In [None]:
# Standardize each split; only the training split is oversampled.
# NOTE(review): as called here, rescale_dataset fits a separate scaler per call,
# so valid/test are standardized with their own statistics, not the training ones.
# NOTE(review): train/valid/test are rebound from DataFrames to ndarrays, so this
# cell cannot be re-run without re-running the split cell first.
train, x_train, y_train = rescale_dataset(dataset=train, oversample=True)
valid, x_valid, y_valid = rescale_dataset(dataset=valid)
test, x_test, y_test = rescale_dataset(dataset=test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [None]:
# Build a 3-nearest-neighbours classifier and fit it on the training arrays.
# fit() returns the estimator itself, so construction and fitting can be chained.
knn = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
knn

In [None]:
# Predict the test features with the fitted model and print the classification
# report comparing the predictions with the true test labels.
y_pred = knn.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_pred))