In [19]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE


# Look into partial fitting. 
# Dataset is not evenly distributed with most being 1.
# Dummy Classifier has 50 % accuracy score... (Only give instances the most frequent label)


# steps_countvec = scipy.sparse.load_npz('recipe_text_features_countvec/train_steps_countvectorizer.pkl')

# Doc2Vect is a technique to transfer words into numerical representation. 
# https://www.shibumi-ai.com/post/a-gentle-introduction-to-doc2vec
d2v_ingr = pd.read_csv("../data/COMP30027_2021_Project2_datasets/recipe_text_features_doc2vec100/train_ingr_doc2vec100.csv", header=None)
d2v_name = pd.read_csv("../data/COMP30027_2021_Project2_datasets/recipe_text_features_doc2vec100/train_name_doc2vec100.csv", header=None)
d2v_steps = pd.read_csv("../data/COMP30027_2021_Project2_datasets/recipe_text_features_doc2vec100/train_steps_doc2vec100.csv", header=None)




# Extract class_labels from training set 
# quick = 1
# medium = 2
# slow = 3
data_train = pd.read_csv('../data/COMP30027_2021_Project2_datasets/recipe_train.csv')

# Obtain the labels
train_label = data_train.iloc[:,-1]

smote = SMOTE(random_state=42)

# Feature selection with f_classif (ANOVA F-value)
# ANOVA f-value shows how well a feature discriminate between classes
# The more discrimination, the better that feature is in predicting the class label.
# d2v_name_new = SelectKBest(k=90).fit_transform(d2v_name, train_label)
# d2v_ingr_new = SelectKBest(k=90).fit_transform(d2v_ingr, train_label)
# d2v_steps_new = SelectKBest(k=90).fit_transform(d2v_steps, train_label)
d2v_name_new = pd.DataFrame(d2v_name)
d2v_ingr_new = pd.DataFrame(d2v_ingr)
d2v_steps_new = pd.DataFrame(d2v_steps)
# print(d2v_ingr_new)
# print(d2v_steps_new)
# print(d2v_name_new.shape)


tot = []
for i, el in enumerate(data_train.iloc[:,1]):
    total = el+data_train.iloc[i,2]
    tot.append(total)
    
    
tot = pd.DataFrame(tot)

# Create a new dataframe of data, but this time, name, steps and ingr has been engineered to have doc2vec features.
# 100 doc2vec features were given, but we selected 20 best features using ANOVA f-value.
f_data = d2v_name_new.join(tot, on=None, how='left', lsuffix='_left', rsuffix='_right')
f_data = f_data.join(d2v_steps_new, on=None, how='left', lsuffix='_left', rsuffix='_right')
f_data = f_data.join(d2v_ingr_new, on=None, how='left', lsuffix='_left', rsuffix='_right')


# Standardise the data so that the mean is 0 
scaler = StandardScaler()
f_data = scaler.fit_transform(f_data)

# normalise all values to be between 0 and 1 for chi2
minmax_scaler = MinMaxScaler()
f_data = minmax_scaler.fit_transform(f_data)
   
# Feature selection for chi2
f_data = SelectKBest(chi2, k=20).fit_transform(f_data, train_label)






# Now we should have 20,246 instances for each class.
# Splitting the provided training into its own train/test
X_train, X_test, y_train, y_test = train_test_split(f_data, train_label, test_size=0.2, stratify=train_label, random_state=42)


# Oversampling, because the distribution of classes in training data is highly skewed towards quick and medium.
oversample = RandomOverSampler(sampling_strategy='minority')

# First random oversampling, bring count of class 3 to be equal to the highest class count.
X_oversampled, y_oversampled = oversample.fit_resample(X_train, y_train)

# Second random oversampling, bring the count of class 2 to be equal to the other 2 classes
X_oversampled, y_oversampled = oversample.fit_resample(X_oversampled, y_oversampled)



smX, smY = smote.fit_resample(X_train, y_train)



smX.shape, smY.shape
X_oversampled.shape, y_oversampled.shape

((48591, 20), (48591,))

In [23]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


from sklearn.decomposition import PCA


# pca = PCA(n_components = 2)
# pca.fit(X_oversampled)
# pca_x=pca.transform(X_oversampled)
# # test = pd.DataFrame(pca.components_)


# X_train2, X_test2, y_train2, y_test2 = train_test_split(pca_x, y_oversampled, test_size=0.2, stratify=y_oversampled, random_state=42)


knn_clf = KNeighborsClassifier(n_neighbors=10)
knn_clf.fit(smX, smY)


knn_predict = knn_clf.predict(X_test)
knn_predict_train = knn_clf.predict(smX)


print("test", accuracy_score(y_test, knn_predict))
print("train", accuracy_score(smY, knn_predict_train))
print(classification_report(y_test, knn_predict))

test 0.494
train 0.7081352513840011
              precision    recall  f1-score   support

         1.0       0.61      0.59      0.60      3541
         2.0       0.71      0.40      0.51      4049
         3.0       0.11      0.61      0.19       410

    accuracy                           0.49      8000
   macro avg       0.48      0.53      0.43      8000
weighted avg       0.63      0.49      0.53      8000

