In [56]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [57]:
cuisines_df = pd.read_csv("A:/Documents/School/Project401/ML-For-Beginners/4-Classification/data/cuisines.csv")
cuisines_df.head()

Unnamed: 0.1,Unnamed: 0,cuisine,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,65,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,66,indian,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,67,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,68,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,69,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [58]:
cuisines_label_df = cuisines_df['cuisine']
cuisines_label_df.head()

0    indian
1    indian
2    indian
3    indian
4    indian
Name: cuisine, dtype: object

In [59]:
cuisines_feature_df = cuisines_df.drop(['Unnamed: 0', 'cuisine'], axis=1)
cuisines_feature_df.head()

Unnamed: 0,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,artichoke,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [60]:
X_train, X_test, y_train, y_test = train_test_split(cuisines_feature_df, cuisines_label_df, test_size=0.3)



### AdaBoost

In [61]:
C = 10
# Create different classifiers.
classifiers = {
    'ADA': AdaBoostClassifier(n_estimators=100)
    
}
n_classifiers = len(classifiers)

for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))
    print(classification_report(y_test,y_pred))

Accuracy (train) for ADA: 64.1% 
              precision    recall  f1-score   support

     chinese       0.60      0.40      0.48       152
      indian       0.84      0.88      0.86       168
    japanese       0.36      0.54      0.43       101
      korean       0.72      0.72      0.72       223
        thai       0.58      0.52      0.55        91

    accuracy                           0.64       735
   macro avg       0.62      0.61      0.61       735
weighted avg       0.66      0.64      0.64       735



### Tweaking C

In [62]:
C = 100
# Create different classifiers.
classifiers = {
    'ADA': AdaBoostClassifier(n_estimators=100)
    
}
n_classifiers = len(classifiers)

for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))
    print(classification_report(y_test,y_pred))

Accuracy (train) for ADA: 64.1% 
              precision    recall  f1-score   support

     chinese       0.60      0.40      0.48       152
      indian       0.84      0.88      0.86       168
    japanese       0.36      0.54      0.43       101
      korean       0.72      0.72      0.72       223
        thai       0.58      0.52      0.55        91

    accuracy                           0.64       735
   macro avg       0.62      0.61      0.61       735
weighted avg       0.66      0.64      0.64       735



Increasing C does not result in a changing result (not better, not worse)

### Tweaking n_estimators
n_estimators are the weak learners that are used in training. If the amount is changed to more, the accuracy should increase as well, however this is not true as there is a tradeoff because the learning rate is also affected by this.

In [63]:
classifiers = {
    'ADA1': AdaBoostClassifier(n_estimators=20),
    'ADA2': AdaBoostClassifier(n_estimators=1000)
    
}
n_classifiers = len(classifiers)

for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))
    print(classification_report(y_test,y_pred))

Accuracy (train) for ADA1: 70.7% 
              precision    recall  f1-score   support

     chinese       0.65      0.48      0.55       152
      indian       0.89      0.87      0.88       168
    japanese       0.68      0.32      0.43       101
      korean       0.62      0.91      0.74       223
        thai       0.81      0.71      0.76        91

    accuracy                           0.71       735
   macro avg       0.73      0.66      0.67       735
weighted avg       0.72      0.71      0.69       735

Accuracy (train) for ADA2: 63.9% 
              precision    recall  f1-score   support

     chinese       0.57      0.36      0.44       152
      indian       0.79      0.83      0.81       168
    japanese       0.45      0.64      0.53       101
      korean       0.81      0.69      0.74       223
        thai       0.46      0.65      0.54        91

    accuracy                           0.64       735
   macro avg       0.61      0.63      0.61       735
weighted 

Changing n_estimators to a higher number makes the accuracy worse

### Learning rate

In [66]:
classifiers = {
    'ADA1': AdaBoostClassifier(n_estimators=100, learning_rate=1),
    'ADA2': AdaBoostClassifier(n_estimators=100, learning_rate=0.1)
    
}
n_classifiers = len(classifiers)

for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))
    print(classification_report(y_test,y_pred))

Accuracy (train) for ADA1: 64.1% 
              precision    recall  f1-score   support

     chinese       0.60      0.40      0.48       152
      indian       0.84      0.88      0.86       168
    japanese       0.36      0.54      0.43       101
      korean       0.72      0.72      0.72       223
        thai       0.58      0.52      0.55        91

    accuracy                           0.64       735
   macro avg       0.62      0.61      0.61       735
weighted avg       0.66      0.64      0.64       735

Accuracy (train) for ADA2: 70.2% 
              precision    recall  f1-score   support

     chinese       0.65      0.57      0.61       152
      indian       0.90      0.88      0.89       168
    japanese       0.82      0.18      0.29       101
      korean       0.60      0.96      0.74       223
        thai       0.88      0.54      0.67        91

    accuracy                           0.70       735
   macro avg       0.77      0.63      0.64       735
weighted 

Decreasing the learning rate to (in this example 0.1) makes the accuracy higher