# Many small classifiers or one big one?

##### Loading Data

In [2]:
from sign.CONST import DATA_BASE_PATH
import os
import csv

from sign.training.load_data.StaticLandmarkLoader import StaticLandmarkLoader

# Location of the keypoints CSV, relative to the notebook's working directory.
# os.path.join produces the same string as the manual os.sep concatenation.
DATA_PATH = os.path.join("..", "backend", "model", "keypoints_from_data.csv")

# Load the data; the returned object exposes landmarks_train / labels_train and
# landmarks_test / labels_test, which the rest of the notebook relies on.
loader = StaticLandmarkLoader()
train_data = loader.load_training_data(DATA_PATH)

# The second figure is the size of the *test* split (it was mislabelled as "train").
print("train set size: ", len(train_data.labels_train),
      " - test set size: ", len(train_data.labels_test))

train set size:  39844  - train set size:  9961


#### Train a binary classifier: A or not A

In [22]:
from sklearn.svm import LinearSVC, SVC
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline, make_pipeline
def train_hotdog_or_not_hotdog_as_SDG(train_data, target_labels):
    """Fit an ``SGDClassifier`` on the given samples and return the fitted model.

    Args:
        train_data: Feature rows to fit on. Note that this parameter shadows
            the notebook-level ``train_data`` object inside the function.
        target_labels: One target per row of ``train_data``.

    Returns:
        The fitted ``SGDClassifier`` (created with ``random_state=42``).
    """
    # fit() hands back the estimator itself, so build-and-fit can be one expression.
    return SGDClassifier(random_state=42).fit(train_data, target_labels)

def train_hotdog_or_not_hotdog_as_LinearSVM(train_data, target_labels, scaler="standard") -> Pipeline:
    """Fit a preprocessing step followed by a ``LinearSVC`` and return the pipeline.

    Args:
        train_data: Feature rows to fit on. Note that this parameter shadows
            the notebook-level ``train_data`` object inside the function.
        target_labels: One target per row of ``train_data``.
        scaler: Which preprocessing step precedes the SVM. ``"standard"`` uses
            ``StandardScaler()``; ``"polynomial"`` uses ``PolynomialFeatures()``
            with its default settings (a feature expansion rather than a
            scaler, despite the parameter name).

    Returns:
        The fitted ``Pipeline``.

    Raises:
        ValueError: If ``scaler`` is not one of the two supported names.
            ``ValueError`` is a subclass of ``Exception``, so callers catching
            the previously raised bare ``Exception`` keep working.
    """
    # Pick only the preprocessing step here; the SVM is identical for both options.
    if scaler == "standard":
        preprocessing = StandardScaler()
    elif scaler == "polynomial":
        preprocessing = PolynomialFeatures()
    else:
        raise ValueError("Nah bro, that scaler doesn't exist")

    classifier = make_pipeline(preprocessing,
                               LinearSVC(C=5, random_state=42, max_iter=10_000))
    classifier.fit(train_data, target_labels)
    return classifier

def train_hotdog_or_not_hotdog_as_SVC(train_data, target_labels,
                                      degree=3, C = 5) -> Pipeline:
    """Fit ``StandardScaler`` followed by a polynomial-kernel ``SVC``.

    Args:
        train_data: Feature rows to fit on. Note that this parameter shadows
            the notebook-level ``train_data`` object inside the function.
        target_labels: One target per row of ``train_data``.
        degree: Passed to ``SVC`` as the polynomial kernel degree.
        C: Passed to ``SVC`` as its ``C`` parameter.

    Returns:
        The fitted ``Pipeline``.
    """
    poly_kernel_svc = SVC(kernel="poly", degree=degree, coef0=1, C=C)
    model = make_pipeline(StandardScaler(), poly_kernel_svc)
    # Pipeline.fit returns the pipeline itself.
    return model.fit(train_data, target_labels)

def print_classification_report_hotdog(model, target_labels, test_landmarks=None):
    """Print the classification report and confusion matrix for ``model``.

    Args:
        model: Fitted estimator exposing ``predict``.
        target_labels: True targets, one per row of the evaluated samples.
        test_landmarks: Samples to predict on. When omitted, the notebook-level
            ``train_data.landmarks_test`` is used, which is what this function
            always did before the parameter existed.
    """
    if test_landmarks is None:
        # Backward-compatible default: fall back to the global test split.
        test_landmarks = train_data.landmarks_test
    predictions = model.predict(test_landmarks)
    print(classification_report(target_labels, predictions))
    print(confusion_matrix(target_labels, predictions))


In [4]:
# Binary targets: compare every label against 'A' for both splits.
train_a_labels = train_data.labels_train == 'A'
test_a_labels = train_data.labels_test == 'A'

# Fit the SGD classifier on the binary task and report on the test split.
sgd_a_or_not_a = train_hotdog_or_not_hotdog_as_SDG(
    train_data.landmarks_train,
    train_a_labels,
)
print_classification_report_hotdog(sgd_a_or_not_a, test_a_labels)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00      9611
        True       0.95      0.99      0.97       350

    accuracy                           1.00      9961
   macro avg       0.97      0.99      0.98      9961
weighted avg       1.00      1.00      1.00      9961

[[9592   19]
 [   3  347]]


##### U or not U

In [5]:
# Binary targets: compare every label against 'U' for both splits.
# train_u_labels / test_u_labels are reused by the later SVM cells.
train_u_labels = train_data.labels_train == 'U'
test_u_labels = train_data.labels_test == 'U'

u_or_not = train_hotdog_or_not_hotdog_as_SDG(
    train_data.landmarks_train,
    train_u_labels,
)
print_classification_report_hotdog(u_or_not, test_u_labels)

              precision    recall  f1-score   support

       False       0.96      1.00      0.98      9534
        True       0.75      0.01      0.01       427

    accuracy                           0.96      9961
   macro avg       0.85      0.50      0.50      9961
weighted avg       0.95      0.96      0.94      9961

[[9533    1]
 [ 424    3]]


In [20]:
# Expands the features with PolynomialFeatures before the linear SVM - SLOW.
# Needs train_u_labels / test_u_labels from the "U or not U" cell.
u_or_not_linear_svm = train_hotdog_or_not_hotdog_as_LinearSVM(
    train_data.landmarks_train,
    train_u_labels,
    scaler="polynomial",
)
print_classification_report_hotdog(u_or_not_linear_svm, test_u_labels)




              precision    recall  f1-score   support

       False       1.00      1.00      1.00      9534
        True       0.97      0.97      0.97       427

    accuracy                           1.00      9961
   macro avg       0.99      0.99      0.99      9961
weighted avg       1.00      1.00      1.00      9961

[[9522   12]
 [  12  415]]


In [28]:
# Uses the *kernel trick*: an SVC with a polynomial kernel instead of explicit
# polynomial features. Needs train_u_labels / test_u_labels from the "U or not U" cell.
u_or_not_svc = train_hotdog_or_not_hotdog_as_SVC(
    train_data.landmarks_train,
    train_u_labels,
    degree=10,
)
print_classification_report_hotdog(u_or_not_svc, test_u_labels)

              precision    recall  f1-score   support

       False       1.00      1.00      1.00      9534
        True       0.98      0.99      0.99       427

    accuracy                           1.00      9961
   macro avg       0.99      1.00      0.99      9961
weighted avg       1.00      1.00      1.00      9961

[[9525    9]
 [   3  424]]


##### SVC multiclass

In [31]:
# Same polynomial-kernel SVC pipeline, but trained on the full label set
# instead of a single one-vs-rest target.
svc_alph_clf = train_hotdog_or_not_hotdog_as_SVC(
    train_data.landmarks_train,
    train_data.labels_train,
    degree=10,
)

# Evaluate on the held-out split.
svc_predictions = svc_alph_clf.predict(train_data.landmarks_test)
cr = classification_report(train_data.labels_test, svc_predictions)
print(cr)

              precision    recall  f1-score   support

           A       1.00      1.00      1.00       350
           B       1.00      1.00      1.00       364
           C       1.00      1.00      1.00       306
           D       0.99      1.00      1.00       366
           E       0.99      1.00      0.99       365
           F       1.00      1.00      1.00       487
           G       1.00      1.00      1.00       360
           H       1.00      0.99      1.00       389
           I       0.99      0.99      0.99       354
           J       1.00      1.00      1.00       405
           K       1.00      1.00      1.00       437
           L       1.00      1.00      1.00       386
           M       0.98      0.94      0.96       206
           N       0.91      0.96      0.93       157
           O       1.00      0.99      1.00       373
           P       0.99      1.00      1.00       333
           Q       0.99      0.99      0.99       319
           R       0.99    

#### Softmax Multiclass - big boi

In [9]:
from sklearn.linear_model import LogisticRegression
# One LogisticRegression model over all labels; max_iter is raised to 10_000.
softmax_reg = LogisticRegression(max_iter=10_000, random_state=42)
# Kept as the last expression so the cell still displays the fitted estimator.
softmax_reg.fit(train_data.landmarks_train, train_data.labels_train)

In [10]:
# Evaluate the softmax model on the held-out split; the report rows are named
# after the classes the fitted model exposes.
predictions = softmax_reg.predict(train_data.landmarks_test)
cr = classification_report(
    train_data.labels_test,
    predictions,
    target_names=softmax_reg.classes_,
)
print(cr)

              precision    recall  f1-score   support

           A       0.98      0.99      0.99       350
           B       0.98      1.00      0.99       364
           C       1.00      0.99      1.00       306
           D       0.99      0.99      0.99       366
           E       0.99      1.00      0.99       365
           F       0.99      1.00      0.99       487
           G       0.99      0.99      0.99       360
           H       0.99      0.99      0.99       389
           I       0.97      0.97      0.97       354
           J       0.98      0.98      0.98       405
           K       0.99      0.99      0.99       437
           L       1.00      0.99      1.00       386
           M       0.96      0.93      0.95       206
           N       0.97      0.92      0.94       157
           O       0.98      1.00      0.99       373
           P       0.97      0.95      0.96       333
           Q       0.94      0.98      0.96       319
           R       0.98    

##### Softmax Multiclass - finding optimal hyperparams

Iteratively try looking for better hyperparameters.
You can set the "scoring" parameter of cross_val_score to any of [these - sklearn docs](https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter)

In [11]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import classification_report

def _warn_classes_below_threshold(report: dict, C: float, threshold: float) -> None:
    """Print one line for every entry of ``report`` whose f1-score is under ``threshold``."""
    for key, value in report.items():
        # The "accuracy" entry is skipped; only the other entries are read via 'f1-score'.
        if key == "accuracy":
            continue
        key_score = value['f1-score']
        if key_score < threshold:
            print(f"Tried C={C} but key='{key}' was less than threshold f1:{key_score}")


def __find_helper(C: float, best, max_iter: int, cv: int, threshold: float, step: int,
                  bestC=None):
    """Scan ``max_iter`` values of C (``C``, ``C + step``, ...) for the best CV score.

    Every candidate is scored with ``cross_val_score`` (``f1_weighted``) on the
    notebook-level ``train_data.landmarks_train`` / ``train_data.labels_train``.
    A candidate replaces the current best only when its mean score is strictly
    higher.

    Args:
        C: First candidate value.
        best: Best mean score found so far, or ``None`` if there is none yet.
        max_iter: Number of candidates to visit.
        cv: Number of cross-validation folds.
        threshold: Entries of the classification report with an f1-score under
            this value are printed as warnings. NOTE(review): this only
            warns - it does not reject the candidate, exactly as before.
            Confirm whether rejection was intended.
        step: Increment between consecutive candidates.
        bestC: The C that produced ``best``, if known. It is returned unchanged
            when no candidate improves on ``best``, and it is not re-evaluated.

    Returns:
        ``(bestC, best)``. ``bestC`` is ``None`` only when no candidate was
        evaluated and none was passed in.
    """
    for _ in range(max_iter):
        if C <= 0:
            # LogisticRegression only accepts a positive C; skip instead of crashing.
            print(f"Skipping C={C}: C must be positive")
        elif bestC is not None and C == bestC:
            # This value already produced the current best score.
            print(f"Skipping C={C}: already evaluated")
        else:
            # cross_val_score / cross_val_predict clone and fit the estimator
            # per fold, so no separate fit() on the full training set is needed.
            softmax_reg = LogisticRegression(random_state=42, max_iter=10_000, C=C)
            score_mean = cross_val_score(softmax_reg,
                                         train_data.landmarks_train,
                                         train_data.labels_train,
                                         cv=cv,
                                         scoring="f1_weighted").mean()

            if best is None or score_mean > best:
                # Look at the per-class picture before accepting the candidate.
                y_pred = cross_val_predict(softmax_reg,
                                           train_data.landmarks_train,
                                           train_data.labels_train,
                                           cv=cv)
                report = classification_report(train_data.labels_train,
                                               y_pred,
                                               output_dict=True)
                if not isinstance(report, dict):
                    # Candidate is not accepted, but the scan still advances
                    # (previously this path retried the same C forever).
                    print("Sklearn returned a report in string format??")
                else:
                    _warn_classes_below_threshold(report, C, threshold)
                    best = score_mean
                    bestC = C
                    print("Updated best C=", bestC, " with score=", best)
        C = C + step
        print(f"new C {C}")
    return bestC, best


def find_softmax_optimal_C(C = 1,
                           max_iter = 10,
                           cv = 3,
                           step = 5,
                           threshold = 0.9,
                           find_perfect = False) -> "float | None":
    """Search for the ``LogisticRegression`` C with the best cross-validated score.

    A coarse pass visits ``C, C + step, ...`` for ``max_iter`` values. With
    ``find_perfect=True`` a second pass then visits every unit step strictly
    between the coarse neighbours of the winner
    (``bestC - step + 1 ... bestC + step - 1``).

    Args:
        C: First candidate of the coarse pass.
        max_iter: Number of candidates in the coarse pass.
        cv: Number of cross-validation folds.
        step: Increment of the coarse pass (an integer, it also sizes the
            refinement pass).
        threshold: f1-score under which a report entry is printed as a warning.
        find_perfect: Whether to run the unit-step refinement pass.

    Returns:
        The best C found, or ``None`` when no candidate could be evaluated
        (for example ``max_iter=0`` or only non-positive candidates).
    """
    bestC, best = __find_helper(C, None, max_iter, cv, threshold, step)
    # step == 1 leaves nothing between the coarse neighbours to refine.
    if find_perfect and bestC is not None and step > 1:
        bestC, best = __find_helper(bestC - step + 1, best, 2 * step - 1,
                                    cv, threshold, 1, bestC=bestC)

    return bestC
    
