### Introduction
I know most participants in the competition are using a deep learning approach, which makes sense given the size of the data and the type of challenge. However, I was curious what could be achieved with traditional machine learning classifiers such as XGBoost, RandomForest, etc. on the aggregated dataset we made earlier.

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn import dummy, metrics, model_selection, preprocessing
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier

warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# See the 'EDA & Baseline Model.ipynb' notebook for how this CSV was constructed.
# NOTE(review): absolute, machine-specific Windows path -- adjust before running elsewhere.
train_df = pd.read_csv(r"C:\repo\math4920\ASLKaggleProject\asl-signs\extended_train.csv")

In [3]:
# List each column's dtype to spot the non-numeric columns that need handling.
train_df.dtypes

path                       object
participant_id              int64
sequence_id                 int64
sign                       object
start_frame                 int64
end_frame                   int64
total_frames                int64
face_count                  int64
face_nan_count              int64
pose_count                  int64
pose_nan_count              int64
left_hand_count             int64
left_hand_nan_count         int64
right_hand_count            int64
right_hand_nan_count        int64
x_min                     float64
x_max                     float64
y_min                     float64
y_max                     float64
z_min                     float64
z_max                     float64
face_appears_pct          float64
face_nan_pct              float64
left_hand_appears_pct     float64
left_hand_nan_pct         float64
pose_appears_pct          float64
pose_nan_pct              float64
right_hand_appears_pct    float64
right_hand_nan_pct        float64
dtype: object

In [4]:
def clean_df(df):
    """Return a copy of ``df`` prepared for modelling.

    The identifier columns (``path``, ``participant_id``, ``sequence_id``)
    are dropped and the ``sign`` label is replaced by integer codes.  The
    input frame is left untouched because ``drop`` returns a new frame.

    The fitted ``LabelEncoder`` is not returned, so the integer codes cannot
    be mapped back to sign names from this function's result alone.
    """
    identifier_columns = ['path', 'participant_id', 'sequence_id']
    cleaned = df.drop(columns=identifier_columns)

    # Handle the remaining object column: cast the label to a categorical,
    # then encode it as integers.
    cleaned["sign"] = cleaned["sign"].astype("category")
    cleaned['sign'] = LabelEncoder().fit_transform(cleaned['sign'])
    return cleaned


def get_train_test_X_y(df, size=.9):
    """Split ``df`` into standardized train/test features and labels.

    The scaler is fit on the training rows only and then applied to the
    held-out rows; standardizing (or imputing) on the whole dataset would
    leak information from the held-out rows into training.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``sign`` label column; every other column is
        treated as a numeric feature.
    size : float, default 0.9
        Forwarded to ``train_test_split`` as ``test_size``.
        NOTE(review): with the default this holds out 90% of the rows and
        trains on only 10%.  If a 90/10 train/test split was intended this
        should be passed as ``train_size`` instead -- confirm before changing,
        since it alters every downstream score.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardized features, keeping the original column names and row
        index of each split.
    y_train, y_test : pandas.Series
        The matching ``sign`` labels.
    """
    y = df["sign"]
    X = df.drop(columns="sign")
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=size, random_state=42, stratify=y)

    cols = X.columns
    std = preprocessing.StandardScaler()

    # Build fresh frames instead of assigning through ``.loc``: the features
    # are integer columns of frames derived from the split, so writing scaled
    # floats into them in place is a dtype-incompatible assignment on a
    # possible copy (deprecated in recent pandas).
    X_train = pd.DataFrame(
        std.fit_transform(X_train), columns=cols, index=X_train.index)
    X_test = pd.DataFrame(
        std.transform(X_test), columns=cols, index=X_test.index)

    return X_train, X_test, y_train, y_test
    
# Drop the identifier columns and encode the labels, then split and
# standardize using the function's default split size.
train_df = clean_df(train_df)
X_train, X_test, y_train, y_test = get_train_test_X_y(train_df)

# Re-join the already standardized splits (train rows first) so the
# cross-validation further down can run over every row.
tX = pd.concat([X_train, X_test])
ty = pd.concat([y_train, y_test])

In [5]:
# Baseline: score a dummy classifier to get the floor that any real model
# has to beat.
dc = DummyClassifier(random_state=42)
dc.fit(X=X_train, y=y_train)
dc.score(X=X_test, y=y_test)
# The accuracy is close to zero -- not good, but it leaves lots of room
# for improvement.

0.004339644831236034

In [6]:
# Count how many training rows each encoded sign has, to check class balance.
y_train.value_counts()

60     41
148    41
136    41
135    41
194    41
       ..
21     31
56     31
231    31
170    31
249    30
Name: sign, Length: 250, dtype: int64

In [7]:
# Compare several model families, each with its default hyper-parameters, to
# see whether one of them is particularly well suited to this problem.
# Warning: this cell takes a while to run.
candidates = (
    xgb.XGBClassifier,
    DecisionTreeClassifier,
    KNeighborsClassifier,
    GaussianNB,
    SGDClassifier,
    RandomForestClassifier,
)
# The same 3-fold splitter is reused for every candidate.
kfold = model_selection.KFold(n_splits=3)

out = []
for model in candidates:
    estimator = model()
    scores = model_selection.cross_val_score(
        estimator, tX, ty, cv=kfold, scoring='accuracy')
    out.append(f'{estimator.__class__.__name__:23}  Accuracy: {scores.mean():.3f}  STD: {scores.std():.2f}')

# Print the summary only after every model has finished.
print("\n".join(out))
# Every family scores only marginally above the dummy baseline, so this is
# clearly not the right approach for this competition.  RandomForest ties
# with XGBoost for the best score; we continue with RandomForest to see how
# far tuning can take it.

XGBClassifier            Accuracy: 0.019  STD: 0.00
DecisionTreeClassifier   Accuracy: 0.012  STD: 0.00
KNeighborsClassifier     Accuracy: 0.014  STD: 0.00
GaussianNB               Accuracy: 0.005  STD: 0.00
SGDClassifier            Accuracy: 0.005  STD: 0.00
RandomForestClassifier   Accuracy: 0.019  STD: 0.00


In [13]:
# This cell will also take a while to run.

# Candidate hyper-parameter values the randomized search samples from.
pg = {'random_state': [42],
      'min_samples_split': [3, 10, 30],
      'n_estimators': [100, 300, 1000],
      'max_depth': [None, 2, 5, 10],
      'max_features': [3, 5, 10, 20]
     }

# Untuned forest used as the reference ("ORIG") score.  It is seeded like the
# rest of the notebook so that the comparison below is reproducible.
clf = RandomForestClassifier(n_jobs=-1, random_state=42)
clf.fit(X_train, y_train)

# RandomizedSearchCV only samples a subset of the combinations in ``pg``
# (n_iter defaults to 10); seeding it makes every run evaluate the same
# candidates, so PARAMS / NEW do not change from run to run.
gs = model_selection.RandomizedSearchCV(clf, param_distributions=pg, scoring='accuracy',
                                  n_jobs=-1,
                                  cv=3,
                                  random_state=42)
gs.fit(X_train, y_train)

print("PARAMS", gs.best_params_)
print("ORIG", clf.score(X_test, y_test), "NEW", gs.score(X_test, y_test))

PARAMS {'random_state': 42, 'n_estimators': 1000, 'min_samples_split': 3, 'max_features': 3, 'max_depth': 10}
ORIG 0.013689286134305539 NEW 0.014512524991179583


In [14]:
# Parameters found by a much longer grid-search run; even with these the
# performance is still abysmal.
clf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=None,
    max_features=20,
    min_samples_split=30,
    random_state=42,
    n_jobs=-1,
)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.014888862754322003

It looks like there is just too much information loss in this aggregated-feature approach, and the models can't distinguish between the signs.