### Imports

In [1]:
import data
import os
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.base import clone

### Dataset Extraction

In [2]:
doodle_map, X, y = data.auto_load(size=1000, n_files=50)

In [3]:
X.shape

(50000, 784)

In [4]:
y.shape

(50000,)

In [5]:
len(doodle_map)

50

### Split Test Train

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=0)

### Types of Model
#### Logistic Regression

In [7]:
#model = LogisticRegression(multi_class='ovr', solver='lbfgs')
#model.fit(X_train, y_train)

In [8]:
#train_pred = model.predict(X_train)
#print(f'Training Accuracy: {accuracy_score(y_train, train_pred) * 100}%')

In [9]:
#test_pred = model.predict(X_test)
#print(f'Training Accuracy: {accuracy_score(y_test, test_pred) * 100}%')

In [10]:
def learning_curve(model, X_train, X_test, y_train, y_test):
    train_scores = []
    test_scores = []
    
    for i in np.arange(0.1, 1.1, 0.1):
        train_size = int(len(X_train) * i)
        X_train_ = X_train[:train_size]
        y_train_ = y_train[:train_size]
        
        model_ = clone(model)
        model_.fit(X_train_, y_train_)
        train_pred = model_.predict(X_train_)
        test_pred = model_.predict(X_test)
        
        train_acc = accuracy_score(y_train_, train_pred)
        test_acc = accuracy_score(y_test, test_pred)
        
        train_scores.append(train_acc)
        test_scores.append(test_acc)
        
    plt.plot(train_scores, label='Train')
    plt.plot(test_scores, label='Test')
    plt.title('Learning Curve')
    plt.xlabel('Accuracy')
    plt.ylabel('% of Training Size')
    plt.xticks(range(10), (np.arange(0.1, 1.1, 0.1) * 100).astype(int))
    plt.ylim(0.0, 1.01)
    plt.legend()
    
    print(f'Final Training Accuracy: {train_scores[-1] * 100}%')
    print(f'Final Testing Accuracy: {test_scores[-1] * 100}%')
    

In [11]:
#learning_curve(LogisticRegression(multi_class='ovr', solver='lbfgs', penalty='l2'), X_train, X_test, y_train, y_test)

#### Random Forest Classifier

In [12]:
model = RandomForestClassifier(n_estimators=50, n_jobs=-1)
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [13]:
train_pred = model.predict(X_train)
print(f'Training Accuracy: {accuracy_score(y_train, train_pred) * 100}%')

Training Accuracy: 99.99142857142857%


In [14]:
test_pred = model.predict(X_test)
print(f'Training Accuracy: {accuracy_score(y_test, test_pred) * 100}%')

Training Accuracy: 48.9%


In [15]:
learning_curve(model, X_train, X_test, y_train, y_test)

KeyboardInterrupt: 

## Support Vector Classifier

In [None]:
#svc = SVC(gamma='auto')
#svc.fit(X_train, y_train)

In [None]:
#train_pred = svc.predict(X_train)
#print(f'Training Accuracy: {accuracy_score(y_train, train_pred) * 100}%')

In [None]:
#test_pred = svc.predict(X_test)
#print(f'Training Accuracy: {accuracy_score(y_test, test_pred) * 100}%')

In [None]:
X_test.shape

In [None]:
y_test.shape