In [2]:
#importing packages
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
import time

In [3]:
#importing data
with open('data_full.json') as file:
    oos = json.load(file)

In [4]:
#exploring files
oos.keys()

dict_keys(['oos_val', 'val', 'train', 'oos_test', 'test', 'oos_train'])

In [5]:
#extracting out of scope validation set
temp = oos['oos_val']
oos_val = pd.DataFrame(temp).rename(columns={0:"query", 1:"domain"})

In [6]:
#extracting validation set
temp = oos['val']
val = pd.DataFrame(temp).rename(columns={0:"query", 1:"domain"})

In [7]:
#extracting training set
temp = oos['train']
train = pd.DataFrame(temp).rename(columns={0:"query", 1:"domain"})

In [8]:
#extracting out of scope test set
temp = oos['oos_test']
oos_test = pd.DataFrame(temp).rename(columns={0:"query", 1:"domain"})

In [9]:
#extracting test set
temp = oos['test']
test = pd.DataFrame(temp).rename(columns={0:"query", 1:"domain"})

In [10]:
#extracting out of scope training set
temp = oos['oos_train']
oos_train = pd.DataFrame(temp).rename(columns={0:"query", 1:"domain"})

In [11]:
#checking first few rows of each set
print(oos_val.head())
print(val.head())
print(train.head())
print(oos_test.head())
print(test.head())
print(oos_train.head())


                                               query domain
1                                 a show on broadway    oos
2                 who has the best record in the nfl    oos
3                 how do i find the area of a circle    oos
4                  how many onions do i have on hand    oos
                                       query     domain
0   in spanish, meet me tomorrow is said how  translate
1     in french, how do i say, see you later  translate
2           how do you say hello in japanese  translate
3  how do i ask about the weather in chinese  translate
4  how can i say "cancel my order" in french  translate
                                               query     domain
0  what expression would i use to say i love you ...  translate
1  can you tell me how to say 'i do not speak muc...  translate
2  what is the equivalent of, 'life is good' in f...  translate
3  tell me how to say, 'it is a beautiful morning...  translate
4  if i were mongolian, how would i say that

In [12]:
#combining in scope and out of scope training data into one dataset for our training protocol
train = train.append(oos_train, ignore_index=True)

In [13]:
#defining our encoding vectorizer
vectorizer = TfidfVectorizer()

In [14]:
#fitting and transforming the vectorizer to the training data queries
X = vectorizer.fit_transform(train['query']).toarray()

In [15]:
#checking the dimensions of the training vectorised dataset
X.shape

(15100, 5146)

In [16]:
#having a look at the first few features
print(vectorizer.get_feature_names()[:5])

['00', '000', '005', '00am', '00pm']


In [17]:
#changing the array to a dataframe
X_df = pd.DataFrame(X)

In [18]:
#checking the shape of the vector dataframe
X_df.shape

(15100, 5146)

In [19]:
#checking the shape of the unprocessed training data
train.shape

(15100, 2)

In [20]:
#joining the vectorized dataframe to the training data
train_vec = train.join(X_df)

In [21]:
#checking the dimensions of the combined dataframe
train_vec.shape

(15100, 5148)

In [22]:
#inspecting the first few rows of the dataframe
train_vec.head()

Unnamed: 0,query,domain,0,1,2,3,4,5,6,7,...,5136,5137,5138,5139,5140,5141,5142,5143,5144,5145
0,what expression would i use to say i love you ...,translate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,can you tell me how to say 'i do not speak muc...,translate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"what is the equivalent of, 'life is good' in f...",translate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"tell me how to say, 'it is a beautiful morning...",translate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"if i were mongolian, how would i say that i am...",translate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
#removing the text query from the dataframe
train_vec = train_vec.drop('query', axis=1)

In [24]:
#checking the values of a randomly chosen column to check contents
train_vec[5136].value_counts().head(10)

0.000000    15095
0.744097        1
0.731775        1
0.694293        1
0.577886        1
0.698728        1
Name: 5136, dtype: int64

In [25]:
#importing further packages
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

In [26]:
#checking all of the possible domains that exist within the dataset
train_vec['domain'].unique()

array(['translate', 'transfer', 'timer', 'definition', 'meaning_of_life',
       'insurance_change', 'find_phone', 'travel_alert', 'pto_request',
       'improve_credit_score', 'fun_fact', 'change_language', 'payday',
       'replacement_card_duration', 'time', 'application_status',
       'flight_status', 'flip_coin', 'change_user_name',
       'where_are_you_from', 'shopping_list_update', 'what_can_i_ask_you',
       'maybe', 'oil_change_how', 'restaurant_reservation', 'balance',
       'confirm_reservation', 'freeze_account', 'rollover_401k',
       'who_made_you', 'distance', 'user_name', 'timezone', 'next_song',
       'transactions', 'restaurant_suggestion', 'rewards_balance',
       'pay_bill', 'spending_history', 'pto_request_status',
       'credit_score', 'new_card', 'lost_luggage', 'repeat', 'mpg',
       'oil_change_when', 'yes', 'travel_suggestion', 'insurance',
       'todo_list_update', 'reminder', 'change_speed', 'tire_pressure',
       'no', 'apr', 'nutrition_info', 'c

In [27]:
#creating a mapping dictionary to allow for numeric rather than descriptive classes
y_dic = {}
domain = 0
for item in train_vec['domain'].unique():
    y_dic[item] = domain
    domain += 1

In [28]:
#shuffling the training data so the classes are not in order
train_vec = shuffle(train_vec, random_state=0)
train_vec.head()

Unnamed: 0,domain,0,1,2,3,4,5,6,7,8,...,5136,5137,5138,5139,5140,5141,5142,5143,5144,5145
8218,do_you_have_pets,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8136,routing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
577,insurance_change,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7446,todo_list,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3978,pto_request_status,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
#creating a numeric target column for training data using the mapping dictionary, defining the input data and targets
train_vec['y'] = train_vec['domain'].replace(y_dic)
train_x = train_vec.iloc[:, 1:-1]
train_y = train_vec['y']

In [30]:
#fitting the vectorizer to the validation datasets
Xv = vectorizer.transform(val['query']).toarray()
Xvo = vectorizer.transform(oos_val['query']).toarray()

In [31]:
#creating dataframes from the arrays of the vectorized validation data
Xv_df = pd.DataFrame(Xv)
Xvo_df = pd.DataFrame(Xvo)

In [32]:
#joining the validation vectors to the initial data, dropping the text query, shuffling, mapping the intents to numeric classes, and defining inputs and labels
val_vec = pd.concat([val, Xv_df], axis=1)
val_vec = val_vec.drop('query', axis=1)
val_vec = shuffle(val_vec, random_state=0)
val_vec['y'] = val_vec['domain'].replace(y_dic)
val_x = val_vec.iloc[:, 1:-1]
val_y = val_vec['y']

In [33]:
#joining the out of scope validation vectors to the initial data, dropping the text query, shuffling, mapping the intents to numeric classes, and defining inputs and targets
val_oos_vec = pd.concat([oos_val, Xvo_df], axis=1)
val_oos_vec = val_oos_vec.drop('query', axis=1)
val_oos_vec = shuffle(val_oos_vec, random_state=0)
val_oos_vec['y'] = val_oos_vec['domain'].replace(y_dic)
val_oos_x = val_oos_vec.iloc[:, 1:-1]
val_oos_y = val_oos_vec['y']

Initial work will investigate the most effective kernel to use, before optimizing other parameters.

In [34]:
print('LinearSVC')
t0 = time.time() # timer
clf = LinearSVC(random_state=0, tol=0.1) #LinearSVC - one vs rest SVC with linear kernel; random state 0 for reproducibility, loose tolerance for faster fitting
clf.fit(train_x, train_y) #fitting classifier to training data
t1 = time.time() #timer
labels = clf.predict(train_x) #re-predicting training data
train_score = accuracy_score(train_y, labels) #computing classification accuracy score for training data
t2 = time.time() #timer
val_lab = clf.predict(val_x) #predicting validation data
t3 = time.time() #timer
val_score = accuracy_score(val_y, val_lab) #calculating classification accuracy score for validation data
t4 = time.time() #timer
val_oos_lab = clf.predict(val_oos_x) #predicting out of scope validation data
t5 = time.time() #timer
oos_val_score = accuracy_score(val_oos_y, val_oos_lab) #calculating classification accuracy score for OOS val data
print("Training Accuracy")
print(train_score)
print("Validation Accuracy (in scope)")
print(val_score)
print("Validation Accuracy (oos)")
print(oos_val_score)
print("Train Time")
print(t1-t0) #training time
print("Predict Time")
print(t3-t2) #validation prediction time
print("OOS Predict Time")
print(t5-t4) #out of scope validation prediction time

LinearSVC
Training Accuracy
0.9963576158940397
Validation Accuracy (in scope)
0.9086666666666666
Validation Accuracy (oos)
0.23
Train Time
2.703575849533081
Predict Time
0.04864311218261719
OOS Predict Time
0.02647995948791504


In [38]:
kernel_list = ['linear', 'rbf', 'poly', 'sigmoid'] #possible kernels to try
for kernel in kernel_list: #looping through kernels
    print(kernel)
    t0 = time.time()
    clf = SVC(kernel=kernel, random_state=0, tol=0.1) #changing kernel
    clf.fit(train_x, train_y)
    t1 = time.time()
    labels = clf.predict(train_x)
    train_score = accuracy_score(train_y, labels)
    t2 = time.time()
    val_lab = clf.predict(val_x)
    t3 = time.time()
    val_score = accuracy_score(val_y, val_lab)
    t4 = time.time()
    val_oos_lab = clf.predict(val_oos_x)
    t5 = time.time()
    oos_val_score = accuracy_score(val_oos_y, val_oos_lab)
    print("Training Accuracy")
    print(train_score) 
    print("Validation Accuracy (in scope)")
    print(val_score)
    print("Validation Accuracy (oos)")
    print(oos_val_score)
    print("Train Time")
    print(t1-t0)
    print("Predict Time")
    print(t3-t2)
    print("OOS Predict Time")
    print(t5-t4)

linear
Training Accuracy
0.9916556291390729
Validation Accuracy (in scope)
0.908
Validation Accuracy (oos)
0.45
Train Time
851.4122288227081
Predict Time
218.7043559551239
OOS Predict Time
7.691932916641235
rbf
Training Accuracy
0.9968211920529801
Validation Accuracy (in scope)
0.8943333333333333
Validation Accuracy (oos)
0.64
Train Time
1329.334305047989
Predict Time
235.85941195487976
OOS Predict Time
7.808900833129883
poly
Training Accuracy
0.9979470198675496
Validation Accuracy (in scope)
0.7813333333333333
Validation Accuracy (oos)
0.78
Train Time
1761.1285581588745
Predict Time
1319.7652490139008
OOS Predict Time
33.740272998809814
sigmoid
Training Accuracy
0.9802649006622517
Validation Accuracy (in scope)
0.903
Validation Accuracy (oos)
0.45
Train Time
10707.45944595337
Predict Time
3652.0696852207184
OOS Predict Time
312.530179977417


Linear SVC (one-vs-rest) is several orders of magnitude faster than any other basis function. Of the others, polynomial and radial basis function are much less accurate. Sigmoid and linear (one-vs-one) are comparable, but much more computationally intensive. Sigmoid is ~10x slower than linear (one-vs-one) and slightly less accurate. Both of the linear models will be taken forward: one-vs-one is slightly more accurate, but much more costly. While the differences in complexity are inherent, differences in accuracy may increase with tuning.

In [None]:
regularization = [1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4] #attempting various values for c
for c in regularization:
    print('LinearSVC - Hinge Squared')
    print(c)
    t0 = time.time()
    clf = LinearSVC(random_state=0, tol=0.1, C=c, max_iter=1000000) #one-vs-rest SVM with hinge-squared loss
    clf.fit(train_x, train_y)
    t1 = time.time()
    labels = clf.predict(train_x)
    train_score = accuracy_score(train_y, labels)
    t2 = time.time()
    val_lab = clf.predict(val_x)
    t3 = time.time()
    val_score = accuracy_score(val_y, val_lab)
    t4 = time.time()
    val_oos_lab = clf.predict(val_oos_x)
    t5 = time.time()
    oos_val_score = accuracy_score(val_oos_y, val_oos_lab)
    print("Training Accuracy")
    print(train_score) 
    print("Validation Accuracy (in scope)")
    print(val_score)
    print("Validation Accuracy (oos)")
    print(oos_val_score)
    print("Validation Accuracy (TOTAL)")
    print(((150*val_score) + oos_val_score)/151) #calculating overall validation accuracy
    print("Train Time")
    print(t1-t0)
    print("Predict Time")
    print(t3-t2)
    print("OOS Predict Time")
    print(t5-t4)
    
regularization = [1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]
for c in regularization:    
    print('LinearSVC - Hinge')
    print(c)
    t0 = time.time()
    clf = LinearSVC(random_state=0, tol=0.1, C=c, loss='hinge', max_iter=1000000) #as above but with hinge loss
    clf.fit(train_x, train_y)
    t1 = time.time()
    labels = clf.predict(train_x)
    train_score = accuracy_score(train_y, labels)
    t2 = time.time()
    val_lab = clf.predict(val_x)
    t3 = time.time()
    val_score = accuracy_score(val_y, val_lab)
    t4 = time.time()
    val_oos_lab = clf.predict(val_oos_x)
    t5 = time.time()
    oos_val_score = accuracy_score(val_oos_y, val_oos_lab)
    print("Training Accuracy")
    print(train_score) 
    print("Validation Accuracy (in scope)")
    print(val_score)
    print("Validation Accuracy (oos)")
    print(oos_val_score)
    print("Validation Accuracy (TOTAL)")
    print(((150*val_score) + oos_val_score)/151)
    print("Train Time")
    print(t1-t0)
    print("Predict Time")
    print(t3-t2)
    print("OOS Predict Time")
    print(t5-t4)
    
regularization = [1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]
for c in regularization:
    print('SVC - Linear Kernel')
    print(c)
    t0 = time.time()
    clf = SVC(kernel='linear', random_state=0, tol=0.1, C=c, max_iter=1000000) #one-vs-one
    clf.fit(train_x, train_y)
    t1 = time.time()
    labels = clf.predict(train_x)
    train_score = accuracy_score(train_y, labels)
    t2 = time.time()
    val_lab = clf.predict(val_x)
    t3 = time.time()
    val_score = accuracy_score(val_y, val_lab)
    t4 = time.time()
    val_oos_lab = clf.predict(val_oos_x)
    t5 = time.time()
    oos_val_score = accuracy_score(val_oos_y, val_oos_lab)
    print("Training Accuracy")
    print(train_score) 
    print("Validation Accuracy (in scope)")
    print(val_score)
    print("Validation Accuracy (oos)")
    print(oos_val_score)
    print("Validation Accuracy (TOTAL)")
    print(((150*val_score) + oos_val_score)/151)
    print("Train Time")
    print(t1-t0)
    print("Predict Time")
    print(t3-t2)
    print("OOS Predict Time")
    print(t5-t4)

LinearSVC - Hinge Squared
0.0001
Training Accuracy
0.8449006622516556
Validation Accuracy (in scope)
0.7753333333333333
Validation Accuracy (oos)
0.0
Validation Accuracy (TOTAL)
0.7701986754966887
Train Time
2.504005193710327
Predict Time
0.056551218032836914
OOS Predict Time
0.03090500831604004
LinearSVC - Hinge Squared
0.001
Training Accuracy
0.8505298013245033
Validation Accuracy (in scope)
0.786
Validation Accuracy (oos)
0.01
Validation Accuracy (TOTAL)
0.7808609271523179
Train Time
2.169395923614502
Predict Time
0.05410599708557129
OOS Predict Time
0.02574324607849121
LinearSVC - Hinge Squared
0.01
Training Accuracy
0.888476821192053
Validation Accuracy (in scope)
0.8153333333333334
Validation Accuracy (oos)
0.02
Validation Accuracy (TOTAL)
0.8100662251655628
Train Time
2.3255069255828857
Predict Time
0.05310511589050293
OOS Predict Time
0.02373480796813965
LinearSVC - Hinge Squared
0.1
Training Accuracy
0.9673509933774834
Validation Accuracy (in scope)
0.8806666666666667
Validati

NB results above stopped due to failed kernel, resumed below.

In [43]:
regularization = [1e-1, 1, 1e1, 1e2, 1e3, 1e4]
for c in regularization:
    print('SVC - Linear Kernel')
    print(c)
    t0 = time.time()
    clf = SVC(kernel='linear', random_state=0, tol=0.1, C=c, max_iter=1000000)
    clf.fit(train_x, train_y)
    t1 = time.time()
    labels = clf.predict(train_x)
    train_score = accuracy_score(train_y, labels)
    t2 = time.time()
    val_lab = clf.predict(val_x)
    t3 = time.time()
    val_score = accuracy_score(val_y, val_lab)
    t4 = time.time()
    val_oos_lab = clf.predict(val_oos_x)
    t5 = time.time()
    oos_val_score = accuracy_score(val_oos_y, val_oos_lab)
    print("Training Accuracy")
    print(train_score) 
    print("Validation Accuracy (in scope)")
    print(val_score)
    print("Validation Accuracy (oos)")
    print(oos_val_score)
    print("Validation Accuracy (TOTAL)")
    print(((150*val_score) + oos_val_score)/151)
    print("Train Time")
    print(t1-t0)
    print("Predict Time")
    print(t3-t2)
    print("OOS Predict Time")
    print(t5-t4)

SVC - Linear Kernel
0.1
Training Accuracy
0.8600662251655629
Validation Accuracy (in scope)
0.779
Validation Accuracy (oos)
0.79
Validation Accuracy (TOTAL)
0.7790728476821193
Train Time
1458.1589348316193
Predict Time
269.27534580230713
OOS Predict Time
9.01478886604309
SVC - Linear Kernel
1
Training Accuracy
0.9916556291390729
Validation Accuracy (in scope)
0.908
Validation Accuracy (oos)
0.45
Validation Accuracy (TOTAL)
0.9049668874172185
Train Time
743.2696630954742
Predict Time
212.80302906036377
OOS Predict Time
7.194847106933594
SVC - Linear Kernel
10.0
Training Accuracy
0.9988741721854305
Validation Accuracy (in scope)
0.914
Validation Accuracy (oos)
0.45
Validation Accuracy (TOTAL)
0.9109271523178807
Train Time
763.4368917942047
Predict Time
213.50160217285156
OOS Predict Time
7.155864000320435
SVC - Linear Kernel
100.0
Training Accuracy
0.9999337748344371
Validation Accuracy (in scope)
0.9086666666666666
Validation Accuracy (oos)
0.45
Validation Accuracy (TOTAL)
0.90562913907

One-vs-one best accuracy on validation so far (91.4% IS, 45% OOS)with C of 10, with one-vs-rest with hinge-squared slightly behind (90.9 IS%, 23% OOS) with C of 1. Fine-tuning for C by binary search:

In [44]:
regularization = [5, 15]
for c in regularization:
    print('SVC - Linear Kernel')
    print(c)
    t0 = time.time()
    clf = SVC(kernel='linear', random_state=0, tol=0.1, C=c, max_iter=1000000)
    clf.fit(train_x, train_y)
    t1 = time.time()
    labels = clf.predict(train_x)
    train_score = accuracy_score(train_y, labels)
    t2 = time.time()
    val_lab = clf.predict(val_x)
    t3 = time.time()
    val_score = accuracy_score(val_y, val_lab)
    t4 = time.time()
    val_oos_lab = clf.predict(val_oos_x)
    t5 = time.time()
    oos_val_score = accuracy_score(val_oos_y, val_oos_lab)
    print("Training Accuracy")
    print(train_score) 
    print("Validation Accuracy (in scope)")
    print(val_score)
    print("Validation Accuracy (oos)")
    print(oos_val_score)
    print("Validation Accuracy (TOTAL)")
    print(((150*val_score) + oos_val_score)/151)
    print("Train Time")
    print(t1-t0)
    print("Predict Time")
    print(t3-t2)
    print("OOS Predict Time")
    print(t5-t4)

SVC - Linear Kernel
5
Training Accuracy
0.9986754966887417
Validation Accuracy (in scope)
0.915
Validation Accuracy (oos)
0.45
Validation Accuracy (TOTAL)
0.9119205298013244
Train Time
4182.7618770599365
Predict Time
4175.032507181168
OOS Predict Time
7.579639911651611
SVC - Linear Kernel
15
Training Accuracy
0.9991390728476821
Validation Accuracy (in scope)
0.9123333333333333
Validation Accuracy (oos)
0.45
Validation Accuracy (TOTAL)
0.9092715231788079
Train Time
6624.609422206879
Predict Time
232.22509741783142
OOS Predict Time
8.61223578453064


Slightly better for C=5 for one-v-one

In [45]:
regularization = [2.5, 7.5]
for c in regularization:
    print('SVC - Linear Kernel')
    print(c)
    t0 = time.time()
    clf = SVC(kernel='linear', random_state=0, tol=0.1, C=c, max_iter=1000000)
    clf.fit(train_x, train_y)
    t1 = time.time()
    labels = clf.predict(train_x)
    train_score = accuracy_score(train_y, labels)
    t2 = time.time()
    val_lab = clf.predict(val_x)
    t3 = time.time()
    val_score = accuracy_score(val_y, val_lab)
    t4 = time.time()
    val_oos_lab = clf.predict(val_oos_x)
    t5 = time.time()
    oos_val_score = accuracy_score(val_oos_y, val_oos_lab)
    print("Training Accuracy")
    print(train_score) 
    print("Validation Accuracy (in scope)")
    print(val_score)
    print("Validation Accuracy (oos)")
    print(oos_val_score)
    print("Validation Accuracy (TOTAL)")
    print(((150*val_score) + oos_val_score)/151)
    print("Train Time")
    print(t1-t0)
    print("Predict Time")
    print(t3-t2)
    print("OOS Predict Time")
    print(t5-t4)

SVC - Linear Kernel
2.5
Training Accuracy
0.9970198675496689
Validation Accuracy (in scope)
0.9143333333333333
Validation Accuracy (oos)
0.45
Validation Accuracy (TOTAL)
0.9112582781456954
Train Time
916.6407608985901
Predict Time
221.45925879478455
OOS Predict Time
7.223144769668579
SVC - Linear Kernel
7.5
Training Accuracy
0.9987417218543047
Validation Accuracy (in scope)
0.9136666666666666
Validation Accuracy (oos)
0.45
Validation Accuracy (TOTAL)
0.910596026490066
Train Time
805.1982262134552
Predict Time
229.99253869056702
OOS Predict Time
7.201364994049072


Both worse. Best values for one-vs-one found for C=5: 91.5% IS, 45% OOS.

Tuning one-vs-rest

In [47]:
regularization = [0.5, 5]
for c in regularization:
    print('LinearSVC - Hinge Squared')
    print(c)
    t0 = time.time()
    clf = LinearSVC(random_state=0, tol=0.1, C=c, max_iter=1000000)
    clf.fit(train_x, train_y)
    t1 = time.time()
    labels = clf.predict(train_x)
    train_score = accuracy_score(train_y, labels)
    t2 = time.time()
    val_lab = clf.predict(val_x)
    t3 = time.time()
    val_score = accuracy_score(val_y, val_lab)
    t4 = time.time()
    val_oos_lab = clf.predict(val_oos_x)
    t5 = time.time()
    oos_val_score = accuracy_score(val_oos_y, val_oos_lab)
    print("Training Accuracy")
    print(train_score) 
    print("Validation Accuracy (in scope)")
    print(val_score)
    print("Validation Accuracy (oos)")
    print(oos_val_score)
    print("Validation Accuracy (TOTAL)")
    print(((150*val_score) + oos_val_score)/151)
    print("Train Time")
    print(t1-t0)
    print("Predict Time")
    print(t3-t2)
    print("OOS Predict Time")
    print(t5-t4)

LinearSVC - Hinge Squared
0.5
Training Accuracy
0.9925827814569537
Validation Accuracy (in scope)
0.9026666666666666
Validation Accuracy (oos)
0.19
Validation Accuracy (TOTAL)
0.8979470198675497
Train Time
2.502915859222412
Predict Time
0.04924798011779785
OOS Predict Time
0.02422189712524414
LinearSVC - Hinge Squared
5
Training Accuracy
0.9986754966887417
Validation Accuracy (in scope)
0.905
Validation Accuracy (oos)
0.22
Validation Accuracy (TOTAL)
0.9004635761589403
Train Time
2.645447254180908
Predict Time
0.04855489730834961
OOS Predict Time
0.024276018142700195


Best so far for 5, better for 2.5 or 7.5?

In [48]:
regularization = [2.5, 7.5]
for c in regularization:
    print('LinearSVC - Hinge Squared')
    print(c)
    t0 = time.time()
    clf = LinearSVC(random_state=0, tol=0.1, C=c, max_iter=1000000)
    clf.fit(train_x, train_y)
    t1 = time.time()
    labels = clf.predict(train_x)
    train_score = accuracy_score(train_y, labels)
    t2 = time.time()
    val_lab = clf.predict(val_x)
    t3 = time.time()
    val_score = accuracy_score(val_y, val_lab)
    t4 = time.time()
    val_oos_lab = clf.predict(val_oos_x)
    t5 = time.time()
    oos_val_score = accuracy_score(val_oos_y, val_oos_lab)
    print("Training Accuracy")
    print(train_score) 
    print("Validation Accuracy (in scope)")
    print(val_score)
    print("Validation Accuracy (oos)")
    print(oos_val_score)
    print("Validation Accuracy (TOTAL)")
    print(((150*val_score) + oos_val_score)/151)
    print("Train Time")
    print(t1-t0)
    print("Predict Time")
    print(t3-t2)
    print("OOS Predict Time")
    print(t5-t4)

LinearSVC - Hinge Squared
2.5
Training Accuracy
0.9981456953642384
Validation Accuracy (in scope)
0.91
Validation Accuracy (oos)
0.23
Validation Accuracy (TOTAL)
0.9054966887417217
Train Time
2.6298811435699463
Predict Time
0.04633212089538574
OOS Predict Time
0.022237062454223633
LinearSVC - Hinge Squared
7.5
Training Accuracy
0.9988741721854305
Validation Accuracy (in scope)
0.9056666666666666
Validation Accuracy (oos)
0.23
Validation Accuracy (TOTAL)
0.9011920529801324
Train Time
2.6541991233825684
Predict Time
0.045375823974609375
OOS Predict Time
0.022288084030151367


Best found for one-vs-rest: C=2.5, 91% IS, 23% OOS

This is 0.5% worse than one-vs-one in scope, and 22% on out-of-scope data; however it has taken orders of magnitude less time. It will therefore be taken forward as the best model for this particular application domain.

In [48]:
print('LinearSVC - Hinge Squared')
print('2.5')
t0 = time.time()
clf = LinearSVC(random_state=0, tol=0.1, C=2.25, max_iter=1000000, verbose=True)
clf.fit(train_x, train_y)
t1 = time.time()
labels = clf.predict(train_x)
train_score = accuracy_score(train_y, labels)
t2 = time.time()
val_lab = clf.predict(val_x)
t3 = time.time()
val_score = accuracy_score(val_y, val_lab)
t4 = time.time()
val_oos_lab = clf.predict(val_oos_x)
t5 = time.time()
oos_val_score = accuracy_score(val_oos_y, val_oos_lab)
print("Training Accuracy")
print(train_score) 
print("Validation Accuracy (in scope)")
print(val_score)
print("Validation Accuracy (oos)")
print(oos_val_score)
print("Validation Accuracy (TOTAL)")
print(((150*val_score) + oos_val_score)/151)
print("Train Time")
print(t1-t0)
print("Predict Time")
print(t3-t2)
print("OOS Predict Time")
print(t5-t4)

LinearSVC - Hinge Squared
2.5
[LibLinear]Training Accuracy
0.9978145695364239
Validation Accuracy (in scope)
0.9093333333333333
Validation Accuracy (oos)
0.23
Validation Accuracy (TOTAL)
0.9048344370860927
Train Time
3.221323013305664
Predict Time
0.05889105796813965
OOS Predict Time
0.02698802947998047


In [49]:
import pickle
with open('oklsvc.pkl', 'wb') as f:
    pickle.dump(clf, f)

Cell below is for timing classifier over many iterations so takes time to run.

In [52]:
clf = LinearSVC(random_state=0, tol=0.1, C=2.25, max_iter=1000000)


t0 = time.time()
for i in range(100): #time over 100 iterations due to function speed
    clf.fit(train_x, train_y)
t1 = time.time()
print("Train Time (average of 100 iterations)")
print((t1-t0)/100)

t2 = time.time()
for i in range(1000): #time over 1000 iterations due to function speed
    val_lab = clf.predict(val_x)
t3 = time.time()
print("Predict Time (average of 1000 iterations)")
print((t3-t2)/1000)

t4 = time.time()
for i in range(1000): #time over 1000 iterations due to function speed
      val_oos_lab = clf.predict(val_oos_x)
t5 = time.time()
print("OOS Predict Time (average of 1000 iterations)")
print((t5-t4)/1000)



Train Time (average of 100 iterations)
3.715258619785309
Predict Time (average of 1000 iterations)
0.07530556678771973
OOS Predict Time (average of 1000 iterations)
0.03411037397384643
