In [2]:
# --- Imports and one-time setup for the intent-classification experiments ---
# NOTE(review): cudf, tqdm and spacy are imported but not referenced in the
# cells visible in this notebook; confirm before removing.
import pandas as pd
import cudf
from tqdm import tqdm
import spacy

from sklearn import preprocessing
# Shared label encoder; later cells refit it before encoding the label column.
le = preprocessing.LabelEncoder()
from sklearn.model_selection import train_test_split


from cuml.svm import SVC
import cuml
# Sets the cuML logger level; presumably this reduces log output, but the
# meaning of level 1 should be confirmed against the cuML logger documentation.
cuml.common.logger.set_level(1)


import cupy as cp

from sentence_transformers import SentenceTransformer
# Sentence-embedding model used by later cells via model.encode(...).
model = SentenceTransformer('all-MiniLM-L6-v2')

import pickle


In [3]:
# Load the intent dataset. The CSV has no header row, so the two columns are
# named explicitly here.
df = pd.read_csv(
    "/notebooks/intent-data.csv",
    header=None,
    names=["sentence", "label"],
)

In [4]:
# Learning-curve experiment: fit the same RBF SVC on increasingly large random
# samples of `df`, recording held-out accuracy and wall-clock time per sample
# size, and pickling the fitted model after every iteration.
import time

svc = SVC(kernel='rbf', C=10, gamma=1, cache_size=2000)

durations = []   # rounded minutes per iteration, in loop order
accuracies = []  # accuracy in percent per iteration, in loop order
version=1

for j,pct in enumerate([.1,.2,.3,.4,.5,.6, .7, .8, .9]):
    print(f"Iteration {j+1}")
    start_time = time.time()

    # Fixed seed so a given fraction always selects the same rows.
    df_sm = df.sample(n=int(len(df)*pct), random_state=0)

    print(f'({pct*100})% Sample : {len(df_sm)}')

    print("Encoding Sentences...")
    X = model.encode(df_sm.sentence.values)

    print("Transforming Labels...")
    # NOTE: the encoder is refit on every sample and is not saved with the
    # model, so integer class ids are only guaranteed to line up between the
    # pickled models if every sample contains the full label set.
    y = le.fit_transform(df_sm.label.values)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=0)

    # Convert the splits to CuPy arrays before handing them to cuML.
    X_train = cp.array(X_train)
    X_test = cp.array(X_test)
    y_train = cp.array(y_train)
    y_test = cp.array(y_test)

    print("Model Training Started...")
    # Re-fitting the same estimator replaces the previous iteration's fit.
    svc.fit(X_train, y_train)

    print("Model Prediction...")
    y_pred = svc.predict(X_test)

    # Count correct predictions with one vectorised comparison instead of a
    # Python loop that indexes the arrays element by element.
    n_correct = int((cp.asarray(y_pred) == y_test).sum())

    print("Predicted {0} correctly out of {1} test examples".format(n_correct, len(y_test)))

    accuracy=n_correct/len(y_test)*100
    accuracies.append(accuracy)

    print(f"Accuracy with {pct*100}%:  {accuracy}")

    # round() rather than int(): float products such as 0.29*100 can land just
    # below the integer, and truncation would then mislabel the file.
    model_path = f"svc_{round(pct*100)}_pct_v{version}.pkl"
    print(f"Saving model to {model_path}")

    # Context manager guarantees the file is flushed and closed.
    with open(model_path, "wb") as model_file:
        pickle.dump(svc, model_file)

    end_time = time.time()
    duration=round((end_time-start_time)/60)
    durations.append(duration)
    print(f"Time taken: {duration} minutes ")

    

Iteration 1
(70.0)% Sample : 545509
Predicted 170882 correctly out of 180018 test examples
Accuracy with 70.0%:  94.92495194924952
Time taken: 5 minutes 
Iteration 2
(80.0)% Sample : 623439
Predicted 196055 correctly out of 205735 test examples
Accuracy with 80.0%:  95.29491822004034
Time taken: 5 minutes 
Iteration 3
(90.0)% Sample : 701369
Predicted 220945 correctly out of 231452 test examples
Accuracy with 90.0%:  95.46039783626843
Time taken: 6 minutes 


In [None]:
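# Results recorded from earlier runs of the sampling loop above, kept for reference.
# The first six entries use an older log format, "Iteration N: t", where N equals
# the test-set size and t is presumably the elapsed time in minutes (unconfirmed).
# Sample (10.0): 77929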
# Predicted 22802 correctly out of 25717 test examples
# Accuracy with 10.0%:  88.66508535210173
# Iteration 25717: 0.9293102939923604 

# Sample (20.0): 155859
# Predicted 47234 correctly out of 51434 test examples
# Accuracy with 20.0%:  91.83419527938717
# Iteration 51434: 1.4464227517445882 

# Sample (30.0): 233789
# Predicted 71757 correctly out of 77151 test examples
# Accuracy with 30.0%:  93.00851576778007
# Iteration 77151: 2.013825758298238 

# Sample (40.0): 311719
# Predicted 96473 correctly out of 102868 test examples
# Accuracy with 40.0%:  93.78329509662869
# Iteration 102868: 2.6365856846173603 

# Sample (50.0): 389649
# Predicted 121224 correctly out of 128585 test examples
# Accuracy with 50.0%:  94.27538204300657
# Iteration 128585: 3.286288364728292 

# Sample (60.0): 467579
# Predicted 146123 correctly out of 154302 test examples
# Accuracy with 60.0%:  94.6993558087387
# Iteration 154302: 3.8842630704243977 

# (70.0)% Sample : 545509
# Predicted 170882 correctly out of 180018 test examples
# Accuracy with 70.0%:  94.92495194924952
# Time taken: 5 minutes 

# (80.0)% Sample : 623439
# Predicted 196055 correctly out of 205735 test examples
# Accuracy with 80.0%:  95.29491822004034
# Time taken: 5 minutes 

# (90.0)% Sample : 701369
# Predicted 220945 correctly out of 231452 test examples
# Accuracy with 90.0%:  95.46039783626843
# Time taken: 6 minutes 




# Complete Data

In [5]:

# Train and evaluate the SVC on the complete dataset (no sub-sampling), append
# the accuracy and duration to the running lists, and pickle the fitted model.
# Relies on `df`, `model`, `le`, `svc`, `accuracies`, `durations` and `time`
# from the earlier cells.
version=1
pct=1

start_time = time.time()

print("Encoding Sentences...")
X = model.encode(df.sentence.values)

print("Transforming Labels...")
# Fit the encoder on the same labels it transforms. Fitting on the leftover
# sample from the loop cell (`df_sm`) would raise on any label that sample
# happened to miss and makes this cell depend on hidden loop state.
y = le.fit_transform(df.label.values)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=0)

# Convert the splits to CuPy arrays before handing them to cuML.
X_train = cp.array(X_train)
X_test = cp.array(X_test)
y_train = cp.array(y_train)
y_test = cp.array(y_test)

print("Model Training Started...")
svc.fit(X_train, y_train)

print("Model Prediction...")
y_pred = svc.predict(X_test)

# Count correct predictions with one vectorised comparison instead of a Python
# loop that indexes the arrays element by element.
n_correct = int((cp.asarray(y_pred) == y_test).sum())

print("Predicted {0} correctly out of {1} test examples".format(n_correct, len(y_test)))

accuracy=n_correct/len(y_test)*100
accuracies.append(accuracy)

print(f"Accuracy with {pct*100}%:  {accuracy}")

model_path = f"svc_{int(pct*100)}_pct_v{version}.pkl"
print(f"Saving model to {model_path}")

# Context manager guarantees the file is flushed and closed.
with open(model_path, "wb") as model_file:
    pickle.dump(svc, model_file)

end_time = time.time()
duration=round((end_time-start_time)/60)
durations.append(duration)
print(f"Time taken: {duration} minutes ")


Encoding Sentences...
Transforming Labels...
Model Training Started...
Model Prediction...
Predicted 245891 correctly out of 257169 test examples
Accuracy with 100%:  95.61455696448638
Saving model to svc_100_pct_v1.pkl
Time taken: 6 minutes 
