# Intent Classification with SVM (SVC from cuML)

**1. Load Libraries**

In [37]:
import time 
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm

import cudf
import cupy as cp
import cuml
cuml.common.logger.set_level(1)
from cuml.svm import SVC

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer


**2. Load data**

In [26]:
# Locations of the training and evaluation CSV files, relative to the notebook.
train_input_path = "../data/intent_data.csv"
eval_input_path = "../data/intent_data_eval.csv"

# Read both files into pandas DataFrames (training first, then evaluation).
df, df_eval = (pd.read_csv(csv_path) for csv_path in (train_input_path, eval_input_path))

# Report the row count of each frame with thousands separators.
for caption, frame in (('Training Dataset Size: ', df), ('Evaluation Dataset Size: ', df_eval)):
    print(caption, f"{len(frame):,}")

Training Dataset Size:  1,870,411
Evaluation Dataset Size:  82


In [28]:
# Size of the rarest intent: count the non-null sentences per label and take
# the smallest of those counts.
# Selecting the 'sentence' column before aggregating makes .min() return a
# scalar directly, so no positional `[0]` lookup on a label-indexed Series is
# needed (that lookup is deprecated in pandas 2.x).
intent_min_count = int(df.groupby('label')['sentence'].count().min())
print(f'minimum count is {intent_min_count}')

minimum count is 214707


In [29]:
# Balance the classes: draw `intent_min_count` rows from every intent so that
# each selected label contributes the same number of sentences.
# total # of intents are 31
no_of_intents = 31
intents = df['label'].unique()

# Gather the per-intent samples in a list and concatenate once; calling
# pd.concat inside the loop re-copies the accumulated frame on every iteration.
# A fixed random_state makes the down-sampling reproducible across kernel restarts.
sampled_parts = [
    df[df['label'] == intent].sample(n=intent_min_count, random_state=0)
    for intent in intents[:no_of_intents]
]
# pd.concat raises on an empty list, so fall back to an empty frame when there are no labels.
df_sampled = pd.concat(sampled_parts) if sampled_parts else pd.DataFrame()

print('Size of new DataFrame is :',len(df_sampled))

Size of new DataFrame is : 644121


**3. Encode Sentences**

In [38]:
# Load the pretrained sentence-embedding model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Pull out the raw sentence columns, then embed the training set followed by the evaluation set.
train_sentences = df_sampled['sentence'].values
eval_sentences = df_eval['sentence'].values
X_train = model.encode(train_sentences)
X_eval = model.encode(eval_sentences)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

KeyboardInterrupt: 

**4. Encode Labels and Move Data to GPU**

In [None]:
# Map the intent labels to integer class ids with scikit-learn's LabelEncoder.
# The encoder must be created before it is fitted, and it is fitted on the
# down-sampled training frame `df_sampled`.
le = preprocessing.LabelEncoder()
le.fit(df_sampled.label.values)
y_train = le.transform(df_sampled.label.values)
# transform() raises ValueError if the evaluation set holds a label that was not seen during fit.
y_eval = le.transform(df_eval.label.values)

In [None]:
# The random train/test split below is disabled; the separately loaded evaluation set is used instead.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=0)
# Convert the features and labels to CuPy arrays, in a fixed order, rebinding the same names.
X_train, X_eval, y_train, y_eval = map(cp.array, (X_train, X_eval, y_train, y_eval))


In [None]:
# Distinct encoded class ids present in the training labels.
# NOTE(review): y_train has already been converted with cp.array in the previous cell, so
# np.unique presumably dispatches to CuPy and returns a device array here — confirm before
# handing `classes` to host-side (scikit-learn) code.
classes = np.unique(y_train)

**5. Initialize and train model**

In [None]:
# Hyper-parameters for the RBF-kernel support vector classifier, gathered in one place.
svc_params = {'kernel': 'rbf', 'C': 10, 'gamma': 1, 'cache_size': 2000}

# Build the classifier and fit it on the training features and labels.
svc = SVC(**svc_params)
svc.fit(X_train, y_train)


**6. Save Model**

In [None]:
# Persist the trained classifier to disk with pickle.
model_path = "svc_model.pkl"
print(f"Saving model to {model_path}")
# The context manager guarantees the file handle is flushed and closed, even if
# pickling raises; a bare open() passed straight to pickle.dump is never closed.
with open(model_path, "wb") as model_file:
    pickle.dump(svc, model_file)


**7. Evaluate Model**

In [None]:
# Predict on the evaluation set and print per-class precision / recall / F1.
# scikit-learn's metrics run on host memory, so the prediction, the ground truth
# and the label list are converted with cp.asnumpy first (it also accepts arrays
# that are already NumPy arrays).
y_pred = cp.asnumpy(svc.predict(X_eval))
y_true = cp.asnumpy(y_eval)
# Report on every class id seen in training, even those absent from the evaluation set.
report_labels = cp.asnumpy(classes)
print(classification_report(y_true=y_true, y_pred=y_pred, labels=report_labels))

In [None]:
# # Count the number of correct predictions
# n_correct = 0
# for i in range(len(y_eval)):
#     if y_pred[i] == y_eval[i]:
#         n_correct += 1
# print("Predicted {0} correctly out of {1} test examples".format(n_correct, len(y_eval)))
# accuracy=n_correct/len(y_eval)*100

# print(f"Accuracy: {accuracy}")
