### Imports

In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
def read_df_and_display_num_rows_columns(filepath):
    """Load a CSV file into a DataFrame and print a short size summary.

    Args:
        filepath: Location of the CSV file; handed unchanged to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    frame = pd.read_csv(filepath)
    n_rows, n_cols = frame.shape

    # Report which file was read and how large the resulting table is.
    summary_lines = (
        f"Reading file: {filepath}",
        f"number of rows: {n_rows}",
        f"number of columns: {n_cols}",
    )
    for line in summary_lines:
        print(line)
    return frame

In [3]:
import pickle
def save_to_pickle(df, filename):
    """Serialize ``df`` to ``filename`` using ``DataFrame.to_pickle``.

    When ``filename`` is a local filesystem path, missing parent directories
    are created first so the write does not fail with ``FileNotFoundError``
    (for example when the output folder has not been created yet). URL-style
    strings and non-path targets are passed to ``to_pickle`` untouched.

    Args:
        df: Object exposing ``to_pickle`` (a pandas DataFrame in this notebook).
        filename: Destination path (``str`` or ``os.PathLike``) or any other
            target accepted by ``to_pickle``.
    """
    import os

    if isinstance(filename, (str, os.PathLike)):
        target = os.fspath(filename)
        # Only touch the local filesystem; leave "scheme://..." targets alone.
        if isinstance(target, str) and "://" not in target:
            parent_dir = os.path.dirname(target)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
    df.to_pickle(filename)

In [4]:
# Load the three pre-processed SNLI splits, in train / val / test order.
df_train, df_val, df_test = map(
    read_df_and_display_num_rows_columns,
    (
        "snli processed/snli_data_train.csv",
        "snli processed/snli_data_val.csv",
        "snli processed/snli_data_test.csv",
    ),
)

Reading file: snli processed/snli_data_train.csv
number of rows: 549367
number of columns: 3
Reading file: snli processed/snli_data_val.csv
number of rows: 9842
number of columns: 3
Reading file: snli processed/snli_data_test.csv
number of rows: 9824
number of columns: 3


In [5]:
# Peek at the first five training rows.
df_train.head(n=5)

Unnamed: 0,label,premise,hypothesis
0,neutral,person horse jumps broken airplane,person training horse competition
1,contradiction,person horse jumps broken airplane,person diner ordering omelette
2,entailment,person horse jumps broken airplane,person outdoors horse
3,neutral,children smiling waving camera,smiling parents
4,entailment,children smiling waving camera,children present


In [6]:
def remove_null_from_df(df, name):
    """Print per-column null counts for ``df`` and return it without null rows.

    Args:
        df: DataFrame to inspect; it is not modified.
        name: Human-readable dataset name used in the printed header.

    Returns:
        A new DataFrame with every row containing a null removed and the
        index renumbered from zero.
    """
    print(f"Number of null rows in {name}")
    null_counts = df.isnull().sum()
    print(null_counts)

    without_nulls = df.dropna()
    return without_nulls.reset_index(drop=True)

In [7]:
# Drop rows with missing values from the train and validation splits.
data_train = remove_null_from_df(df=df_train, name="SNLI train dataset")
data_val = remove_null_from_df(df=df_val, name="SNLI validation dataset")

Number of null rows in SNLI train dataset
label          0
premise        0
hypothesis    17
dtype: int64
Number of null rows in SNLI validation dataset
label         0
premise       0
hypothesis    0
dtype: int64


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
import scipy.sparse

In [9]:
# Build one text per training pair (premise and hypothesis joined by a space)
# so a single TF-IDF vocabulary is learned from both sentences.
corpus = data_train["premise"] + " " + data_train["hypothesis"]

vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)

In [10]:
# Encode premise and hypothesis separately with the shared vocabulary...
tfidf_premise, tfidf_hypothesis = (
    vectorizer.transform(data_train[column])
    for column in ("premise", "hypothesis")
)
# ...then place the two sparse blocks side by side: [premise | hypothesis].
tfidf_features = scipy.sparse.hstack([tfidf_premise, tfidf_hypothesis])

In [11]:
label_map = {'contradiction': 0, 'neutral': 1, 'entailment': 2}

# Encode string labels as integers with an explicit lookup. `.map` is used
# instead of `.replace` because `.replace` leaves unmapped labels in place as
# strings and depends on pandas silently downcasting the object column to
# integers, which newer pandas releases deprecate.
tfidf_labels = data_train['label'].map(label_map)
if tfidf_labels.isna().any():
    # Fail loudly rather than hand NaN / mixed-type targets to the classifier.
    raise ValueError(
        "Unexpected labels in training data: "
        f"{data_train.loc[tfidf_labels.isna(), 'label'].unique().tolist()}"
    )
tfidf_labels = tfidf_labels.astype('int64')

In [12]:
from sklearn.metrics import accuracy_score

In [16]:
def train_SVM(tfidf_features, tfidf_labels, validation_features, validation_labels,
              model_path='saved/SVM.pickle'):
    """Fit an RBF-kernel SVM, pickle it, and report validation metrics.

    Args:
        tfidf_features: Training feature matrix passed to ``SVC.fit``.
        tfidf_labels: Training targets passed to ``SVC.fit``.
        validation_features: Feature matrix passed to ``SVC.predict``.
        validation_labels: Reference labels used for the classification report.
        model_path: File the fitted model is pickled to. Defaults to the
            previously hard-coded ``'saved/SVM.pickle'``.

    Returns:
        The labels predicted for ``validation_features``.
    """
    import os

    # Make sure the output folder exists *before* the expensive fit, so a
    # missing directory is discovered up front instead of after training.
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    # Create the SVM model. max_iter=2000 caps the solver's iterations.
    # NOTE(review): if the cap is hit the model may be unconverged, and this
    # notebook silences warnings globally, so that would go unnoticed - confirm
    # the cap is intentional.
    svc = svm.SVC(kernel='rbf', random_state=0, gamma='auto', max_iter=2000)

    # Train the model on the training set
    svc.fit(tfidf_features, tfidf_labels)

    # Persist the fitted model right away so it survives a failure in the
    # prediction / reporting steps below.
    with open(model_path, "wb") as file:
        pickle.dump(svc, file)

    # Predict the labels for the validation set
    predictions = svc.predict(validation_features)

    # Print the classification report
    print(classification_report(validation_labels, predictions))
    return predictions

In [17]:
# Encode the validation pairs with the vocabulary fitted on the training data
# and lay the two sparse blocks side by side: [premise | hypothesis].
tfidf_premise_val = vectorizer.transform(data_val['premise'])
tfidf_hypothesis_val = vectorizer.transform(data_val['hypothesis'])
tfidf_features_val = scipy.sparse.hstack((tfidf_premise_val, tfidf_hypothesis_val))

# Encode string labels as integers with an explicit lookup. `.map` is used
# instead of `.replace` because `.replace` leaves unmapped labels in place as
# strings and depends on pandas silently downcasting the object column to
# integers, which newer pandas releases deprecate.
val_labels = data_val['label'].map(label_map)
if val_labels.isna().any():
    # Fail loudly rather than score against NaN / mixed-type targets.
    raise ValueError(
        "Unexpected labels in validation data: "
        f"{data_val.loc[val_labels.isna(), 'label'].unique().tolist()}"
    )
val_labels = val_labels.astype('int64')

In [18]:
# Train the SVM and collect its predictions for the validation split.
predictions = train_SVM(
    tfidf_features=tfidf_features,
    tfidf_labels=tfidf_labels,
    validation_features=tfidf_features_val,
    validation_labels=val_labels,
)

# Pair the true labels with the predicted ones and persist the table.
results = pd.DataFrame(
    {
        'Real Values': val_labels,
        'Predicted Values': predictions,
    }
)
save_to_pickle(results, 'results/SVM_results.pickle')
results

              precision    recall  f1-score   support

           0       0.54      0.29      0.38      3278
           1       0.42      0.43      0.43      3235
           2       0.41      0.58      0.48      3329

    accuracy                           0.44      9842
   macro avg       0.46      0.44      0.43      9842
weighted avg       0.46      0.44      0.43      9842



Unnamed: 0,Real Values,Predicted Values
0,1,0
1,2,2
2,0,2
3,2,2
4,1,2
...,...,...
9837,2,1
9838,0,1
9839,2,2
9840,0,1


In [19]:
# Overall fraction of validation pairs whose label was predicted correctly.
accuracy = accuracy_score(y_true=val_labels, y_pred=predictions)
print('Accuracy:', accuracy)

Accuracy: 0.4364966470229628
