### Imports

In [1]:
import pandas as pd
import numpy as np
import pickle

import warnings
# NOTE(review): this silences every warning for the rest of the session, so any
# deprecation or convergence warnings raised by later cells will not be shown.
warnings.filterwarnings('ignore')
# Colab-only: mount Google Drive so the /content/drive/MyDrive/... paths used
# by the cells below can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
def read_df_and_display_num_rows_columns(filepath):
    """Load a pickled table from ``filepath`` and print its dimensions.

    Args:
        filepath: Path of the pickle file to read.

    Returns:
        The unpickled object. It must expose a ``shape`` with at least two
        entries (e.g. a pandas DataFrame), because both are printed.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # Announce the file before loading so a failed or slow load can be
    # attributed to a specific path.
    print(f"Reading file: {filepath}")
    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only call this on pickles that come from a trusted source.
    with open(filepath, "rb") as file:
        data = pickle.load(file)
    print(f"number of rows: {data.shape[0]}")
    print(f"number of columns: {data.shape[1]}")
    return data

In [3]:
def save_to_pickle(df, filename):
    """Write ``df`` to ``filename`` using the object's own ``to_pickle`` method.

    Args:
        df: Object providing ``to_pickle`` (e.g. a pandas DataFrame).
        filename: Destination path for the pickle file.
    """
    df.to_pickle(filename)

In [4]:
# Load the three dataset splits from Drive; each call also prints the split's
# path, row count and column count.
df_train = read_df_and_display_num_rows_columns(
    "/content/drive/MyDrive/ml-project-data-try/snli-sbert-dataset/train.pickle"
)
df_val = read_df_and_display_num_rows_columns(
    "/content/drive/MyDrive/ml-project-data-try/snli-sbert-dataset/validation.pickle"
)
df_test = read_df_and_display_num_rows_columns(
    "/content/drive/MyDrive/ml-project-data-try/snli-sbert-dataset/test.pickle"
)

Reading file: /content/drive/MyDrive/ml-project-data-try/snli-sbert-dataset/train.pickle
number of rows: 549350
number of columns: 6
Reading file: /content/drive/MyDrive/ml-project-data-try/snli-sbert-dataset/validation.pickle
number of rows: 9842
number of columns: 6
Reading file: /content/drive/MyDrive/ml-project-data-try/snli-sbert-dataset/test.pickle
number of rows: 9824
number of columns: 6


In [5]:
# Preview the first rows of the training frame (rendered as the cell output).
df_train.head()

Unnamed: 0,premise,hypothesis,label,premise_embedded,hypothesis_embedded,cosine_score
0,person horse jumps broken airplane,person training horse competition,neutral,"[-0.010034064, 0.0028005873, 0.0631274, 0.0080...","[-0.050083168, 0.020744428, 0.006436512, -0.00...",0.387597
1,person horse jumps broken airplane,person diner ordering omelette,contradiction,"[-0.010034064, 0.0028005873, 0.0631274, 0.0080...","[0.01991658, 0.08336658, 0.05666334, -0.005686...",0.048144
2,person horse jumps broken airplane,person outdoors horse,entailment,"[-0.010034064, 0.0028005873, 0.0631274, 0.0080...","[-0.015296373, 0.051944993, 0.06472147, -0.013...",0.514768
3,children smiling waving camera,smiling parents,neutral,"[-0.021928668, 0.06378024, 0.017834725, -0.032...","[-0.038321618, 0.09435368, 0.0064652245, 0.039...",0.594589
4,children smiling waving camera,children present,entailment,"[-0.021928668, 0.06378024, 0.017834725, -0.032...","[-0.018334052, 0.032883544, -0.014544778, 0.03...",0.569901


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
import scipy.sparse

In [7]:
vectorizer = TfidfVectorizer()
# One document per training row: the premise and the hypothesis joined by a
# single space, so one fitted vectorizer covers the text of both columns.
corpus = (
    df_train['premise'] + ' ' + df_train['hypothesis']
)
vectorizer.fit(corpus)

In [8]:
# Transform each sentence column with the fitted vectorizer, then place the
# two sparse blocks side by side (premise columns first, hypothesis second).
tfidf_premise, tfidf_hypothesis = (
    vectorizer.transform(df_train[column])
    for column in ('premise', 'hypothesis')
)
tfidf_features = scipy.sparse.hstack((tfidf_premise, tfidf_hypothesis))

In [9]:
# Integer class id assigned to each label string.
label_map = {'contradiction': 0, 'neutral': 1, 'entailment': 2}

# Series.map builds the numeric labels directly from the dict. Unlike
# Series.replace it does not depend on implicit object -> int downcasting, and
# a label missing from label_map becomes NaN instead of silently staying a
# string, which lets us fail loudly here rather than inside the classifier.
tfidf_labels = df_train['label'].map(label_map)
if tfidf_labels.isna().any():
    unmapped_train_labels = sorted(
        set(df_train.loc[tfidf_labels.isna(), 'label'].astype(str))
    )
    raise ValueError(f"Unmapped training labels: {unmapped_train_labels}")
tfidf_labels = tfidf_labels.astype(int)

In [10]:
from sklearn.metrics import accuracy_score

In [13]:
def train_SVM(tfidf_features, tfidf_labels, validation_features, validation_labels,
              model_path='/content/drive/MyDrive/ml-project-data-try/trained-models/SVM/NLI-SVM.pickle',
              max_iter=3000):
    """Fit an RBF-kernel SVC, persist it, and report validation performance.

    Args:
        tfidf_features: Training feature matrix passed to ``SVC.fit``.
        tfidf_labels: Training labels passed to ``SVC.fit``.
        validation_features: Feature matrix to predict on.
        validation_labels: True labels used for the classification report.
        model_path: Where the fitted model is pickled. Pass ``None`` to skip
            saving. Defaults to the notebook's Drive location.
        max_iter: Iteration cap forwarded to ``SVC``.

    Returns:
        The predictions produced by ``SVC.predict`` for ``validation_features``.
    """
    # NOTE(review): max_iter bounds the solver, so the fit may stop before it
    # converges; the warning filter set in the first cell would hide any
    # warning about that - confirm the reported scores come from a converged model.
    svc = svm.SVC(kernel='rbf', random_state=0, gamma='auto', max_iter=max_iter)

    # Train the model on the training set
    svc.fit(tfidf_features, tfidf_labels)

    # Persist the fitted model before doing anything else, so a failure while
    # predicting or reporting does not throw away the training run.
    if model_path is not None:
        with open(model_path, "wb") as file:
            pickle.dump(svc, file)

    # Predict the labels for the validation set
    predictions = svc.predict(validation_features)

    # Print the classification report
    print(classification_report(validation_labels, predictions))
    return predictions

In [12]:
# Encode the validation sentences with the same fitted vectorizer and stack
# them in the same order as the training features (premise, then hypothesis).
tfidf_premise_val = vectorizer.transform(df_val['premise'])
tfidf_hypothesis_val = vectorizer.transform(df_val['hypothesis'])
tfidf_features_val = scipy.sparse.hstack((tfidf_premise_val, tfidf_hypothesis_val))

# Series.map turns label strings into the integer ids of label_map; any label
# missing from the mapping becomes NaN, which is reported here instead of
# silently passing a string label on to the metrics.
val_labels = df_val['label'].map(label_map)
if val_labels.isna().any():
    unmapped_val_labels = sorted(
        set(df_val.loc[val_labels.isna(), 'label'].astype(str))
    )
    raise ValueError(f"Unmapped validation labels: {unmapped_val_labels}")
val_labels = val_labels.astype(int)

In [14]:
# Fit the SVM on the training features and get predictions for the validation
# features (train_SVM also prints the classification report shown below).
predictions = train_SVM(tfidf_features, tfidf_labels, tfidf_features_val, val_labels)
# Pair each validation label with its predicted label and persist the table.
results = pd.DataFrame({'Real Values': val_labels, 'Predicted Values': predictions})
save_to_pickle(results, '/content/drive/MyDrive/ml-project-data-try/trained-models/SVM/NLI-SVM-result.pickle')
# Last expression: render the comparison table as the cell output.
results

              precision    recall  f1-score   support

           0       0.58      0.15      0.24      3278
           1       0.38      0.69      0.49      3235
           2       0.44      0.41      0.42      3329

    accuracy                           0.42      9842
   macro avg       0.47      0.42      0.38      9842
weighted avg       0.47      0.42      0.38      9842



Unnamed: 0,Real Values,Predicted Values
0,1,1
1,2,2
2,0,2
3,2,2
4,1,2
...,...,...
9837,2,1
9838,0,1
9839,2,2
9840,0,1


In [15]:
# Fraction of validation rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=val_labels, y_pred=predictions)
print('Accuracy:', accuracy)

Accuracy: 0.4161755740703109
