### Imports

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import mean_squared_error, mean_absolute_error

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter would also hide any solver/convergence
# warnings raised later when the SVR is fitted with a capped max_iter —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
from google.colab import drive
# Mount Google Drive; the dataset, model and result paths used in this
# notebook all live under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
def read_df_and_display_num_rows_columns(filepath):
    """Unpickle the object stored at ``filepath`` and report its dimensions.

    The file is opened in binary mode and deserialised with ``pickle.load``.
    After loading, the path and the first two entries of the object's
    ``shape`` (rows, then columns) are printed to stdout.

    Args:
        filepath: Path of the pickle file to read.

    Returns:
        The unpickled object (it must expose a two-element ``shape``).

    Note:
        ``pickle.load`` can execute arbitrary code; only use this on files
        from a trusted source.
    """
    with open(filepath, "rb") as handle:
        frame = pickle.load(handle)

    # Reporting happens only after a successful load, in a fixed order.
    print(f"Reading file: {filepath}")
    print(f"number of rows: {frame.shape[0]}")
    print(f"number of columns: {frame.shape[1]}")
    return frame

In [3]:
def save_to_pickle(df, filename):
    """Write ``df`` to ``filename`` by calling its ``to_pickle`` method.

    Args:
        df: Object exposing ``to_pickle`` (e.g. a pandas DataFrame).
        filename: Destination handed straight to ``df.to_pickle``.

    Returns:
        None.
    """
    df.to_pickle(filename)

In [4]:
# Single place to point the notebook at the pickled SNLI/SBERT splits;
# previously this absolute Drive path was repeated in every call below.
SNLI_SBERT_DIR = "/content/drive/MyDrive/ml-project-data-try/snli-sbert-dataset"

# Load the three splits; each call prints the file path and the frame's shape.
df_train = read_df_and_display_num_rows_columns(f"{SNLI_SBERT_DIR}/train.pickle")
df_val = read_df_and_display_num_rows_columns(f"{SNLI_SBERT_DIR}/validation.pickle")
df_test = read_df_and_display_num_rows_columns(f"{SNLI_SBERT_DIR}/test.pickle")

Reading file: /content/drive/MyDrive/ml-project-data-try/snli-sbert-dataset/train.pickle
number of rows: 549350
number of columns: 6
Reading file: /content/drive/MyDrive/ml-project-data-try/snli-sbert-dataset/validation.pickle
number of rows: 9842
number of columns: 6
Reading file: /content/drive/MyDrive/ml-project-data-try/snli-sbert-dataset/test.pickle
number of rows: 9824
number of columns: 6


In [5]:
# Preview the first five training rows to check the available columns.
df_train.head(n=5)

Unnamed: 0,premise,hypothesis,label,premise_embedded,hypothesis_embedded,cosine_score
0,person horse jumps broken airplane,person training horse competition,neutral,"[-0.010034064, 0.0028005873, 0.0631274, 0.0080...","[-0.050083168, 0.020744428, 0.006436512, -0.00...",0.387597
1,person horse jumps broken airplane,person diner ordering omelette,contradiction,"[-0.010034064, 0.0028005873, 0.0631274, 0.0080...","[0.01991658, 0.08336658, 0.05666334, -0.005686...",0.048144
2,person horse jumps broken airplane,person outdoors horse,entailment,"[-0.010034064, 0.0028005873, 0.0631274, 0.0080...","[-0.015296373, 0.051944993, 0.06472147, -0.013...",0.514768
3,children smiling waving camera,smiling parents,neutral,"[-0.021928668, 0.06378024, 0.017834725, -0.032...","[-0.038321618, 0.09435368, 0.0064652245, 0.039...",0.594589
4,children smiling waving camera,children present,entailment,"[-0.021928668, 0.06378024, 0.017834725, -0.032...","[-0.018334052, 0.032883544, -0.014544778, 0.03...",0.569901


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
import scipy.sparse

In [7]:
# Fit one TF-IDF vocabulary on the premise and hypothesis text joined by a
# space, so the same vectorizer can later transform either column.
premise_text = df_train['premise']
hypothesis_text = df_train['hypothesis']
corpus = premise_text + ' ' + hypothesis_text

vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)

In [8]:
# Vectorise each sentence column with the shared vocabulary, then place the
# premise block and the hypothesis block side by side (column-wise) so each
# row describes one sentence pair.
tfidf_premise, tfidf_hypothesis = (
    vectorizer.transform(df_train[column]) for column in ('premise', 'hypothesis')
)
tfidf_features = scipy.sparse.hstack([tfidf_premise, tfidf_hypothesis])

In [9]:
# Regression target for the training rows: the continuous `cosine_score`
# column (presumably the similarity of the two embedding columns — confirm).
tfidf_labels = df_train.loc[:, 'cosine_score']

In [13]:
def train_SVR(tfidf_features, tfidf_labels, validation_features, validation_labels,
              model_path='/content/drive/MyDrive/ml-project-data-try/trained-models/SVM/STS-SVM.pickle',
              kernel='rbf', gamma='auto', max_iter=3000):
    """Fit a support-vector regressor and predict on the validation features.

    Args:
        tfidf_features: Feature matrix the regressor is fitted on.
        tfidf_labels: Regression targets aligned with ``tfidf_features``.
        validation_features: Feature matrix to predict after fitting.
        validation_labels: Not used by this function; kept so existing
            four-argument calls continue to work.
        model_path: Where the fitted model is pickled. Pass ``None`` to skip
            saving. Defaults to the path the notebook has always used.
        kernel: Forwarded to ``svm.SVR``.
        gamma: Forwarded to ``svm.SVR``.
        max_iter: Forwarded to ``svm.SVR``; hard cap on solver iterations.

    Returns:
        The output of ``svr.predict(validation_features)``.

    Note:
        This function only returns predictions; evaluation metrics are
        computed by the caller.
    """
    # NOTE(review): with a capped max_iter the solver may stop before
    # converging, and the validation predictions recorded in this notebook
    # are almost constant (~0.437). Consider revisiting gamma/max_iter or
    # training on a subsample — confirm before relying on this model.
    svr = svm.SVR(kernel=kernel, gamma=gamma, max_iter=max_iter)

    # Fit on the training pairs.
    svr.fit(tfidf_features, tfidf_labels)

    # Persist the fitted model *before* predicting so an expensive fit is not
    # lost if the prediction step fails.
    if model_path is not None:
        with open(model_path, "wb") as file:
            pickle.dump(svr, file)

    # Predict scores for the validation pairs.
    predictions = svr.predict(validation_features)
    return predictions

In [11]:
# Build the validation inputs the same way as the training inputs: transform
# both sentence columns with the already-fitted vectorizer (no re-fitting)
# and stack the two blocks column-wise.
tfidf_premise_val, tfidf_hypothesis_val = (
    vectorizer.transform(df_val[column]) for column in ('premise', 'hypothesis')
)
tfidf_features_val = scipy.sparse.hstack([tfidf_premise_val, tfidf_hypothesis_val])

# Continuous target values for the validation rows.
val_labels = df_val['cosine_score']

In [14]:
# Train the regressor and get its predictions for the validation pairs.
predictions = train_SVR(
    tfidf_features,
    tfidf_labels,
    tfidf_features_val,
    val_labels,
)

# Side-by-side table of true vs. predicted scores (no metric is computed here).
results = pd.DataFrame(
    {
        'Real Values': val_labels,
        'Predicted Values': predictions,
    }
)

# NOTE(review): this writes the SVM results under the `LR` directory while the
# model itself is saved under `SVM` — looks like a copy-paste leftover; confirm
# the intended location before changing it.
results.to_pickle('/content/drive/MyDrive/ml-project-data-try/trained-models/LR/STS-SVM-result.pickle')
results

Unnamed: 0,Real Values,Predicted Values
0,0.583258,0.436585
1,0.763924,0.436676
2,0.069606,0.436803
3,0.753461,0.437093
4,0.595102,0.437102
...,...,...
9837,0.514098,0.436871
9838,0.354324,0.436650
9839,0.419467,0.437762
9840,0.295545,0.436630


In [15]:
# Calculate evaluation metrics
# Regression error of the validation predictions against the true scores.
mse = mean_squared_error(val_labels, predictions)
mae = mean_absolute_error(val_labels, predictions)

# Report both metrics, one per line.
for metric_label, metric_value in (
    ("Mean Squared Error:", mse),
    ("Mean Absolute Error:", mae),
):
    print(metric_label, metric_value)

Mean Squared Error: 0.051308666981074426
Mean Absolute Error: 0.18517584282087765
