### Imports

In [1]:
import pandas as pd
import pickle
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from google.colab import drive
# Mount Google Drive so the CSV inputs and pickle outputs under /content/drive/MyDrive are reachable.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
from sentence_transformers import SentenceTransformer, util
# Load the 'all-MiniLM-L6-v2' Sentence-Transformers checkpoint once; this single
# `model` encodes the train, validation and test sentences in the cells below.
model = SentenceTransformer('all-MiniLM-L6-v2')

### Data Loading

In [21]:
# Load the preprocessed SNLI training split from Drive. Rows with missing values are
# deliberately kept here and dropped in the next cell, so the counts printed below
# describe the file as stored.
df_train = pd.read_csv("/content/drive/MyDrive/ml-project-data-try/snli-preprocessed/snli_data_train.csv")
print(f"number of rows: {df_train.shape[0]}")
print(f"number of columns: {df_train.shape[1]}")
df_train.head()

number of rows: 549367
number of columns: 3


Unnamed: 0,label,premise,hypothesis
0,neutral,person horse jumps broken airplane,person training horse competition
1,contradiction,person horse jumps broken airplane,person diner ordering omelette
2,entailment,person horse jumps broken airplane,person outdoors horse
3,neutral,children smiling waving camera,smiling parents
4,entailment,children smiling waving camera,children present


In [24]:
# Drop rows with missing values, then renumber the index 0..n-1. The reset matters:
# later cells use the index label (`row.name`, and the `iterrows` index inside
# `preprocess_data`) as a position into the embedding tensors.
df_train = df_train.dropna().reset_index(drop=True)
print(f"number of rows: {df_train.shape[0]}")
print(f"number of columns: {df_train.shape[1]}")
df_train.tail()

number of rows: 549350
number of columns: 3


Unnamed: 0,label,premise,hypothesis
549345,contradiction,four dirty barefooted children,four kids awards cleanest feet
549346,neutral,four dirty barefooted children,four homeless children shoes stolen feet dirty
549347,neutral,man surfing bodysuit beautiful blue water,man bodysuit competing surfing competition
549348,contradiction,man surfing bodysuit beautiful blue water,man business suit heading board meeting
549349,entailment,man surfing bodysuit beautiful blue water,beautiful blue water man bodysuit surfing


In [25]:
# Load the preprocessed SNLI validation split from Drive.
df_val = pd.read_csv("/content/drive/MyDrive/ml-project-data-try/snli-preprocessed/snli_data_val.csv")
# Drop incomplete rows and renumber the index 0..n-1, exactly as done for the training
# split. Without the reset, any dropped row leaves a gap in the index labels, and the
# cells below that use `row.name` as a position into the embedding tensors would pair
# sentences with the wrong vectors or index past the end.
df_val = df_val.dropna().reset_index(drop=True)
print(f"number of rows: {df_val.shape[0]}")
print(f"number of columns: {df_val.shape[1]}")
df_val.head()

number of rows: 9842
number of columns: 3


Unnamed: 0,label,premise,hypothesis
0,neutral,two women embracing holding go packages,sisters hugging goodbye holding go packages ea...
1,entailment,two women embracing holding go packages,two woman holding packages
2,contradiction,two women embracing holding go packages,men fighting outside deli
3,entailment,two young children blue jerseys one number nin...,two kids numbered jerseys wash hands
4,neutral,two young children blue jerseys one number nin...,two kids ballgame wash hands


In [26]:
# Load the preprocessed SNLI test split from Drive.
df_test = pd.read_csv("/content/drive/MyDrive/ml-project-data-try/snli-preprocessed/snli_data_test.csv")
# Drop incomplete rows and renumber the index 0..n-1, exactly as done for the training
# split. Without the reset, any dropped row leaves a gap in the index labels, and the
# cells below that use `row.name` as a position into the embedding tensors would pair
# sentences with the wrong vectors or index past the end.
df_test = df_test.dropna().reset_index(drop=True)
print(f"number of rows: {df_test.shape[0]}")
print(f"number of columns: {df_test.shape[1]}")
df_test.head()

number of rows: 9824
number of columns: 3


Unnamed: 0,label,premise,hypothesis
0,neutral,church choir sings masses sing joyous songs bo...,church cracks ceiling
1,entailment,church choir sings masses sing joyous songs bo...,church filled song
2,contradiction,church choir sings masses sing joyous songs bo...,choir singing baseball game
3,neutral,woman green headscarf blue shirt big grin,woman young
4,entailment,woman green headscarf blue shirt big grin,woman happy


### Sentence embeddings and cosine similarity

#### Train data

In [27]:
# Pull both sentence columns of the training split out as NumPy arrays; these are the
# inputs handed to the encoder in the next cell.
hypothesis_train, premise_train = (
    np.array(df_train[column]) for column in ('hypothesis', 'premise')
)

In [28]:
# Encode every training hypothesis and premise. Both calls ask for tensor output
# (convert_to_tensor=True); only the hypothesis pass explicitly requests a progress bar.
embeddings_hypothesis_train = model.encode(
    hypothesis_train,
    convert_to_tensor=True,
    show_progress_bar=True,
)
embeddings_premise_train = model.encode(
    premise_train,
    convert_to_tensor=True,
)

Batches:   0%|          | 0/17168 [00:00<?, ?it/s]

In [29]:
# Cosine similarity between each row's hypothesis and premise embedding. `pair.name` is
# the row's index label and is used as a position into the tensors, which is valid
# because df_train's index was reset to 0..n-1 above. Each cell ends up holding a
# single-element tensor; the next cell unwraps it.
df_train['cosine_scores'] = df_train.apply(
    lambda pair: util.cos_sim(
        embeddings_hypothesis_train[pair.name].cpu().numpy(),
        embeddings_premise_train[pair.name].cpu().numpy(),
    ),
    axis=1,
)
df_train.head()

Unnamed: 0,label,premise,hypothesis,cosine_scores
0,neutral,person horse jumps broken airplane,person training horse competition,[[tensor(0.3876)]]
1,contradiction,person horse jumps broken airplane,person diner ordering omelette,[[tensor(0.0481)]]
2,entailment,person horse jumps broken airplane,person outdoors horse,[[tensor(0.5148)]]
3,neutral,children smiling waving camera,smiling parents,[[tensor(0.5946)]]
4,entailment,children smiling waving camera,children present,[[tensor(0.5699)]]


In [30]:
# Unwrap each single-element similarity tensor into a plain number.
df_train['cosine_scores'] = [score.item() for score in df_train['cosine_scores']]
df_train.head()

Unnamed: 0,label,premise,hypothesis,cosine_scores
0,neutral,person horse jumps broken airplane,person training horse competition,0.387597
1,contradiction,person horse jumps broken airplane,person diner ordering omelette,0.048144
2,entailment,person horse jumps broken airplane,person outdoors horse,0.514768
3,neutral,children smiling waving camera,smiling parents,0.594589
4,entailment,children smiling waving camera,children present,0.569901


In [31]:
def preprocess_data(df, embedded_hypothesis, embedded_premise):
    """Assemble the final per-pair dataset: text, label, both embeddings and cosine score.

    Embeddings are matched to rows by *position* (the first row of ``df`` goes with
    ``embedded_*[0]``, and so on) rather than by index label, so the pairing stays
    correct even when ``df`` carries a non-contiguous index (for example after
    ``dropna`` without ``reset_index``).

    Args:
        df: DataFrame with 'label', 'premise', 'hypothesis' and 'cosine_scores' columns.
        embedded_hypothesis: Indexable collection with one hypothesis embedding per
            row of ``df``; each element must support ``.cpu().numpy()``.
        embedded_premise: Same as ``embedded_hypothesis``, for the premises.

    Returns:
        A new DataFrame with columns 'premise', 'hypothesis', 'label',
        'premise_embedded', 'hypothesis_embedded' and 'cosine_score', on a fresh
        0..n-1 index. ``df`` itself is not modified.

    Raises:
        ValueError: If either embedding collection does not have exactly one entry
            per row of ``df``.
    """
    row_count = len(df)
    if len(embedded_hypothesis) != row_count or len(embedded_premise) != row_count:
        raise ValueError(
            "embeddings must have one entry per row: "
            f"{row_count} rows, {len(embedded_hypothesis)} hypothesis embeddings, "
            f"{len(embedded_premise)} premise embeddings"
        )

    # Iterate the columns in lockstep and count positions explicitly; this avoids both
    # the label-as-position assumption and the per-row Series construction of iterrows.
    columns = zip(df['label'], df['premise'], df['hypothesis'], df['cosine_scores'])
    processed_data = [
        (
            premise,
            hypothesis,
            label,
            embedded_premise[position].cpu().numpy(),
            embedded_hypothesis[position].cpu().numpy(),
            cosine_score,
        )
        for position, (label, premise, hypothesis, cosine_score) in enumerate(columns)
    ]

    return pd.DataFrame(processed_data, columns=['premise', 'hypothesis', 'label', 'premise_embedded', 'hypothesis_embedded', 'cosine_score'])

In [32]:
# Work from a copy of the scored training frame and combine it with its embeddings
# into the final dataset.
df_train_final = df_train.copy()
processed_train_data = preprocess_data(
    df=df_train_final,
    embedded_hypothesis=embeddings_hypothesis_train,
    embedded_premise=embeddings_premise_train,
)
processed_train_data.head()

Unnamed: 0,premise,hypothesis,label,premise_embedded,hypothesis_embedded,cosine_score
0,person horse jumps broken airplane,person training horse competition,neutral,"[-0.010034064, 0.0028005873, 0.0631274, 0.0080...","[-0.050083168, 0.020744428, 0.006436512, -0.00...",0.387597
1,person horse jumps broken airplane,person diner ordering omelette,contradiction,"[-0.010034064, 0.0028005873, 0.0631274, 0.0080...","[0.01991658, 0.08336658, 0.05666334, -0.005686...",0.048144
2,person horse jumps broken airplane,person outdoors horse,entailment,"[-0.010034064, 0.0028005873, 0.0631274, 0.0080...","[-0.015296373, 0.051944993, 0.06472147, -0.013...",0.514768
3,children smiling waving camera,smiling parents,neutral,"[-0.021928668, 0.06378024, 0.017834725, -0.032...","[-0.038321618, 0.09435368, 0.0064652245, 0.039...",0.594589
4,children smiling waving camera,children present,entailment,"[-0.021928668, 0.06378024, 0.017834725, -0.032...","[-0.018334052, 0.032883544, -0.014544778, 0.03...",0.569901


#### Validation data

In [33]:
# Pull both sentence columns of the validation split out as NumPy arrays; these are the
# inputs handed to the encoder in the next cell.
hypothesis_val, premise_val = (
    np.array(df_val[column]) for column in ('hypothesis', 'premise')
)

In [34]:
# Encode every validation hypothesis and premise. Both calls ask for tensor output
# (convert_to_tensor=True); only the hypothesis pass explicitly requests a progress bar.
embeddings_hypothesis_val = model.encode(
    hypothesis_val,
    convert_to_tensor=True,
    show_progress_bar=True,
)
embeddings_premise_val = model.encode(
    premise_val,
    convert_to_tensor=True,
)

Batches:   0%|          | 0/308 [00:00<?, ?it/s]

In [35]:
# Cosine similarity between each row's hypothesis and premise embedding. `pair.name` is
# the row's index label and is used as a position into the tensors.
# NOTE(review): this pairing is only correct while df_val's index labels run 0..n-1,
# i.e. no row was dropped on load or the index was reset afterwards.
df_val['cosine_scores'] = df_val.apply(
    lambda pair: util.cos_sim(
        embeddings_hypothesis_val[pair.name].cpu().numpy(),
        embeddings_premise_val[pair.name].cpu().numpy(),
    ),
    axis=1,
)
df_val.head()

Unnamed: 0,label,premise,hypothesis,cosine_scores
0,neutral,two women embracing holding go packages,sisters hugging goodbye holding go packages ea...,[[tensor(0.5833)]]
1,entailment,two women embracing holding go packages,two woman holding packages,[[tensor(0.7639)]]
2,contradiction,two women embracing holding go packages,men fighting outside deli,[[tensor(0.0696)]]
3,entailment,two young children blue jerseys one number nin...,two kids numbered jerseys wash hands,[[tensor(0.7535)]]
4,neutral,two young children blue jerseys one number nin...,two kids ballgame wash hands,[[tensor(0.5951)]]


In [36]:
# Unwrap each single-element similarity tensor into a plain number.
df_val['cosine_scores'] = [score.item() for score in df_val['cosine_scores']]
df_val.head()

Unnamed: 0,label,premise,hypothesis,cosine_scores
0,neutral,two women embracing holding go packages,sisters hugging goodbye holding go packages ea...,0.583258
1,entailment,two women embracing holding go packages,two woman holding packages,0.763924
2,contradiction,two women embracing holding go packages,men fighting outside deli,0.069606
3,entailment,two young children blue jerseys one number nin...,two kids numbered jerseys wash hands,0.753461
4,neutral,two young children blue jerseys one number nin...,two kids ballgame wash hands,0.595102


In [37]:
# Work from a copy of the scored validation frame and combine it with its embeddings
# into the final dataset.
df_val_final = df_val.copy()
processed_val_data = preprocess_data(
    df=df_val_final,
    embedded_hypothesis=embeddings_hypothesis_val,
    embedded_premise=embeddings_premise_val,
)
processed_val_data.head()

Unnamed: 0,premise,hypothesis,label,premise_embedded,hypothesis_embedded,cosine_score
0,two women embracing holding go packages,sisters hugging goodbye holding go packages ea...,neutral,"[0.0042251493, -0.003406235, -0.0019028404, 0....","[0.024690624, 0.09049612, 0.030129503, -0.0052...",0.583258
1,two women embracing holding go packages,two woman holding packages,entailment,"[0.0042251493, -0.003406235, -0.0019028404, 0....","[-0.07236776, -0.020317988, -0.034565713, 0.02...",0.763924
2,two women embracing holding go packages,men fighting outside deli,contradiction,"[0.0042251493, -0.003406235, -0.0019028404, 0....","[0.035375293, 0.042607836, -0.0109004835, 0.00...",0.069606
3,two young children blue jerseys one number nin...,two kids numbered jerseys wash hands,entailment,"[-0.08395465, 0.033306517, 0.05497519, -0.0349...","[-0.085580505, 0.06142286, 0.04118396, -0.0191...",0.753461
4,two young children blue jerseys one number nin...,two kids ballgame wash hands,neutral,"[-0.08395465, 0.033306517, 0.05497519, -0.0349...","[-0.035818074, 0.0477772, 0.06401619, -0.03154...",0.595102


#### Test data

In [38]:
# Pull both sentence columns of the test split out as NumPy arrays; these are the
# inputs handed to the encoder in the next cell.
hypothesis_test, premise_test = (
    np.array(df_test[column]) for column in ('hypothesis', 'premise')
)

In [39]:
# Encode every test hypothesis and premise. Both calls ask for tensor output
# (convert_to_tensor=True); only the hypothesis pass explicitly requests a progress bar.
embeddings_hypothesis_test = model.encode(
    hypothesis_test,
    convert_to_tensor=True,
    show_progress_bar=True,
)
embeddings_premise_test = model.encode(
    premise_test,
    convert_to_tensor=True,
)

Batches:   0%|          | 0/307 [00:00<?, ?it/s]

In [40]:
# Cosine similarity between each row's hypothesis and premise embedding. `pair.name` is
# the row's index label and is used as a position into the tensors.
# NOTE(review): this pairing is only correct while df_test's index labels run 0..n-1,
# i.e. no row was dropped on load or the index was reset afterwards.
df_test['cosine_scores'] = df_test.apply(
    lambda pair: util.cos_sim(
        embeddings_hypothesis_test[pair.name].cpu().numpy(),
        embeddings_premise_test[pair.name].cpu().numpy(),
    ),
    axis=1,
)
df_test.head()

Unnamed: 0,label,premise,hypothesis,cosine_scores
0,neutral,church choir sings masses sing joyous songs bo...,church cracks ceiling,[[tensor(0.3420)]]
1,entailment,church choir sings masses sing joyous songs bo...,church filled song,[[tensor(0.6649)]]
2,contradiction,church choir sings masses sing joyous songs bo...,choir singing baseball game,[[tensor(0.5633)]]
3,neutral,woman green headscarf blue shirt big grin,woman young,[[tensor(0.3099)]]
4,entailment,woman green headscarf blue shirt big grin,woman happy,[[tensor(0.5116)]]


In [41]:
# Unwrap each single-element similarity tensor into a plain number.
df_test['cosine_scores'] = [score.item() for score in df_test['cosine_scores']]
df_test.head()

Unnamed: 0,label,premise,hypothesis,cosine_scores
0,neutral,church choir sings masses sing joyous songs bo...,church cracks ceiling,0.342046
1,entailment,church choir sings masses sing joyous songs bo...,church filled song,0.664867
2,contradiction,church choir sings masses sing joyous songs bo...,choir singing baseball game,0.56327
3,neutral,woman green headscarf blue shirt big grin,woman young,0.309873
4,entailment,woman green headscarf blue shirt big grin,woman happy,0.511601


In [42]:
# Work from a copy of the scored test frame and combine it with its embeddings
# into the final dataset.
df_test_final = df_test.copy()
processed_test_data = preprocess_data(
    df=df_test_final,
    embedded_hypothesis=embeddings_hypothesis_test,
    embedded_premise=embeddings_premise_test,
)
processed_test_data.head()

Unnamed: 0,premise,hypothesis,label,premise_embedded,hypothesis_embedded,cosine_score
0,church choir sings masses sing joyous songs bo...,church cracks ceiling,neutral,"[0.050062012, 0.01916108, 0.009861431, 0.02031...","[-0.025226736, 0.042158153, 0.1098506, -0.0467...",0.342046
1,church choir sings masses sing joyous songs bo...,church filled song,entailment,"[0.050061963, 0.01916108, 0.009861442, 0.02031...","[-0.03397217, 0.045834888, 0.023162806, -0.018...",0.664867
2,church choir sings masses sing joyous songs bo...,choir singing baseball game,contradiction,"[0.050061963, 0.01916108, 0.009861442, 0.02031...","[0.029779846, 0.062454868, -0.040203903, -0.09...",0.56327
3,woman green headscarf blue shirt big grin,woman young,neutral,"[-0.03839451, 0.034623183, 0.047964565, -0.011...","[-0.015306637, 0.0011524818, -0.01608309, -0.0...",0.309873
4,woman green headscarf blue shirt big grin,woman happy,entailment,"[-0.03839451, 0.034623183, 0.047964565, -0.011...","[0.016856875, 0.0070528723, -0.025405552, 0.00...",0.511601


### Save data to pickle files


In [43]:
def save_to_pickle(df, filename):
  """Pickle ``df`` to ``filename``, creating the destination directory if needed.

  Args:
    df: Object exposing ``to_pickle`` (a pandas DataFrame in this notebook).
    filename: Destination. When it is a ``str`` or path-like object, any missing
      parent directories are created first; any other value (e.g. an open file
      object) is passed through to ``to_pickle`` untouched.
  """
  import os  # function-scope import keeps this cell runnable on its own

  if isinstance(filename, (str, os.PathLike)):
    parent_dir = os.path.dirname(os.fspath(filename))
    # An empty parent means "current directory", which always exists.
    if parent_dir:
      os.makedirs(parent_dir, exist_ok=True)
  df.to_pickle(filename)

In [46]:
# Persist the three processed splits to the SBERT dataset folder on Drive, one pickle each.
for split_frame, destination in (
    (processed_train_data, '/content/drive/MyDrive/ml-project-data-try/snli-sbert-dataset/train.pickle'),
    (processed_val_data, '/content/drive/MyDrive/ml-project-data-try/snli-sbert-dataset/validation.pickle'),
    (processed_test_data, '/content/drive/MyDrive/ml-project-data-try/snli-sbert-dataset/test.pickle'),
):
    save_to_pickle(split_frame, destination)