In [25]:
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
import pickle
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Mount Google Drive so that files under /content/drive can be read and written below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
def display_num_rows_columns(df):
    """Print the number of rows and the number of columns of ``df``.

    Reads ``df.shape`` and prints its first entry as the row count and its
    second entry as the column count, each on its own line. Returns ``None``.
    """
    dims = df.shape
    row_count = dims[0]
    print(f"number of rows: {row_count}")
    col_count = dims[1]
    print(f"number of columns: {col_count}")

Load the data from Drive

In [27]:
# Read the three MNLI CSV files from the mounted Drive folder:
# the training split, the "matched" file (used as validation) and the
# "mismatched" file (used as test).
mnli_data_train = pd.read_csv("/content/drive/MyDrive/ml-project-data-try/mnli_data_train.csv")
mnli_data_val = pd.read_csv("/content/drive/MyDrive/ml-project-data-try/mnli_data_val_test_matched.csv")
mnli_data_test = pd.read_csv("/content/drive/MyDrive/ml-project-data-try/mnli_data_val_test_mismatched.csv")
# Report the size of the training frame and preview its first rows.
display_num_rows_columns(mnli_data_train)
mnli_data_train.head()

number of rows: 120000
number of columns: 4


Unnamed: 0,label,genre,premise,hypothesis
0,contradiction,telephone,uh real life,find real
1,entailment,fiction,promise blue grass training suh,promise get blue grass training
2,neutral,government,company plan eventually achieving quantum leap...,many company s plans hindered lack resources
3,entailment,travel,elda noted excellent wine lace making,elda excellent wine
4,contradiction,slate,"one thousand, nine hundred and eighty-six sued...",ian hamilton s biography published without cha...


In [28]:
# Report the size of the validation frame and preview its first rows.
display_num_rows_columns(mnli_data_val)
mnli_data_val.head()

number of rows: 9815
number of columns: 4


Unnamed: 0,label,genre,premise,hypothesis
0,neutral,slate,new rights nice enough,everyone really likes newest benefits
1,contradiction,government,site includes list award winners searchable da...,government executive articles housed website a...
2,entailment,telephone,uh know mixed emotions uh sometimes like times...,like part would still enjoy seeing someone beat
3,contradiction,telephone,yeah think favorite restaurant always one clos...,favorite restaurants always least hundred mile...
4,contradiction,telephone,know um lot camping,know exactly


In [29]:
# Report the size of the test frame and preview its first rows.
display_num_rows_columns(mnli_data_test)
mnli_data_test.head()

number of rows: 9832
number of columns: 4


Unnamed: 0,label,genre,premise,hypothesis
0,contradiction,letters,contribution helped make possible us provide s...,contributions help students education
1,contradiction,verbatim,answer nothing however simple fact dictionarie...,dictionaries indeed exercises bi unique substi...
2,entailment,verbatim,serve classic tuscan meal includes florentine ...,serve meal florentine terrine
3,contradiction,letters,months ago carl newton wrote letter asking con...,carl newton never previous contact
4,entailment,facetoface,earth know lived earth reason know yet,yet know reason lived earth


Create a function that reads a GloVe embeddings file and builds a dictionary mapping each word to its embedding vector

In [30]:
def load_glove_embeddings(embedding_dim, glove_path=None):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-blank line of the file is expected to hold a token followed by
    ``embedding_dim`` whitespace-separated numbers.

    Parameters
    ----------
    embedding_dim : int
        Number of values per embedding vector. When ``glove_path`` is not
        given it also selects the default file
        ``glove.6B.{embedding_dim}d.txt`` in the project Drive folder.
    glove_path : str or path-like, optional
        Explicit location of the embeddings file. Defaults to the project
        Drive location used by this notebook.

    Returns
    -------
    dict
        Maps each word to a 1-D ``numpy`` float array of length
        ``embedding_dim``.

    Raises
    ------
    ValueError
        If a non-blank line does not contain exactly ``embedding_dim``
        values after the token, or a value cannot be parsed as a float.
    """
    if glove_path is None:
        glove_path = f'/content/drive/MyDrive/ml-project-data-try/glove.6B.{embedding_dim}d.txt'

    embedding_dict = {}
    with open(glove_path, encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            values = line.split()
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no embedding; skip them instead of failing on values[0].
            if not values:
                continue
            # Fail early on a truncated or mismatched row rather than storing
            # a vector of the wrong length.
            if len(values) != embedding_dim + 1:
                raise ValueError(
                    f"{glove_path}, line {line_number}: expected {embedding_dim} "
                    f"values after the token, found {len(values) - 1}"
                )
            word = values[0]
            vector = np.asarray(values[1:], dtype=float)
            embedding_dict[word] = vector
    return embedding_dict

Before moving forward, it is essential to remove rows containing null values from the datasets in order to avoid errors in later steps

In [31]:
def remove_null_from_df(df, name):
    """Print per-column null counts of ``df`` and return it without null rows.

    ``name`` is only used in the printed header. The returned frame has every
    row containing at least one null value dropped and a fresh 0-based index;
    ``df`` itself is not modified.
    """
    print(f"Number of null rows in {name}")
    null_counts = df.isnull().sum()
    print(null_counts)
    without_nulls = df.dropna()
    return without_nulls.reset_index(drop=True)

In [32]:
# Print the per-column null counts of each split and replace the split with
# its null-free, re-indexed version.
mnli_data_train = remove_null_from_df(mnli_data_train, "MNLI train dataset")
mnli_data_val = remove_null_from_df(mnli_data_val,  "MNLI validation dataset")
mnli_data_test = remove_null_from_df(mnli_data_test,  "MNLI test dataset")

Number of null rows in MNLI train dataset
label           0
genre           0
premise       294
hypothesis    267
dtype: int64
Number of null rows in MNLI validation dataset
label          0
genre          0
premise       27
hypothesis    17
dtype: int64
Number of null rows in MNLI test dataset
label          0
genre          0
premise       24
hypothesis    18
dtype: int64


As we can see, each dataset contained some rows with null values in the premise or hypothesis columns; these rows have now been removed

In [33]:
# Join premise and hypothesis of every training row into one string so the
# vocabulary is counted over both columns.
text_data = mnli_data_train['premise'] + ' ' + mnli_data_train['hypothesis']
# Tokenizer without a word limit, used here only to measure the vocabulary size
tokenizer_unique_words = Tokenizer()
# Build the word index from the combined training text
tokenizer_unique_words.fit_on_texts(text_data)
# word_index has one entry per distinct word seen while fitting
print("Number of unique words in train dataframe:", len(tokenizer_unique_words.word_index))

Number of unique words in train dataframe: 55356


As we can see, there are about 55k unique words in our training data. However, the Tokenizer vocabulary is usually limited to roughly 20–25k words in order to keep model running time reasonable and avoid wasting computational resources. For this reason, we will work with the 27.5k most frequent words in our training data.

In [34]:
# Number of training pairs remaining after null removal.
num_samples = len(mnli_data_train)
# One "premise hypothesis" string per training row. Column-wise string
# concatenation gives the same strings as a row-wise apply without running a
# Python-level function call for every row.
corpus = (mnli_data_train['premise'] + ' ' + mnli_data_train['hypothesis']).tolist()

Create an embedding matrix, which will be used for vectorizing our dataframe

In [35]:
def embedding_matrix(corpus, glove_embedding, embedding_dim, vocab_size=27500,
                     save_path='/content/drive/MyDrive/ml-project-data-try/embedding_matrix.pickle'):
    """Fit a Keras tokenizer on ``corpus`` and build the matching GloVe weight matrix.

    Parameters
    ----------
    corpus : list of str
        Texts used to fit the tokenizer.
    glove_embedding : dict
        Maps a word to its embedding vector (length ``embedding_dim``).
    embedding_dim : int
        Number of columns of the weight matrix.
    vocab_size : int, optional
        Passed to ``Tokenizer(num_words=...)``. Defaults to 27500.
    save_path : str or None, optional
        Where the matrix is pickled. Pass ``None`` to skip writing the file.
        Defaults to the project Drive location.

    Returns
    -------
    (numpy.ndarray, Tokenizer)
        The weight matrix of shape ``(len(word_index) + 1, embedding_dim)``
        and the fitted tokenizer.
    """
    # Initialize and fit the Keras tokenizer that converts words to integers
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(corpus)

    # Row i of the weight matrix holds the vector of the word whose integer
    # id is i; row 0 stays all-zero (word ids start at 1, 0 is the padding id).
    # NOTE: word_index lists every word seen while fitting, regardless of
    # num_words, so the matrix has a row for the full vocabulary. Only ids
    # below vocab_size are emitted by texts_to_sequences; the extra rows are
    # kept so the matrix shape stays the same for existing consumers of the
    # pickled file.
    word_index = tokenizer.word_index
    embed_matrix = np.zeros((len(word_index) + 1, embedding_dim))

    for word, ind in word_index.items():
        # Words missing from GloVe keep their all-zero row
        embedding_vector = glove_embedding.get(word)

        if embedding_vector is not None:
            embed_matrix[ind] = embedding_vector

    # Store the matrix as a pickle file so later notebooks can load it
    # without rebuilding it from the GloVe file
    if save_path is not None:
        with open(save_path, "wb") as file:
            pickle.dump(embed_matrix, file)

    return embed_matrix, tokenizer

# Length of each GloVe vector; also selects which glove.6B.<dim>d.txt file is read.
embedding_dim = 200
# Word -> vector dictionary read from the GloVe file.
embedding_dict = load_glove_embeddings(embedding_dim)
# Weight matrix plus the tokenizer fitted on the training corpus; the
# tokenizer is reused below to turn text into integer sequences.
embed_matrix, tokenizer = embedding_matrix(corpus, embedding_dict, embedding_dim)

The main part of this notebook is to put the data into the format the models need. First, the premise and hypothesis have to be vectorized so that the models can work with the input. In addition, all vectors have to be of the same size, which we set to a length of 65. For this, pad_sequences is used: if a vector is shorter than 65, zeroes are appended until its length is exactly 65. Finally, the 'label' column also has to be vectorized, using the 'to_categorical' function.

In [36]:
def preprocess_data(df, text_tokenizer=None):
    """Turn a frame of NLI text pairs into one-hot labels and padded id sequences.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have the columns ``'label'``, ``'premise'`` and ``'hypothesis'``.
        The frame is only read; it is not modified.
    text_tokenizer : optional
        Object providing ``texts_to_sequences``. When omitted, the
        notebook-level ``tokenizer`` is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``'label'`` (one-hot array of length 3), ``'premise'`` and
        ``'hypothesis'`` (integer arrays of length 65, zero-padded at the
        end), one row per row of ``df`` in the same order.
    """
    MAX_SEQ_LEN = 65
    NUM_CLASSES = 3
    label_map = {'contradiction': 0, 'neutral': 1, 'entailment': 2}

    if text_tokenizer is None:
        # Fall back to the tokenizer fitted earlier in the notebook.
        text_tokenizer = tokenizer

    # Replace string labels with numerical labels; values that are not keys
    # of label_map are passed through unchanged. Built as a new list so the
    # caller's frame keeps its original 'label' column.
    label_ids = [label_map.get(value, value) for value in df['label']]
    one_hot_labels = to_categorical(label_ids, num_classes=NUM_CLASSES)

    def encode(column):
        # Tokenize and pad the whole column in one call; each text is handled
        # independently, so the rows match per-text calls.
        sequences = text_tokenizer.texts_to_sequences(df[column].tolist())
        return pad_sequences(sequences, maxlen=MAX_SEQ_LEN, padding='post')

    premise_matrix = encode('premise')
    hypothesis_matrix = encode('hypothesis')

    # Iterating a 2-D array yields its rows, so every cell of the resulting
    # frame holds one 1-D array.
    processed_data = list(zip(one_hot_labels, premise_matrix, hypothesis_matrix))

    return pd.DataFrame(processed_data, columns=['label', 'premise', 'hypothesis'])


Apply the preprocessing of the data to the train, validation, and testing data.

In [37]:
# Preprocess a copy so the cleaned training frame itself stays unchanged.
mnli_data_train_copy = mnli_data_train.copy()
processed_train_data = preprocess_data(mnli_data_train_copy)
# Preview the encoded labels and padded sequences.
processed_train_data.head()

Unnamed: 0,label,premise,hypothesis
0,"[1.0, 0.0, 0.0]","[3, 125, 93, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[87, 125, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[0.0, 0.0, 1.0]","[3500, 917, 1742, 702, 11592, 0, 0, 0, 0, 0, 0...","[3500, 16, 917, 1742, 702, 0, 0, 0, 0, 0, 0, 0..."
2,"[0.0, 1.0, 0.0]","[286, 312, 1223, 2826, 18768, 7135, 219, 955, ...","[22, 286, 1, 654, 16291, 1073, 433, 0, 0, 0, 0..."
3,"[0.0, 0.0, 1.0]","[13106, 1242, 983, 1217, 7578, 269, 0, 0, 0, 0...","[13106, 983, 1217, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[1.0, 0.0, 0.0]","[2, 10, 17, 6, 7, 172, 56, 7422, 2006, 2378, 1...","[14517, 9255, 1, 5475, 743, 138, 1814, 0, 0, 0..."


In [38]:
# Preprocess a copy so the cleaned validation frame itself stays unchanged.
mnli_data_val_copy = mnli_data_val.copy()
processed_val_data = preprocess_data(mnli_data_val_copy)
# Preview the encoded labels and padded sequences.
processed_val_data.head()

Unnamed: 0,label,premise,hypothesis
0,"[0.0, 1.0, 0.0]","[20, 755, 288, 151, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[265, 23, 1820, 5936, 310, 0, 0, 0, 0, 0, 0, 0..."
1,"[1.0, 0.0, 0.0]","[252, 866, 1032, 2121, 10140, 12891, 4548, 69,...","[69, 544, 2493, 3303, 3047, 175, 9896, 0, 0, 0..."
2,"[0.0, 0.0, 1.0]","[3, 4, 3507, 5864, 3, 351, 8, 129, 389, 25, 10...","[8, 104, 5, 50, 398, 1164, 301, 1728, 0, 0, 0,..."
3,"[1.0, 0.0, 0.0]","[12, 15, 894, 1342, 72, 2, 4791, 4, 4791, 62, ...","[894, 886, 72, 239, 6, 382, 121, 91, 0, 0, 0, ..."
4,"[1.0, 0.0, 0.0]","[4, 32, 37, 1387, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[4, 609, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."


In [39]:
# Preprocess a copy so the cleaned test frame itself stays unchanged.
mnli_data_test_copy = mnli_data_test.copy()
processed_test_data = preprocess_data(mnli_data_test_copy)
# Preview the encoded labels and padded sequences.
processed_test_data.head()

Unnamed: 0,label,premise,hypothesis
0,"[1.0, 0.0, 0.0]","[2757, 814, 40, 377, 55, 237, 851, 383, 693, 0...","[2130, 99, 851, 693, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
1,"[1.0, 0.0, 0.0]","[864, 136, 205, 944, 248, 10924, 1516, 616, 2,...","[1812, 10924, 1516, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,"[0.0, 0.0, 1.0]","[961, 3018, 12979, 2632, 866, 11433, 49, 5682,...","[961, 2632, 11433, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[1.0, 0.0, 0.0]","[427, 254, 11121, 12422, 908, 874, 1861, 780, ...","[11121, 12422, 31, 1604, 1997, 0, 0, 0, 0, 0, ..."
4,"[0.0, 0.0, 1.0]","[1556, 4, 624, 1556, 451, 4, 277, 0, 0, 0, 0, ...","[277, 4, 451, 624, 1556, 0, 0, 0, 0, 0, 0, 0, ..."


Save the processed data to Drive so that it can be used in the next notebooks.

In [40]:
# Write the three processed frames to CSV files in the Drive folder, without the index column.
# NOTE(review): every cell of these frames is an array, so the CSV presumably stores
# its text form and readers have to parse it back - confirm against the consuming notebooks.
processed_train_data.to_csv('/content/drive/MyDrive/ml-project-data-try/nli_train.csv', index=False)
processed_val_data.to_csv('/content/drive/MyDrive/ml-project-data-try/nli_val.csv', index=False)
processed_test_data.to_csv('/content/drive/MyDrive/ml-project-data-try/nli_test.csv', index=False)