In [None]:
import spacy
from spacy.tokenizer import Tokenizer

In [None]:
# Download the 'en_core_web_lg' model
!python -m spacy download en_core_web_lg

# Load the model
nlp = spacy.load('en_core_web_lg')
tokenizer = Tokenizer(nlp.vocab)


2023-11-16 07:03:30.164550: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-16 07:03:30.164601: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-16 07:03:30.164624: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting en-core-web-lg==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.6.0/en_core_web_lg-3.6.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully inst

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [None]:
def _read_lines(path):
    """Return every line of the UTF-8 text file at `path`, line endings kept."""
    with open(path, 'r', encoding='utf-8') as handle:
        return handle.readlines()


# Raw lines of each split; the header row is still the first element.
train = _read_lines("/content/train.tsv")
dev = _read_lines("/content/dev.tsv")
test = _read_lines("/content/test.tsv")

In [None]:
def load(file):
  """Parse the lines of a tab-separated paraphrase file into columns.

  The first line is treated as a header and skipped. Every following line is
  split on tabs and its first five fields are collected.

  Args:
    file: list of text lines (for example the result of `readlines()`),
      header line first.

  Returns:
    dict mapping 'quality', 'id1', 'id2', 'sentence1' and 'sentence2' to
    lists of strings, one entry per data line. All values stay strings,
    including 'quality'.

  Raises:
    IndexError: if a non-empty data line has fewer than five tab-separated
      fields.
  """
  Quality, ID1, ID2, string1, string2 = [], [], [], [], []
  for line in file[1:]:
    # Remove the line terminator before splitting; otherwise it stays attached
    # to the last field and every sentence2 ends with a stray newline.
    line = line.rstrip('\r\n')

    # A blank line (typically at the end of the file) carries no record.
    if not line:
      continue

    col = line.split('\t')
    Quality.append(col[0])

    ID1.append(col[1])
    ID2.append(col[2])

    string1.append(col[3])
    string2.append(col[4])

  df = {'quality': Quality, 'id1': ID1, 'id2': ID2, 'sentence1': string1, 'sentence2': string2}
  return df

In [None]:
# Parse each split in turn and wrap the resulting column dict in a DataFrame.
frames = []
for raw_lines in (train, dev, test):
    df = load(raw_lines)
    frames.append(pd.DataFrame(df))

train_df, dev_df, test_df = frames

In [None]:
# Label counts for the training split.
print("Train Target Distribution", train_df['quality'].value_counts(), sep="\n")

Train Target Distribution
1    2407
0    1169
Name: quality, dtype: int64


In [None]:
# Label counts for the validation split.
print("Val Target Distribution", dev_df['quality'].value_counts(), sep="\n")

Val Target Distribution
1    346
0    154
Name: quality, dtype: int64


In [None]:
# Label counts for the test split.
print("Test Target Distribution", test_df['quality'].value_counts(), sep="\n")

Test Target Distribution
1    1147
0     578
Name: quality, dtype: int64


# **Tokenization**

In [None]:
print('tokenize sentence1 and sentence2...')


def _token_texts(sentence):
    """Return the text of every token the tokenizer produces for `sentence`."""
    return [token.text for token in tokenizer(sentence)]


# Add a 'tokenized_<column>' list column next to each raw sentence column.
for frame in (train_df, dev_df, test_df):
    for source_col in ('sentence1', 'sentence2'):
        frame['tokenized_' + source_col] = frame[source_col].apply(_token_texts)

tokenize sentence1 and sentence2...


In [None]:
# Preview the first five training rows, including the new token-list columns.
train_df.head(5)

Unnamed: 0,quality,id1,id2,sentence1,sentence2,tokenized_sentence1,tokenized_sentence2
0,1,702876,702977,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...","[Amrozi, accused, his, brother,, whom, he, cal...","[Referring, to, him, as, only, ""the, witness"",..."
1,0,2108705,2108831,Yucaipa owned Dominick's before selling the ch...,Yucaipa bought Dominick's in 1995 for $693 mil...,"[Yucaipa, owned, Dominick's, before, selling, ...","[Yucaipa, bought, Dominick's, in, 1995, for, $..."
2,1,1330381,1330521,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...","[They, had, published, an, advertisement, on, ...","[On, June, 10,, the, ship's, owners, had, publ..."
3,0,3344667,3344648,"Around 0335 GMT, Tab shares were up 19 cents, ...","Tab shares jumped 20 cents, or 4.6%, to set a ...","[Around, 0335, GMT,, Tab, shares, were, up, 19...","[Tab, shares, jumped, 20, cents,, or, 4.6%,, t..."
4,1,1236820,1236712,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,"[The, stock, rose, $2.11,, or, about, 11, perc...","[PG&E, Corp., shares, jumped, $1.63, or, 8, pe..."


# Encoding

In [None]:
from gensim.models import KeyedVectors

In [None]:
import nltk
# Fetch the NLTK stop-word corpus so that `stopwords.words(...)` can be used later.
nltk.download('stopwords')

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Location of the gzipped binary word2vec file that is loaded with gensim below.
# NOTE(review): the path looks like a mounted Google Drive folder — confirm the
# Drive is mounted before running.
EMBEDDING_FILE = '/content/drive/MyDrive/ColabNotebooks/GoogleNews-vectors-negative300.bin.gz'

In [None]:
# Load the pretrained vectors; binary=True selects the binary word2vec format.
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

In [None]:
# Vocabulary state used by the encoding loops:
#   stops              - English stop words from NLTK
#   inverse_vocabulary - id -> word list; slot 0 is the '<none>' placeholder
#   voc_dict           - word -> id mapping, filled while encoding
stops = set(stopwords.words('english'))
inverse_vocabulary = ['<none>']
voc_dict = {}

In [None]:
# Turn every token list in the training frame into a list of vocabulary ids,
# growing the vocabulary as unseen words appear (row by row, sentence1 first).
for row_label, record in train_df.iterrows():
    for column in ('tokenized_sentence1', 'tokenized_sentence2'):
        encoded = []

        for token in record[column]:
            # Drop a token only when it is a stop word AND has no word2vec vector.
            if token in stops and token not in word2vec.key_to_index:
                continue

            token_id = voc_dict.get(token)
            if token_id is None:
                # First occurrence: the next free id is the current list length.
                token_id = len(inverse_vocabulary)
                voc_dict[token] = token_id
                inverse_vocabulary.append(token)
            encoded.append(token_id)

        # Overwrite the token strings with their ids.
        train_df.at[row_label, column] = encoded



In [None]:
def _encode_frame_in_place(frame):
    """Replace the token lists in `frame` with lists of vocabulary ids.

    Rows are visited in order, sentence1 before sentence2. Words not yet in
    `voc_dict` receive the next free id and are appended to
    `inverse_vocabulary`, so both module-level structures are extended.
    """
    for index, row in frame.iterrows():
        for col in ['tokenized_sentence1', 'tokenized_sentence2']:
            text_to_list = []

            for word in row[col]:
                # Skip stop words that have no word2vec vector.
                if (word in stops) and (word not in word2vec.key_to_index):
                    continue

                # Register the word on first sight, then record its id.
                if word not in voc_dict:
                    voc_dict[word] = len(inverse_vocabulary)
                    inverse_vocabulary.append(word)
                text_to_list.append(voc_dict[word])

            # replace tokenized sentence with numbers
            frame.at[index, col] = text_to_list


# Validation first, then test, so ids are assigned in the same order as before.
_encode_frame_in_place(dev_df)
_encode_frame_in_place(test_df)



# Embedding matrix

In [None]:
# embedding matrix
np.random.seed(42)
embedding_dim = 300

# One row per vocabulary id plus row 0, all drawn from a standard normal;
# row 0 is then zeroed.
vocab_rows = len(voc_dict) + 1
embeddings = np.random.randn(vocab_rows, embedding_dim)
embeddings[0] = 0

# Words known to word2vec get their pretrained vector; the rest keep the
# random initialisation.
known_words = word2vec.key_to_index
for token, row_id in voc_dict.items():
    if token in known_words:
        embeddings[row_id] = word2vec.get_vector(token)

In [None]:
# Preview two test rows; the token columns now hold vocabulary ids.
test_df.head(2)

Unnamed: 0,quality,id1,id2,sentence1,sentence2,tokenized_sentence1,tokenized_sentence2
0,1,1089874,1089925,"PCCW's chief operating officer, Mike Butcher, ...",Current Chief Operating Officer Mike Butcher a...,"[22607, 2273, 2809, 5482, 6009, 22608, 7963, 2...","[22611, 3187, 2814, 4030, 6009, 22612, 10256, ..."
1,1,3019446,3019327,The world's two largest automakers said their ...,Domestic sales at both GM and No. 2 Ford Motor...,"[83, 6774, 366, 598, 22614, 166, 203, 146, 132...","[22615, 1327, 71, 1975, 14946, 784, 190, 5604,..."


In [None]:
import itertools
from keras.preprocessing.sequence import pad_sequences

In [None]:
# save x and y for each data set with the quality
target = 'quality'


def _sentence_pair(frame):
    """Map 'sentence1'/'sentence2' to the frame's encoded token columns."""
    return {
        'sentence1': frame['tokenized_sentence1'],
        'sentence2': frame['tokenized_sentence2'],
    }


X_train = _sentence_pair(train_df)
X_val = _sentence_pair(dev_df)
X_test = _sentence_pair(test_df)

# Label arrays, in the same split order as the inputs above.
y_train, y_val, y_test = (
    frame[target].values for frame in (train_df, dev_df, test_df)
)

In [None]:
# zero padding
# Longest encoded sentence across all splits and both sentence columns.
max_len = max(
    frame[column].dropna().map(len).max()
    for frame in (train_df, dev_df, test_df)
    for column in ('tokenized_sentence1', 'tokenized_sentence2')
)


print(max_len)
print('zero paddings...')

# Pad every sequence with trailing zeros up to max_len. Iterating over each
# Series' own index (instead of hard-coded row counts) keeps this cell correct
# when the size of a split changes.
for inputs in (X_train, X_val, X_test):
    for key in ('sentence1', 'sentence2'):
        sequences = inputs[key]
        for i in sequences.index:
            # pad_sequences works on a batch, so wrap the single sequence in a
            # list and take the only row of the result.
            sequences[i] = pad_sequences([sequences[i]], maxlen=max_len, padding='post', truncating='post')[0]

30
zero paddings...


In [None]:
# Show one padded test sequence to check the trailing zeros.
print(X_test['sentence2'][0])

[22611  3187  2814  4030  6009 22612 10256  3187  1409  4030  7963 22613
   299  2362 22610    18     0     0     0     0     0     0     0     0
     0     0     0     0     0     0]


# Save files


In [None]:
import pickle
import gzip
import os

In [None]:
# save results to use in different file
def save_as_pickle(data, file_name, base_path='/content/drive/MyDrive/Mini-projects'):
    """Pickle `data` into a gzip-compressed file.

    Args:
        data: any picklable object.
        file_name: name of the file to create inside `base_path`.
        base_path: directory that receives the file. Defaults to the folder
            this notebook has always written to, so existing two-argument
            calls behave as before.

    Raises:
        OSError: if the target file cannot be opened for writing (for example
            when `base_path` does not exist).
    """
    file_path = os.path.join(base_path, file_name)

    with gzip.open(file_path, 'wb') as f:
        pickle.dump(data, f)

In [None]:
# Columns kept in the saved frames.
col = ['quality', 'id1', 'id2', 'sentence1',
       'sentence2', 'tokenized_sentence1', 'tokenized_sentence2']

# Frames first (train, val, test), then the embedding matrix and vocabulary.
for frame, file_name in (
    (train_df, 'train.pickle'),
    (dev_df, 'val.pickle'),
    (test_df, 'test.pickle'),
):
    save_as_pickle(frame[col], file_name)

save_as_pickle(embeddings, 'embeddings.pickle')
save_as_pickle(voc_dict, 'voc_dict.pickle')

save train...
save val...
save test...
save embeddings...
save voc_dict...


In [None]:
# Persist the padded inputs, the labels and the padding length, in that order.
for payload, file_name in (
    (X_train, 'X_train_word.pickle'),
    (X_val, 'X_val_word.pickle'),
    (X_test, 'X_test_word.pickle'),
    (y_train, 'y_train.pickle'),
    (y_val, 'y_val.pickle'),
    (y_test, 'y_test.pickle'),
    (max_len, 'max_len.pickle'),
):
    save_as_pickle(payload, file_name)

save X_train_word...
save X_val_word...
save X_test_word...
save y_train...
save y_val...
save y_test...
save max_len...
