In [1]:
import json
import os
import re
import sys
import warnings

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Silence every warning for the rest of the session (this also hides
# deprecation notices from pandas / nltk).
warnings.filterwarnings('ignore')
# Print NumPy arrays in full instead of abbreviating long ones with "...".
np.set_printoptions(threshold=sys.maxsize)

In [11]:
# Locations of the raw train / test / validation splits
trainpath, testpath, valpath = '../../train.csv', '../../test.csv', '../../val.csv'

# Read each split and keep only the id, sentence-pair and score columns,
# in that order
traindata = pd.read_csv(trainpath)[['id', 's1', 's2', 'score']]
testdata = pd.read_csv(testpath)[['id', 's1', 's2', 'score']]
valdata = pd.read_csv(valpath)[['id', 's1', 's2', 'score']]

In [12]:
def remove_punctuation(text):
    """Delete every character that is neither a word character nor whitespace.

    Letters, digits and underscores are kept, as is all whitespace; anything
    else (hyphens, quotes, periods, ...) is dropped without inserting a space,
    so the characters on either side are joined together.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', text)

def remove_number(text):
    """Replace each run of consecutive digits with the placeholder 'num'.

    Despite the name, digits are substituted rather than removed. No
    whitespace is added around the placeholder, so digits glued to letters
    produce a merged token (for example 'abc123' becomes 'abcnum').

    Args:
        text: The string to process.

    Returns:
        The string with every digit run replaced by 'num'.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('num', text)

def replace_url(text):
    """Replace http/https/ftp URLs with the placeholder 'url'.

    A URL is only recognised when its host contains at least one dot, so
    something like 'http://localhost' is left untouched.

    Args:
        text: The string to process.

    Returns:
        The string with every recognised URL replaced by 'url'.
    """
    url_pattern = re.compile(
        r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'
    )
    return url_pattern.sub('url', text)

def replace_hashtags(text):
    """Replace '#' followed by ASCII letters/digits with the placeholder 'hashtag'.

    Matching stops at the first character that is not an ASCII letter or
    digit, so an underscore ends the hashtag ('#foo_bar' -> 'hashtag_bar').

    Args:
        text: The string to process.

    Returns:
        The string with every hashtag replaced by 'hashtag'.
    """
    hashtag_pattern = re.compile(r'#[a-zA-Z\d]+')
    return hashtag_pattern.sub('hashtag', text)

def replace_email(text):
    """Replace e-mail addresses with the placeholder 'email'.

    The local part (before '@') may contain word characters, '.', '+' and
    '-'; the domain part may contain word characters, '.' and '-'. This is a
    superset of the previous pattern, which rejected digits, underscores and
    hyphens in the local part and therefore left addresses such as
    'john_doe99@mail-server.com' unrecognised.

    A bare '@handle' with nothing directly in front of the '@' is not
    treated as an e-mail address and is left for mention handling.

    Args:
        text: The string to process.

    Returns:
        The string with every recognised address replaced by 'email'.
    """
    return re.sub(r'[\w.+-]+@[\w.-]+', 'email', text)

def replace_mentions(text):
    """Replace '@' followed by a handle with the placeholder 'mention'.

    The handle may consist of ASCII letters, digits, dots and underscores.

    Args:
        text: The string to process.

    Returns:
        The string with every mention replaced by 'mention'.
    """
    mention_pattern = re.compile(r'@[a-zA-Z\.\d_]+')
    return mention_pattern.sub('mention', text)

In [13]:
# Stop words, Lemmatization and Stemming
# NOTE(review): these presumably need the NLTK 'stopwords' and 'wordnet'
# corpora to be available locally — confirm for a fresh environment.
# English stop words held in a set; preprocess_text tests membership per token.
stop_words = set(stopwords.words('english'))
# Single lemmatizer / stemmer instances shared by every preprocess_text call.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [14]:
def preprocess_text(text):
    """Turn a raw sentence into a list of normalized, stemmed tokens.

    Steps, in order:
      1. Replace URLs, hashtags, e-mail addresses and mentions with the
         placeholders produced by the replace_* helpers.
      2. Strip punctuation, then replace digit runs with 'num'.
      3. Lower-case and split on whitespace.
      4. Drop English stop words, then lemmatize and stem each token.

    Args:
        text: The raw sentence as a string.

    Returns:
        A list of processed token strings (possibly empty).
    """
    # The placeholder substitutions must run while '://', '#' and '@' are
    # still in the text: stripping punctuation first removes exactly the
    # characters their patterns depend on, so they could never match.
    text = replace_url(text)
    text = replace_hashtags(text)
    # E-mails go before mentions; otherwise the '@domain' part of an address
    # would be consumed as a mention.
    text = replace_email(text)
    text = replace_mentions(text)
    # Punctuation is removed before digits are replaced so that values such
    # as '3.14' or '1,000' collapse to a single 'num' token.
    text = remove_punctuation(text)
    text = remove_number(text)
    # convert to lower case
    tokens = text.lower().split()
    # remove stop words, then lemmatize and stem what remains
    return [
        stemmer.stem(lemmatizer.lemmatize(token))
        for token in tokens
        if token not in stop_words
    ]

def process_data(data):
    """Tokenize both sentence columns of a dataset in place.

    Each value in the 's1' and 's2' columns is replaced by the token list
    returned from preprocess_text. The passed frame itself is modified and
    also returned.

    Args:
        data: A DataFrame with 's1' and 's2' columns.

    Returns:
        The same DataFrame object, with 's1' and 's2' overwritten.
    """
    for column in ('s1', 's2'):
        data[column] = data[column].apply(preprocess_text)
    return data
    
def get_vocab(data):
    """Collect the set of distinct tokens found in the 's1' and 's2' columns.

    Args:
        data: A DataFrame whose 's1' and 's2' cells are iterables of tokens.

    Returns:
        A set containing every token that occurs in either column.
    """
    vocab = set()
    for column in ('s1', 's2'):
        for tokens in data[column]:
            vocab.update(tokens)
    return vocab

In [15]:
# Tokenize every split. process_data overwrites the s1/s2 columns, so this
# cell is not idempotent: running it a second time would feed token lists
# back into preprocess_text.
traindata = process_data(traindata)
testdata = process_data(testdata)
valdata = process_data(valdata)

# Create vocab and word2idx
# The vocabulary comes from the training split only; sorting the words makes
# the word -> index assignment deterministic.
vocab = get_vocab(traindata)
word2idx = {word: idx for idx, word in enumerate(sorted(vocab))}

file_path = 'data/word2idx.json'

# Make sure the output directory exists before writing into it
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Save the dictionary to a JSON file
with open(file_path, 'w') as file:
    json.dump(word2idx, file)

In [16]:
# Store the processed datasets
# NOTE(review): s1/s2 hold Python lists after processing, so they are
# presumably written as their string representation and must be parsed
# again when these files are read back — confirm with the consumers.
for frame, destination in (
    (traindata, 'data/train.csv'),
    (testdata, 'data/test.csv'),
    (valdata, 'data/val.csv'),
):
    frame.to_csv(destination, index=False)

In [17]:
# Number of distinct tokens in the vocabulary built from the training split
print(len(vocab))

8597


In [18]:
# Preview the first five processed training rows
print(traindata.head(5))

                id                                                 s1  \
0   sts_train_4024            [sudan, block, youtub, antiislam, film]   
1   sts_train_1409                           [man, ride, white, hors]   
2   sts_train_3397  [mr, mors, charg, assault, mr, darvish, charg,...   
3   sts_train_1532                    [girl, play, pile, color, ball]   
4  sick_train_1874           [person, black, jacket, trick, motorbik]   

                                                  s2  score  
0  [pakistan, pm, order, youtub, halt, antiislam,...   0.56  
1                         [woman, lead, white, hors]   0.36  
2  [partner, bijan, darvish, charg, file, fals, p...   0.55  
3              [littl, girl, play, pit, color, ball]   1.00  
4              [man, black, jacket, trick, motorbik]   0.98  
