In [0]:
import time
import re
import numpy as np
import pandas as pd
from sklearn.neighbors import DistanceMetric
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import drive
# Mount Google Drive at /content/drive; every data path used below lives under that mount point.
drive.mount('/content/drive')
# Seed constant. NOTE(review): no cell shown in this notebook reads it — confirm it is still needed.
random_state = 7

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Paths to the raw training CSV files downloaded from U Rochester's site.
# Two training sources are read:
#   - the base training data file 'train.csv'
#   - the extra training data file 'train_funlines.csv'
dfs_train_path = [
    "/content/drive/My Drive/ipython notebooks/COSC-572/semeval-2020-task-7-data/data/task-1/train.csv",
    "/content/drive/My Drive/ipython notebooks/COSC-572/semeval-2020-task-7-data/extra-training-data/task-1/train_funlines.csv",
]

# Test data file 'truth_task_1.csv'.
dfs_test_path = [
    "/content/drive/My Drive/ipython notebooks/COSC-572/semeval-2020-task-7-data/test_data/truth_task_1.csv",
]

# One DataFrame per path, in the same order as the path lists.
# Comprehensions keep the loop temporaries out of the kernel namespace, so no
# later cell can accidentally pick up a stale `df` left over from loading.
dfs_train = [pd.read_csv(csv_path) for csv_path in dfs_train_path]
dfs_test = [pd.read_csv(csv_path) for csv_path in dfs_test_path]
    


In [0]:
# NOTE(review): this cell repeats the import and mount already performed in the first cell;
# its recorded output ("Drive already mounted ...") shows it had no effect when it was run.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Load the GloVe embedding file into `glove`, a dict mapping each token (the
# first whitespace-separated field of a line) to a NumPy array built from the
# remaining fields parsed as floats.
# This usually takes several minutes.
glove = {}
# `with` closes the file handle as soon as parsing finishes or fails (it was
# previously left open for the lifetime of the kernel). The encoding is stated
# explicitly so decoding does not depend on the platform's locale default.
with open('/content/drive/My Drive/ipython notebooks/Embeddings/GloVe/glove.42B.300d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        splitLine = line.split()
        if not splitLine:
            # A blank line has no token; skip it instead of raising IndexError.
            continue
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        glove[word] = embedding

In [0]:
# Matches the marked span "<.../>" inside a string; group 2 is the text between "<" and "/>".
# `.*` is greedy, so if a string contained several "/>" the match would run to the last one.
orig_word_re = re.compile(r"(\<)(.*)(\/\>)")
# Euclidean metric object; its .pairwise() is used later to compare two word vectors.
euclidean_distance_measure = DistanceMetric.get_metric('euclidean')

In [0]:
def _marked_word_position(original, orig_word):
    """Return the index, among the whitespace-split tokens of `original`, of the marked word.

    The marked word normally appears as the single token '<word/>'. When the
    marked span itself contains whitespace (for example '<Hillary Clinton/>'),
    split() breaks it into ['<Hillary', 'Clinton/>'], so the position of the
    first piece ('<Hillary') is returned instead.

    Raises ValueError if neither form is found among the tokens.
    """
    tokens = original.split()
    try:
        return tokens.index('<' + orig_word + '/>')
    except ValueError:
        return tokens.index('<' + orig_word.split()[0])


def _embedding_distances(orig_word, edited_word):
    """Return (euclidean distance, cosine similarity) between two words' GloVe vectors.

    Both words are lower-cased before being looked up in `glove`. If either
    lookup misses, (0.0, 0.0) is returned; these are the values that comparing
    two all-zero vectors yields, so no placeholder vectors (whose length would
    have to match the embedding) are built.
    NOTE(review): out-of-vocabulary rows are therefore indistinguishable from
    genuinely zero results — revisit if that matters downstream.
    """
    try:
        vectors = [glove[orig_word.lower()], glove[edited_word.lower()]]
    except KeyError:
        return 0.0, 0.0
    euclid_dist = euclidean_distance_measure.pairwise(vectors)[0, 1]
    cos_sim = cosine_similarity(vectors)[0, 1]
    return euclid_dist, cos_sim


# Add six derived columns to every training and test frame, in place:
#   original_word, word_position, edited_sentence, orig_sent_no_bracket,
#   euclidean_dist, cosine_similarity
for df in dfs_train + dfs_test:
    original_words = []
    word_positions = []
    edited_sentences = []
    plain_sentences = []
    euclid_dists = []
    cos_sims = []

    for original, edited_word in zip(df['original'], df['edit']):
        # Group 2 of the pattern is the text between '<' and '/>'.
        orig_word = re.search(orig_word_re, original)[2]
        original_words.append(orig_word)

        word_positions.append(_marked_word_position(original, orig_word))

        # A callable replacement inserts the text literally; passing the string
        # itself would make re.sub interpret backslashes / group references
        # that happen to occur in the data.
        edited_sentences.append(orig_word_re.sub(lambda _match: edited_word, original))
        plain_sentences.append(orig_word_re.sub(lambda _match: orig_word, original))

        euclid_dist, cos_sim = _embedding_distances(orig_word, edited_word)
        euclid_dists.append(euclid_dist)
        cos_sims.append(cos_sim)

    # Whole-column assignment is positional, so it does not depend on the
    # frame's index labels matching row positions, and the numeric columns get
    # their final dtype directly instead of starting as int and being upcast.
    df['original_word'] = original_words
    df['word_position'] = word_positions
    df['edited_sentence'] = edited_sentences
    df['orig_sent_no_bracket'] = plain_sentences
    df['euclidean_dist'] = euclid_dists
    df['cosine_similarity'] = cos_sims



In [0]:
# Stack all prepared training frames into one and write it to Drive.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# each source frame keeps its own labels, so labels repeat across sources.
# The CSV is unaffected because it is written with index=False.
df_train_full = pd.concat(dfs_train, ignore_index=True)
df_train_full.to_csv('/content/drive/My Drive/ipython notebooks/COSC-572/full_prepped_training.csv', index=False)


In [0]:
# Write the first (and, per dfs_test_path, only) prepared test frame to Drive, without the index column.
dfs_test[0].to_csv('/content/drive/My Drive/ipython notebooks/COSC-572/prepped_test.csv', index=False)