In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

## Step 1: Merge surprisal from different models into one single file

In [2]:
# read in surprisal files
df_lstm = pd.read_csv('surprisal.lstm.test_set.csv')
df_srn = pd.read_csv('surprisal.srn.test_set.csv')
print(len(df_srn))

4876


### Surprisal file building
This file is expected to save surprisal from all trained models. 

Columns of the file are supposed to be: **word information** (i.e., word, sentence number, and word position), **model** type (SRN/LSTM), training **step** (aka 9 different corpus sizes), the corresponding **surprisal** of the word which a model type with a specific amount of training produces, and **mean word frequency**.

First, columns of **word**, **model+step**, and **surprisal** are built.

In [4]:
# the cell is for SRN specially
colnames_srn = df_srn.columns.tolist()[3:]
df_melted_srn = pd.melt(df_srn, id_vars=["word"], value_vars=colnames_srn, var_name='model_step', value_name='surprisal')
df_melted_srn.head()

Unnamed: 0,word,model_step,surprisal
0,anne,SRN_1_1000,9.825153
1,lost,SRN_1_1000,10.303036
2,control,SRN_1_1000,10.17767
3,and,SRN_1_1000,4.205298
4,laughed,SRN_1_1000,10.960527


In [5]:
# LSTM
colnames_lstm = df_lstm.columns.tolist()[3:]
df_melted_lstm = pd.melt(df_lstm, id_vars=['word'], value_vars=colnames_lstm, var_name='model_step', value_name='surprisal')

Second, **model+step** is split into two columns **model** and **step**. Sentence number (**sent_nr**), word position in sentence (**word_pos**), and **word** are added as well.

In [6]:
# the information of each word (sent_nr, word_pos) in the test items is read and recorded
# Note that word information is identical across models, so here word info from only the original file that contains surprisal from all 
# lstm models is read.
df_wordInfo = df_lstm[["sent_nr", "word_pos", "word"]]

In [13]:
# build a new dataframe that saves all the columns specified
df_new = pd.DataFrame(columns=["sent_nr","word_pos","word","model","step","surprisal"])

In [14]:
# save SRN-generated surprisals, with word info, model and step
for i in range(len(df_melted_srn)):
    word = df_melted_srn["word"][i]
    model_step = df_melted_srn["model_step"][i]
    info = model_step.split("_")
    model = info[0]
    step = info[2]
    surprisal = df_melted_srn["surprisal"][i]
    
    # find the basic info
    ind = i % 4876
    sent_nr = df_wordInfo["sent_nr"][ind]
    word_pos = df_wordInfo["word_pos"][ind]
    df_new.loc[i] = [sent_nr,word_pos,word,model,step,surprisal]

In [15]:
# result (SRN)
len(df_new)

43884

In [16]:
# add lstm surprisals into the dataframe as well
for i in range(len(df_melted_lstm)):
    word = df_melted_lstm["word"][i]
    model_step = df_melted_lstm["model_step"][i]
    info = model_step.split("_")
    model = info[0]
    step = info[2]
    surprisal = df_melted_lstm["surprisal"][i]
    
    # find the basic info
    ind = i % 4876
    sent_nr = df_wordInfo["sent_nr"][ind]
    word_pos = df_wordInfo["word_pos"][ind]
    df_new.loc[i+43884] = [sent_nr,word_pos,word,model,step,surprisal]

In [17]:
df_new.tail() # now surprisal from all models included in the dataframe

Unnamed: 0,sent_nr,word_pos,word,model,step,surprisal
87763,328,34,getting,LSTM,8773568,6.256627
87764,328,35,covered,LSTM,8773568,10.130459
87765,328,36,in,LSTM,8773568,2.114593
87766,328,37,oil,LSTM,8773568,9.603283
87767,328,38,.,LSTM,8773568,0.692924


Now, given that I would also like to use mean word frequency later (that is, include **word_freq** as a control variable in LMER), frequency of each word is added as a separate column by the code below.

In [18]:
# get mean frequency of each word in the test set
df_freq = pd.read_csv('C:\\Users\\rondin\\Desktop\\Courses\\Computational Psycholinguistics\\Assignment\\Assignment 2\\encow14ax.freq0_w\\encow14ax.freq0_w.tsv', sep='\t')

In [19]:
df_freq.tail() # example content in the frequency file; f_logpermil+3.98131241406 was picked as word_freq

Unnamed: 0,f_raw,rank_abs,f_permil,f_logpermil+3.98131241406,f_logpermil+10,band,token...
19422964,1,35416,0.000104,0.0,6.018688,29,^×
19422965,1,35416,0.000104,0.0,6.018688,29,^~^~^~^~
19422966,1,35416,0.000104,0.0,6.018688,29,^~^~^
19422967,1,35416,0.000104,0.0,6.018688,29,^~^~
19422968,1,35416,0.000104,0.0,6.018688,29,^^$£££


In [20]:
# add word_freq as a column and form a new dataframe

df_all = pd.DataFrame(columns=["sent_nr","word_pos","word","model","step","surprisal","word_freq"])

tokens = []
for i in range(len(df_lstm)):
    token = df_new["word"][i]
    #print(i) # printed i as output to inspect the progress...
    if token in tokens:
        continue
    else:
        rows_token = df_new[(df_new["word"] == token)]
        
        token_depunc = token.replace("'","")
        freq = df_freq[(df_freq["token..."] == token_depunc)]["f_logpermil+3.98131241406"].values[0] # chose log10freq+realnum
        rows_token["word_freq"] = freq
        
        df_all = pd.concat([df_all, rows_token])
        
        tokens.append(token)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rows_token["word_freq"] = freq


In [21]:
# new dataframe visualisation
df_all.tail()

Unnamed: 0,sent_nr,word_pos,word,model,step,surprisal,word_freq
68262,328,37,oil,LSTM,100000,9.130299,5.992917
73138,328,37,oil,LSTM,300000,8.398074,5.992917
78014,328,37,oil,LSTM,1000000,9.375603,5.992917
82890,328,37,oil,LSTM,3000000,9.936421,5.992917
87766,328,37,oil,LSTM,8773568,9.603283,5.992917


In [22]:
df_all = df_all.sort_index()

In [23]:
# save the new dataframe as a file
df_all.to_csv("surprisal_all.csv") 

## Step 2: LMER datafile creation (a separate file for each of the 2\*9=18 models)

In this step I integrate surprisal (and frequency) with the self-paced reading dataset. Word surprisal and frequency is matched with the dataset by sentence number and word position. I create a datafile for each of the 18 models separately.

Note that apart from surprisal of a word, surprisal of the preceding two words are also added to control for the spillover effect in the self-paced reading data. Word length is also calculated and added as a column Each produced datafile thus includes columns of:

"**subj_nr**","**sent_nr**","**sent_pos**","correct","answer_time","**word_pos**","**word**","**RT**" (which are available already in the dataset)

and 

"model","step","**surprisal**","**surprisal_-1**","**surprisal_-2**","**word_freq**","**wordLen**" (which are additionally added to the datafile)

First, as _sent_nr_ in **surprisal_all.csv** is not the original number of sentence stimuli but rather was created by 
_get_surp.py_, I'll have to first add the real stimulus index of each sentence 
(as **sent_ind**) to enable further integration.

In [24]:
baseloc = "C:\\Users\\rondin\\Desktop\\Courses\\Computational Psycholinguistics\\Assignment\\Assignment 2"

In [25]:
# read in the surprisal file created
file = baseloc + "\\surprisal_all.csv"
df = pd.read_csv(file)
df.head()

Unnamed: 0.1,Unnamed: 0,sent_nr,word_pos,word,model,step,surprisal,word_freq
0,0,0,0,anne,SRN,1000,9.825153,3.034628
1,1,0,1,lost,SRN,1000,10.303036,6.128433
2,2,0,2,control,SRN,1000,10.17767,6.317902
3,3,0,3,and,SRN,1000,4.205298,8.363556
4,4,0,4,laughed,SRN,1000,10.960527,5.005579


In [26]:
# read in sentence stimulus indices saved beforehand
indices = baseloc + "\\items\\test_set_indices.csv"
df_indices = pd.read_csv(indices)
# example visualisation
df_indices[df_indices["sent_nr_real"] == 51]

Unnamed: 0,sentence,sent_nr_real,sent_nr
48,"powerful in the wrong hands , is that .",51,48


In [29]:
# add read-in indcies to the datafranme
sent_ind = []
for i in range(len(df)):
    sent_nr = df["sent_nr"][i]
    sent_nr_real = df_indices["sent_nr_real"][sent_nr]
    #print(sent_nr_real)
    sent_ind.append(sent_nr_real)

df["sent_ind"] = sent_ind
df.head()

Unnamed: 0.1,Unnamed: 0,sent_nr,word_pos,word,model,step,surprisal,word_freq,sent_ind
0,0,0,0,anne,SRN,1000,9.825153,3.034628,1
1,1,0,1,lost,SRN,1000,10.303036,6.128433,1
2,2,0,2,control,SRN,1000,10.17767,6.317902,1
3,3,0,3,and,SRN,1000,4.205298,8.363556,1
4,4,0,4,laughed,SRN,1000,10.960527,5.005579,1


In [47]:
df.iloc[1:5]["step"]

1    1000
2    1000
3    1000
4    1000
Name: step, dtype: int64

Second, the dataset is read in. Word surprisal (including those of the preceding two words) and frequency are then integrated wih the dataset.

In [32]:
# read in the dataset
SPR = baseloc + "\\Part 1\\selfpacedreading.RT.csv"

df_rt = pd.read_csv(SPR)
df_rt.head()

Unnamed: 0,subj_nr,sent_nr,sent_pos,correct,answer_time,word_pos,word,RT
0,1,2,12,c,3630.0,1,Billy,376
1,1,2,12,c,3630.0,2,wrote,364
2,1,2,12,c,3630.0,3,on,394
3,1,2,12,c,3630.0,4,the,353
4,1,2,12,c,3630.0,5,envelope.,354


In [54]:
# a function to integrate all columns together in a single dataframe

def gen_datafile(model, step):
    df_ms = pd.DataFrame(columns=["subj_nr","sent_nr","sent_pos","correct","answer_time","word_pos","word","RT","model","step","surprisal","surprisal_-1","surprisal_-2","word_freq","wordLen"])
    
    df_surp = df[(df["model"] == model) & (df["step"] == step)]
    df_surp = df_surp.reset_index(drop=True)
    
    i = 0
    print(df_surp.head())
    sent_prev = df_surp["sent_ind"][i]
    punc_num = 0
    while i < len(df_surp):
        sent_nr = df_surp["sent_ind"][i]
        if sent_prev != sent_nr:
            punc_num = 0
            sent_prev = sent_nr
        #print(sent_nr)
        word_pos = df_surp["word_pos"][i] 
        word_pos_rt = word_pos + 1 - punc_num
        #print(word_pos)
        #print(punc_num)
        word = df_surp["word"][i]
        #print(word)

        i += 1
        if word in string.punctuation:
            continue
        else:
            df_word = df_rt[(df_rt["sent_nr"] == sent_nr) & (df_rt["word_pos"] == word_pos_rt)] # each single word from all participants
            #print(df_word.head())
            word_freq = df_surp[(df_surp["sent_ind"] == sent_nr) & (df_surp["word_pos"] == word_pos)]["word_freq"].to_list()[0]
            word_in_rt = df_word["word"].to_list()[0]
            #print(word_in_rt)
            surp0 = df_surp[(df_surp["sent_ind"] == sent_nr) & (df_surp["word_pos"] == word_pos)]["surprisal"].to_list()[0]
            surp_1 = df_surp[(df_surp["sent_ind"] == sent_nr) & (df_surp["word_pos"] == word_pos-1)]["surprisal"].to_list()
            surp_2 = df_surp[(df_surp["sent_ind"] == sent_nr) & (df_surp["word_pos"] == word_pos-2)]["surprisal"].to_list()
            surp_m1 = np.NaN
            surp_m2 = np.NaN
            if len(surp_1) != 0:
                surp_m1 = surp_1[0]
            if len(surp_2) != 0:
                surp_m2 = surp_2[0]
            if word_in_rt[-1] in string.punctuation: # when it comes to the last word, must skip puncs in the next step
                surp0 = np.NaN
                punc_num += 1

            df_word["model"] = model
            df_word["step"] = step
            df_word["surprisal"] = surp0
            df_word["surprisal_-1"] = surp_m1
            df_word["surprisal_-2"] = surp_m2
            df_word["word_freq"] = word_freq
            df_word["wordLen"] = len(word)

            df_ms = pd.concat([df_ms, df_word])
    return df_ms

In [55]:
import string

In [None]:
# use a for-loop to generate datafiles of all models in an automatic manner
models = ["LSTM","SRN"]
steps = [1000,3000,10000,30000,100000,300000,1000000,3000000,8773568]

for model in models:
    for step in steps:
        df_new = gen_datafile(model, step)
        df_new = df_new.sort_index()
        
        data = df_new.drop(df_new[(np.isnan(df_new.surprisal) == 1) | (np.isnan(df_new["surprisal_-1"]) == 1) | (np.isnan(df_new["surprisal_-2"]) == 1)].index)
        data = data.sort_index()
        data.to_csv('data_{0}_{1}.csv'.format(model, step))