# Procedure followed by my Master's thesis

Following this file, one can recreate my results:
- if one wants to recreate it from scratch, the 1830s section of the COHA is necessary. It must be structured identically to the example corpus in the folder called 'COHA'. Delete 'preprocessed_corpus.json', 'preprocessed_filtered_corpus.json' and all non-python files from the 'Embeddings' folder - otherwise the preexisting files will be used.
- if one wants to recreate the computing of the embeddings and anything after, no additional data is needed. Make sure 'preprocessed_corpus.json' and 'preprocessed_filtered_corpus.json' are in place. Delete any non-python files from the 'Embeddings' folder.
- if one only wants to recreate the computing of the historical VAD lexica, no additional data is needed and no original files should be deleted.

Note: if you decide to retrain the embeddings, results for SGNS, CBOW, and FastText will slightly differ due to the non-deterministic nature of these algorithms.

In [1]:
from model_definition import Model
from pathlib import Path
from correlation import calculate_correlation, compare_lexica
import pandas as pd
import random

# Global Variables

In [2]:
# Embedding types that the later cells iterate over.
list_of_embeddings = [
    'SGNS',
    'CBOW',
    'FastText',
    'PPMI',
    'SVD_PPMI',
]

# Induction algorithms that the later cells iterate over.
list_of_induction_algorithms = [
    'kNN',
    'PaRaSim',
    'RandomWalk',
    'LinearRegression',
]

# Section 4.1

# The VAD lexica

For my experiments I need to limit the VAD lexica to the vocabulary of the COHA and remove the gold standard words. In addition, to follow Hellrich et al., I have to take the words from ANEW but the values from Warriner. As a comparison, I also need to take the words from ANEW and the values from NRC VAD.

# Filtering out Gold Standard Words and Words not in the COHA

In [3]:
# Inputs used for filtering: the gold standard and the list of lemmata found in the COHA.
gold_standard_path = Path('VADLexica', 'goldEN.vad')
lemmata_in_COHA_path = Path('Embeddings', 'AAA_word_list.txt')

# Each source lexicon is paired with the file its filtered version is written to.
anew_path = Path('VADLexica', 'ANEW')
anew_filtered_path = Path('VADLexica', 'ANEW_refined_withoutGOLD_limited_to_COHA')

warriner_path = Path('VADLexica', 'Warriner_et_alemotratings.csv')
warriner_filtered_path = Path('VADLexica', 'Warriner_refined_withoutGOLD_limited_to_COHA')

nrc_vad_path = Path('VADLexica', 'NRC-VAD-Lexicon.txt')
nrc_vad_filtered_path = Path('VADLexica', 'NRC_VAD_refined_withoutGOLD_limited_to_COHA')

# Final seed lexica: filtered lexica restricted further to the ANEW word set.
warriner_path_final_version = Path('VADLexica', 'Warriner_ANEW_refined_withoutGOLD_limited_to_COHA')
nrc_vad_path_final_version = Path('VADLexica', 'NRC_VAD_ANEW_refined_withoutGOLD_limited_to_COHA')

In [4]:
def _write_filtered_lexicon(rows, target_path, allowed_words, excluded_words):
    """Write the rows of a lexicon that pass the vocabulary filter.

    A row is kept when its first element (the word) is contained in
    `allowed_words` and not contained in `excluded_words`. Kept rows are
    written tab-separated, one per line; `target_path` is overwritten.
    """
    with open(target_path, mode='w') as out_file:
        for row in rows:
            if row[0] in allowed_words and row[0] not in excluded_words:
                out_file.write('\t'.join(map(str, row)) + '\n')


# The first whitespace-separated token of each gold standard line is the word.
# Blank lines are skipped so that a trailing empty line does not raise.
GOLDSTANDARD = []
with open(gold_standard_path) as f:
    for line in f:
        if line.strip():
            GOLDSTANDARD.append(line.split()[0])

# One lemma per line.
Lemmata_in_COHA = []
with open(lemmata_in_COHA_path) as f:
    for line in f:
        Lemmata_in_COHA.append(line.strip())

# Sets make the per-row membership tests constant-time; the lists above are
# kept under their original names.
gold_standard_words = set(GOLDSTANDARD)
coha_lemmata = set(Lemmata_in_COHA)

# ANEW: whitespace-separated; fields 0, 2, 4 and 6 are kept
# (presumably word and the V/A/D means - verify against the ANEW file layout).
Anew = []
with open(anew_path, mode='r') as f:
    for line in f:
        if not line.strip():
            continue
        words = line.split()
        Anew.append([words[0], words[2], words[4], words[6]])
_write_filtered_lexicon(Anew, anew_filtered_path, coha_lemmata, gold_standard_words)

# Warriner: comma-separated; fields 1, 2, 5 and 8 are kept
# (presumably word and the V/A/D means - verify against the CSV header).
Warriner = []
with open(warriner_path, mode='r') as f:
    for line in f:
        if not line.strip():
            continue
        words = line.split(',')
        Warriner.append([words[1], words[2], words[5], words[8]])
_write_filtered_lexicon(Warriner, warriner_filtered_path, coha_lemmata, gold_standard_words)

# NRC VAD: tab-separated; each value x is rescaled to x*8+1 (so 0 -> 1 and
# 1 -> 9) and rounded to three decimals.
NRC_VAD = []
with open(nrc_vad_path, mode='r') as f:
    for line in f:
        if not line.strip():
            continue
        words = line.split('\t')
        NRC_VAD.append([words[0]] + [round(float(value) * 8 + 1, 3) for value in words[1:4]])
_write_filtered_lexicon(NRC_VAD, nrc_vad_filtered_path, coha_lemmata, gold_standard_words)


# Set to ANEW

In [6]:
# Re-read the filtered lexica written above. Note that the names are reused:
# Anew now holds plain words, Warriner / NRC_VAD hold lists of string fields.
Anew = []
Warriner = []
NRC_VAD = []

with open(anew_filtered_path) as f:
    for line in f:
        Anew.append(line.split('\t')[0])
with open(warriner_filtered_path) as f:
    for line in f:
        Warriner.append(line.split('\t'))
with open(nrc_vad_filtered_path) as f:
    for line in f:
        NRC_VAD.append(line.split('\t'))

# Set for constant-time membership tests instead of scanning the list per row.
anew_words = set(Anew)

# Keep only the rows whose word also occurs in the filtered ANEW lexicon.
for rows, target_path in ((Warriner, warriner_path_final_version),
                          (NRC_VAD, nrc_vad_path_final_version)):
    with open(target_path, mode='w') as f:
        for row in rows:
            if row[0] in anew_words:
                # The last field still carries the newline of the line it was
                # read from, so no '\n' is appended here.
                f.write('\t'.join(row))

# The Preprocessing

As the preprocessed corpus is saved, it is sufficient to call the preprocessing method once (best with 'PPMI' or 'SVD_PPMI' to obtain both the filtered and unfiltered version). After that, every new Model object will load the correct corpus without having to preprocess it again.

In [None]:
# Preprocess the corpus once through a throwaway Model instance. The notebook
# text recommends 'PPMI' or 'SVD_PPMI' here so that both the filtered and the
# unfiltered corpus version are produced. kNN and Warriner were chosen
# arbitrarily.
model = Model('SVD_PPMI', 'kNN', warriner_path_final_version, 'Warriner')
model.preprocess_corpus()
# Report the model's own flag as a quick check that preprocessing took place.
print(f"Is the corpus preprocessed now? {model.corpus_is_preprocessed}")

# The Training of the Embeddings

Similarly, each embedding has to be trained only once. Afterward, any new model will load the correct embedding.

In [None]:
# Train every embedding type in list_of_embeddings once.
for embedding in list_of_embeddings:
    # kNN and Warriner were chosen arbitrarily; only the embedding type varies.
    model = Model(embedding, 'kNN', warriner_path_final_version, 'Warriner')
    model.train_embedding()
    # Print the model's embedding_is_computed flag as a completion check.
    print(model.embedding_is_computed)

# Sections 4.2 and 4.3

# Computing all the different historical VAD lexica

In [5]:
# (path, name) pairs of the seed lexica; each pair is unpacked into the Model constructor.
list_of_lexicon_paths_and_names = [
    (warriner_path_final_version, 'Warriner'),
    (nrc_vad_path_final_version, 'NRC_VAD'),
]

In [None]:
# Induce one historical VAD lexicon for every combination of embedding,
# induction algorithm and seed lexicon.
for embedding in list_of_embeddings:
    for induction_algorithm in list_of_induction_algorithms:
        for lexicon_path_and_name in list_of_lexicon_paths_and_names:
            # Progress log: embedding, algorithm and lexicon name on separate lines.
            print(embedding, induction_algorithm, lexicon_path_and_name[1], sep='\n')
            model = Model(
                embedding,
                induction_algorithm,
                lexicon_path_and_name[0],
                lexicon_path_and_name[1],
            )
            model.induce_historical_vad_lexicon()

# Computing Correlation for the historical VAD lexica 

In [6]:
# Per dimension (V, A, D): correlation and p-value columns, followed by r_mean.
column_names = [
    column
    for dimension in 'VAD'
    for column in (f'r_{dimension}', f'p_value_{dimension}')
] + ['r_mean']

# Row labels: "<algorithm> <embedding>", grouped by algorithm with the
# embeddings in alphabetical order.
index_names = [
    f'{algorithm_label} {embedding_label}'
    for algorithm_label in ('kNN', 'PaRaSim', 'RandomWalk', 'LinReg')
    for embedding_label in ('CBOW', 'FastText', 'PPMI', 'SGNS', 'SVD_PPMI')
]

In [7]:
# Collect the Warriner-based historical lexica, one folder per induction algorithm.
warriner_files = []
for induction_algorithm in list_of_induction_algorithms:
    folder = Path('HistoricalVAD/' + induction_algorithm)
    # Path.glob yields entries in no guaranteed order. The rows are labelled
    # by position via index_names, so sort to make the order deterministic
    # (assumes the file names sort in the same embedding order as index_names
    # - verify against the files in HistoricalVAD).
    warriner_files.extend(sorted(folder.glob('Warriner*')))

# One correlation result per file, evaluated against the gold standard.
warriner_correlation = [calculate_correlation(gold_standard_path, path) for path in warriner_files]
warriner_correlation_df = pd.DataFrame(warriner_correlation, columns=column_names, index=index_names)

In [8]:
# Collect the NRC-VAD-based historical lexica, one folder per induction algorithm.
nrc_vad_files = []
for induction_algorithm in list_of_induction_algorithms:
    folder = Path('HistoricalVAD/' + induction_algorithm)
    # Path.glob yields entries in no guaranteed order. The rows are labelled
    # by position via index_names, so sort to make the order deterministic
    # (assumes the file names sort in the same embedding order as index_names
    # - verify against the files in HistoricalVAD).
    nrc_vad_files.extend(sorted(folder.glob('NRC_VAD*')))

# One correlation result per file, evaluated against the gold standard.
nrc_vad_correlation = [calculate_correlation(gold_standard_path, path) for path in nrc_vad_files]
nrc_vad_correlation_df = pd.DataFrame(nrc_vad_correlation, columns=column_names, index=index_names)


# Results for Warriner based lexica (Section 4.2)

In [9]:
# Display the correlation table of the Warriner-based historical lexica.
warriner_correlation_df

Unnamed: 0,r_V,p_value_V,r_A,p_value_A,r_D,p_value_D,r_mean
kNN CBOW,0.4706,0.0,0.3479,0.0004,0.1688,0.0933,0.3291
kNN FastText,0.5391,0.0,0.373,0.0001,0.243,0.0149,0.385
kNN PPMI,0.4543,0.0,0.3223,0.0011,0.2342,0.019,0.337
kNN SGNS,0.4786,0.0,0.4154,0.0,0.1991,0.0471,0.3643
kNN SVD_PPMI,0.4227,0.0,0.3267,0.0009,0.1498,0.1369,0.2997
PaRaSim CBOW,0.2511,0.0117,0.1908,0.0572,0.2017,0.0442,0.2145
PaRaSim FastText,0.4791,0.0,0.3826,0.0001,0.1739,0.0836,0.3452
PaRaSim PPMI,0.4743,0.0,0.384,0.0001,0.1835,0.0676,0.3473
PaRaSim SGNS,0.5223,0.0,0.4035,0.0,0.202,0.0439,0.3759
PaRaSim SVD_PPMI,0.1236,0.2203,-0.0542,0.5921,0.0942,0.3512,0.0545


# Results for NRC-VAD based lexica (Section 4.3)

In [10]:
# Display the correlation table of the NRC-VAD-based historical lexica.
nrc_vad_correlation_df

Unnamed: 0,r_V,p_value_V,r_A,p_value_A,r_D,p_value_D,r_mean
kNN CBOW,0.475,0.0,0.3612,0.0002,0.0741,0.4637,0.3034
kNN FastText,0.4871,0.0,0.4641,0.0,0.0332,0.7433,0.3281
kNN PPMI,0.511,0.0,0.3832,0.0001,-0.0044,0.9657,0.2966
kNN SGNS,0.5225,0.0,0.3978,0.0,0.1122,0.2663,0.3442
kNN SVD_PPMI,0.4625,0.0,0.4182,0.0,0.0113,0.9114,0.2973
PaRaSim CBOW,0.2191,0.0285,0.2202,0.0277,0.0381,0.7067,0.1591
PaRaSim FastText,0.4794,0.0,0.401,0.0,0.0608,0.5482,0.3137
PaRaSim PPMI,0.4936,0.0,0.3831,0.0001,0.0422,0.677,0.3063
PaRaSim SGNS,0.5397,0.0,0.4029,0.0,0.0978,0.3332,0.3468
PaRaSim SVD_PPMI,0.0379,0.7084,-0.0762,0.4512,-0.0087,0.9318,-0.0157


# Section 4.4

Here the seed word lexica of increasing size are generated (with random subsamples, see sections 4.1 and 4.4 in the MA thesis)

In [34]:
def get_sub_lexica(n_of_w, lexicon_path, lexicon_name, n_samples=50, seed=42):
    """Write random sub-lexica of a fixed size drawn from one base lexicon.

    The base lexicon is read from `lexicon_path` (tab-separated: word and
    three numeric values per line). `n_samples` sub-lexica of `n_of_w` entries
    each are drawn without replacement and written, tab-separated, to
    'VADLexica/<lexicon_name><n_of_w>/<lexicon_name>_<n_of_w>_<k>' for
    k = 1..n_samples.

    Parameters:
        n_of_w: number of entries per sub-lexicon.
        lexicon_path: path of the base lexicon file.
        lexicon_name: prefix used for the output folder and file names.
        n_samples: number of sub-lexica to draw (default 50).
        seed: value passed to random.seed before sampling (default 42), so
            repeated calls produce the same sub-lexica.

    Raises:
        ValueError: if `n_of_w` is negative or exceeds the lexicon size.
    """
    # Read the lexicon once; the last field may carry the newline, which float() tolerates.
    base_lexicon = []
    with open(lexicon_path) as lexicon_file:
        for line in lexicon_file:
            fields = line.split('\t')
            base_lexicon.append([fields[0], float(fields[1]), float(fields[2]), float(fields[3])])

    # Fail before anything is written if the requested size is impossible.
    if not 0 <= n_of_w <= len(base_lexicon):
        raise ValueError(
            f'Cannot sample {n_of_w} entries from a lexicon with {len(base_lexicon)} entries'
        )

    # The output folder does not depend on the sample, so create it once.
    save_dir = Path('VADLexica/' + lexicon_name + str(n_of_w) + '/')
    save_dir.mkdir(parents=True, exist_ok=True)

    # Seeds the module-level generator so the same sub-lexica are drawn on every call.
    random.seed(seed)
    for sample_number in range(1, n_samples + 1):
        chosen_indices = random.sample(range(len(base_lexicon)), k=n_of_w)
        save_path = save_dir / f'{lexicon_name}_{n_of_w}_{sample_number}'
        with open(save_path, mode='w') as out_file:
            for index in chosen_indices:
                out_file.write('\t'.join(map(str, base_lexicon[index])) + '\n')

We need the Warriner/NRC-VAD lexica that only have COHA words, but no gold standard words. We computed them above, see Section 4.1.

In [35]:
# Source: warriner_filtered_path ('VADLexica/Warriner_refined_withoutGOLD_limited_to_COHA').
# Sub-lexicon sizes 2000, 3000, ..., 8000.
for i in range(2000, 9000, 1000):
    get_sub_lexica(i, warriner_filtered_path, 'Warriner')

In [36]:
# Source: nrc_vad_filtered_path ('VADLexica/NRC_VAD_refined_withoutGOLD_limited_to_COHA').
# Sub-lexicon sizes 2000, 3000, ..., 10000.
for i in range(2000, 11000, 1000):
    get_sub_lexica(i, nrc_vad_filtered_path, 'NRC_VAD')

Now we need to compute all historical VAD lexica. We only do this with the three models that performed best in Sections 4.2 and 4.3.

In [13]:
# (embedding, induction algorithm) pairs used for the Warriner-based runs.
best_models_warriner = [
    ('FastText', 'kNN'),
    ('FastText', 'LinearRegression'),
    ('SGNS', 'LinearRegression'),
]

# (embedding, induction algorithm) pairs used for the NRC-VAD-based runs.
best_models_nrc_vad = [
    ('FastText', 'LinearRegression'),
    ('SGNS', 'LinearRegression'),
    ('SGNS', 'PaRaSim'),
]

In [9]:
def compute_all(lexicon_name, best_models):
    """Induce a historical VAD lexicon for every sub-lexicon and every model.

    Looks in 'VADLexica/' for folders named '<lexicon_name><digit>...' and,
    inside each, for files starting with `lexicon_name`. For each such file
    and each (embedding, algorithm) pair in `best_models`, a Model is built
    and `induce_historical_vad_lexicon` is called with the save path
    '<embedding>_<algorithm>/<sub-lexicon folder name>'.
    """
    sub_lexicon_folders = sorted(Path('VADLexica/').glob(f'{lexicon_name}[0-9]*'))
    print(sub_lexicon_folders)

    for sub_lexicon_folder in sub_lexicon_folders:
        print(sub_lexicon_folder)
        for sub_lexicon_path in sorted(sub_lexicon_folder.glob(f'{lexicon_name}*')):
            # Paths here are always 'VADLexica/<folder>/<file>', so the file
            # and folder names are the last two path components.
            sub_lexicon_name = sub_lexicon_path.name
            folder_name = sub_lexicon_path.parent.name
            for model_spec in best_models:
                model = Model(*model_spec, sub_lexicon_path, sub_lexicon_name)
                save_path = f"{model_spec[0]}_{model_spec[1]}/{folder_name}"
                print(save_path)
                print(sub_lexicon_name)
                model.induce_historical_vad_lexicon(save_path)

In [None]:
# Induce historical lexica from all Warriner sub-lexica with the models in best_models_warriner.
compute_all('Warriner', best_models_warriner)

In [None]:
# Induce historical lexica from all NRC-VAD sub-lexica with the models in best_models_nrc_vad.
compute_all('NRC_VAD', best_models_nrc_vad)

Let us compute all correlations and the averages of the correlations.

In [14]:
def compare_all(lexicon_name, best_models):
    """Run compare_lexica on every result folder of the given models.

    For each (embedding, algorithm) pair in `best_models`, the folders
    'HistoricalVAD/<embedding>_<algorithm>/<lexicon_name><digit>...' are
    visited in sorted order and compared against the gold standard.

    Returns a flat list with one compare_lexica result per folder, ordered
    by model first and by folder name second.
    """
    results = []
    for model_spec in best_models:
        model_dir = Path(f'HistoricalVAD/{model_spec[0]}_{model_spec[1]}/')
        results += [
            compare_lexica(result_folder, gold_standard_path)
            for result_folder in sorted(model_dir.glob(f'{lexicon_name}[0-9]*'))
        ]
    return results

In [15]:
results_warriner = compare_all('Warriner', best_models_warriner)

# Row labels '<embedding><algorithm><size>' for the first three models and
# the sizes 2000..8000, in the same model-major order as the results.
index_names_warriner = [
    best_models_warriner[model_position][0]
    + best_models_warriner[model_position][1]
    + str(thousands * 1000)
    for model_position in range(3)
    for thousands in range(2, 9)
]

results_warriner_df = pd.DataFrame(results_warriner, columns=column_names, index=index_names_warriner)
results_warriner_df

Unnamed: 0,r_V,p_value_V,r_A,p_value_A,r_D,p_value_D,r_mean
FastTextkNN2000,0.43312,0.000208,0.318086,0.008218,0.168552,0.166494,0.30659
FastTextkNN3000,0.448616,4e-05,0.351348,0.002086,0.191316,0.098672,0.330424
FastTextkNN4000,0.46701,4e-06,0.362154,0.000976,0.208002,0.058976,0.34573
FastTextkNN5000,0.495046,0.0,0.391458,0.000146,0.238648,0.027396,0.375044
FastTextkNN6000,0.490898,0.0,0.41985,2.2e-05,0.26094,0.015552,0.39056
FastTextkNN7000,0.48506,0.0,0.43495,6e-06,0.265996,0.009784,0.395336
FastTextkNN8000,0.483036,0.0,0.440278,0.0,0.251774,0.01252,0.391698
FastTextLinearRegression2000,0.493096,0.0,0.378768,0.000968,0.229132,0.041308,0.367
FastTextLinearRegression3000,0.5185,0.0,0.419766,4.8e-05,0.241226,0.020514,0.393178
FastTextLinearRegression4000,0.528978,0.0,0.427164,2.2e-05,0.239472,0.019338,0.398538


In [16]:
# DataFrame.to_csv does not create missing folders, so make sure the target exists.
correlations_dir = Path('HistoricalVAD/Correlations')
correlations_dir.mkdir(parents=True, exist_ok=True)

# The rows are grouped per model in blocks of seven (sizes 2000..8000);
# each block goes into its own summary file.
results_warriner_df.iloc[:7].to_csv(correlations_dir / 'Summary_FastText_kNN_Warriner.csv')
results_warriner_df.iloc[7:14].to_csv(correlations_dir / 'Summary_FastText_LinReg_Warriner.csv')
results_warriner_df.iloc[14:].to_csv(correlations_dir / 'Summary_SGNS_LinReg_Warriner.csv')

In [17]:
results_nrc_vad = compare_all('NRC_VAD', best_models_nrc_vad)

# Row labels per model: '<embedding><algorithm>10000' first, then the sizes
# 2000..9000 with a leading '0' (presumably mirroring the sorted folder order
# in which 'NRC_VAD10000' comes before 'NRC_VAD2000' - verify against compare_all).
index_names_nrc_vad_0, index_names_nrc_vad_1, index_names_nrc_vad_2 = (
    [label_prefix + '10000']
    + [label_prefix + '0' + str(thousands * 1000) for thousands in range(2, 10)]
    for label_prefix in (
        best_models_nrc_vad[model_position][0] + best_models_nrc_vad[model_position][1]
        for model_position in range(3)
    )
)

index_names_nrc_vad = index_names_nrc_vad_0 + index_names_nrc_vad_1 + index_names_nrc_vad_2

results_nrc_vad_df = pd.DataFrame(results_nrc_vad, columns=column_names, index=index_names_nrc_vad)
results_nrc_vad_df

Unnamed: 0,r_V,p_value_V,r_A,p_value_A,r_D,p_value_D,r_mean
FastTextLinearRegression10000,0.579294,0.0,0.452906,0.0,0.225348,0.024298,0.41918
FastTextLinearRegression02000,0.53551,0.0,0.41092,0.000116,0.208544,0.048144,0.384994
FastTextLinearRegression03000,0.549906,0.0,0.42273,3.2e-05,0.211336,0.041018,0.39466
FastTextLinearRegression04000,0.562234,0.0,0.432832,6e-06,0.215132,0.036008,0.403402
FastTextLinearRegression05000,0.56917,0.0,0.440802,4e-06,0.22181,0.029446,0.410592
FastTextLinearRegression06000,0.578146,0.0,0.449608,0.0,0.223628,0.026608,0.417118
FastTextLinearRegression07000,0.574346,0.0,0.446344,0.0,0.222234,0.027424,0.414302
FastTextLinearRegression08000,0.576388,0.0,0.448926,0.0,0.22265,0.02672,0.415984
FastTextLinearRegression09000,0.579076,0.0,0.44996,0.0,0.223534,0.026034,0.417524
SGNSLinearRegression10000,0.57453,0.0,0.494756,0.0,0.20691,0.039446,0.4254


In [26]:
# DataFrame.to_csv does not create missing folders, so make sure the target exists.
correlations_dir = Path('HistoricalVAD/Correlations')
correlations_dir.mkdir(parents=True, exist_ok=True)

# The rows are grouped per model in blocks of nine (sizes 2000..10000);
# each block goes into its own summary file.
results_nrc_vad_df.iloc[:9].to_csv(correlations_dir / 'Summary_FastText_LinReg_NRC_VAD.csv')
results_nrc_vad_df.iloc[9:18].to_csv(correlations_dir / 'Summary_SGNS_LinReg_NRC_VAD.csv')
results_nrc_vad_df.iloc[18:].to_csv(correlations_dir / 'Summary_SGNS_PaRaSim_NRC_VAD.csv')