In [68]:
import os
import sys
import pandas as pd
import json
import re
import emoji
import gensim

module_path = os.path.abspath(os.path.join('../../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from weat_wefat.src.lib import weat
from weat_wefat.src.lib import weat_tables
import importlib
importlib.reload(weat)
importlib.reload(weat_tables)

<module 'weat_wefat.src.lib.weat_tables' from '/Users/adimaini/Documents/GW/Research/CODE.nosync/weat_wefat/src/lib/weat_tables.py'>

In [2]:
TX_PATH = '../../data/us_corpus/texas tweets.jsonl'
NY_PATH = '../../data/us_corpus/NY tweets.jsonl'

In [3]:
# reading in the json files 
tx_data = []
ny_data = []
with open(TX_PATH, 'r') as data:
    for line in data: 
        tx_data.append(json.loads(line))

with open(NY_PATH, 'r') as data:
    for line in data:
        ny_data.append(json.loads(line))

In [4]:
def combined_full_text(json_object):
    corpus = [tweet['full_text'] for tweet in json_object]
    corpus = ' '.join(corpus).lower()
    return corpus

def delete_links(text): 
    return re.sub(r'http\S+', '', text)

def delete_hashtags(text):
    return re.sub(r'#(\w+)', '', text)

def delete_mentions(text):
    return re.sub(r'@(\w+)', '', text)

def delete_emojies(text):
    new_text = re.sub(emoji.get_emoji_regexp(), r"", text)
    return new_text

def delete_new_line_chars(text):
    return " ".join(text.splitlines())

def delete_multiple_spaces(text):
    return " ".join(text.split())

def replace_amp(text):
    return text.replace('&amp', '&')

In [5]:
tx_filtered = combined_full_text(tx_data)
tx_filtered = delete_links(tx_filtered)
tx_filtered = delete_hashtags(tx_filtered)
tx_filtered = delete_mentions(tx_filtered)
tx_filtered = delete_emojies(tx_filtered)
tx_filtered = delete_new_line_chars(tx_filtered)
tx_filtered = delete_multiple_spaces(tx_filtered)
tx_filtered = replace_amp(tx_filtered)

In [6]:
ny_filtered = combined_full_text(ny_data)
ny_filtered = delete_links(ny_filtered)
ny_filtered = delete_hashtags(ny_filtered)
ny_filtered = delete_mentions(ny_filtered)
ny_filtered = delete_emojies(ny_filtered)
ny_filtered = delete_new_line_chars(ny_filtered)
ny_filtered = delete_multiple_spaces(ny_filtered)
ny_filtered = replace_amp(ny_filtered)

In [7]:
def write_filtered_file(filtered_text, state):
    '''write filtered files to disk'''
    with open('../../data/us_corpus/' + state + '_filtered_sentences', 'w') as text_file:
        text_file.write(filtered_text)
    print ('Written successfully.')

In [8]:
write_filtered_file(tx_filtered, 'tx')
write_filtered_file(ny_filtered, 'ny')

Written successfully.
Written successfully.


### 814k words in TX dataset and 835k words in NY dataset

In [9]:
len(tx_filtered.split(' '))

814825

In [10]:
len(ny_filtered.split(' '))

835521

We need filters to get rid of: <br>
1. links - done 
2. emojies - done 
3. new line characters and other characters - done 
4. multiple spaces - done

Create word embeddings. 

Processing steps: 
1. Lowercase all words
2. Filtering out tweets
3. skipgram to create one word out of common 2 or 3 word sets. 

In [11]:
def split_string_to_list_sentences(filtered):
    new_text = [text.split() for text in filtered.split('.')]
    # to delete any empty lists in nested list
    new_text = [text for text in new_text if text != []]
    return new_text

In [12]:
# splitting string to list of lists
tx_filtered_sentences = split_string_to_list_sentences(tx_filtered)
ny_filtered_sentences = split_string_to_list_sentences(ny_filtered)

In [23]:
len(tx_model.wv.vocab)

9993

In [24]:
len(ny_model.wv.vocab)

10661

Need to make sure the vocabulary is the same between the words. There are 9993 words in tx model, and 10661 words in ny model. There are 7822 words that these models share. I expect in larger embeddings, this % would even go up. 

In [42]:
# these are words that are available in both tx_model and ny_model
common_words = [word for word in ny_model.wv.vocab.keys() if word in tx_model.wv.vocab.keys()]

In [29]:
len(common_words)

7822

In [30]:
# how to get vectors
tx_model.wv.get_vector('the')

array([-4.43189532e-01, -3.57736737e-01,  2.92256419e-02, -4.65842456e-01,
       -4.55699563e-01,  2.13127941e-01, -4.35562432e-01, -2.30017945e-01,
       -2.38073066e-01, -1.30285770e-01, -4.27194275e-02,  1.96370352e-02,
       -1.81675449e-01, -7.11368620e-01, -1.09932117e-01,  3.08783501e-01,
        2.36191407e-01,  3.84288758e-01,  4.92314845e-02,  2.14052096e-01,
       -5.03125668e-01, -3.61591995e-01,  5.67432880e-01,  1.01042753e-02,
       -3.98735821e-01, -1.31534874e-01, -2.33123928e-01, -3.87776792e-01,
       -1.26577124e-01,  4.03291792e-01,  6.67448491e-02,  9.89265144e-02,
       -9.27431881e-02,  1.78483441e-01,  6.18473649e-01,  9.40632403e-01,
       -2.27907434e-01,  2.45621607e-01,  5.85861981e-01, -5.04048645e-01,
       -1.91295892e-01, -1.40505508e-01, -1.04945667e-01, -3.72069143e-02,
       -1.47140503e-01, -3.85369837e-01, -3.51273149e-01, -6.17517419e-02,
        6.69078112e-01,  7.42375478e-02,  5.41767776e-01,  3.92344087e-01,
        8.37463140e-02, -

In [40]:
keywords = ['covid', 'gatherings', 'biden', 'trump']

In [56]:
weat_tables.get_weac_scores(tx_model, ny_model, keywords, common_words)

{'covid': (-0.2695125, 0.0),
 'gatherings': (-0.82634616, nan),
 'biden': (0.30987576, 0.15865525393145707),
 'trump': (0.05320857, 0.8413447460685429)}

In [59]:
weat_tables.get_weac_scores(tx_model, ny_model, keywords, common_words)

{'covid': -0.2695125,
 'gatherings': -0.15620407,
 'biden': 0.8273751,
 'trump': 0.10322266}

In [61]:
weat_tables.get_weac_scores(tx_model, ny_model, keywords, common_words)

{'covid': -0.2695125,
 'gatherings': -0.15620407,
 'biden': 0.8273751,
 'trump': 0.10322266}

In [63]:
weat_tables.get_weac_scores(tx_model, ny_model, keywords, common_words)

{'covid': (-0.2695125, 4.360706351569377e-09),
 'gatherings': (-0.82541734, 1.944000516118649e-13),
 'biden': (0.30987576, nan),
 'trump': (0.05320857, nan)}

In [65]:
weat_tables.get_weac_scores(tx_model, ny_model, keywords, common_words)

{'covid': [-0.2695125, 2.6122481955326293e-10],
 'gatherings': [-0.82556057, 0.0],
 'biden': [0.30987576, nan],
 'trump': [0.05320857, nan]}

In [67]:
#1k permutation
weat_tables.get_weac_scores(tx_model, ny_model, keywords, common_words)

{'covid': [-0.2695125, 0.0],
 'gatherings': [-0.82634616, 0.15865525393145707],
 'biden': [0.30987576, 0.8413447460685429],
 'trump': [0.05320857, 0.8413447460685429]}

In [70]:
# with the objects seperated out in get_weac_scores method, with 100 permutations
weat_tables.get_weac_scores(tx_model, ny_model, keywords, common_words)

{'covid': [-0.2695125, 0.0],
 'gatherings': [-0.82634616, nan],
 'biden': [0.30987576, 0.15865525393145707],
 'trump': [0.05320857, 0.8413447460685429]}

In [44]:
import numpy as np

In [47]:
#p-value implementation
np.random.choice([keywords, common_words])

  np.random.choice([keywords, common_words])


['covid', 'gatherings', 'biden', 'trump']

In [50]:
np.array(common_words)

array(['david', 'he’s', 'better!', ..., 'hornets', 'chaotic', 'clots'],
      dtype='<U19')

In [13]:
tx_model = gensim.models.Word2Vec(tx_filtered_sentences, size=300)

In [14]:
ny_model = gensim.models.Word2Vec(ny_filtered_sentences, size=300)

In [124]:
tx_model.wv.save_word2vec_format('../../data/us_corpus/tx_model.bin')
ny_model.wv.save_word2vec_format('../../data/us_corpus/ny_model.bin')

In [129]:
ny_model['covid-19']

  ny_model['covid-19']


array([-2.2915635 ,  0.7186643 , -1.5424136 , -0.04538518, -0.24888055,
       -0.40306202,  0.118222  ,  0.16242756, -0.5333023 ,  0.1405754 ,
       -0.28758627,  0.30425525,  0.17389886,  0.4812034 , -0.5652642 ,
       -0.13374685,  0.04039047, -1.0925348 ,  0.65523934,  0.574105  ,
       -1.3769137 ,  2.074731  , -0.3665593 ,  0.4485164 ,  0.66561246,
       -1.2455931 ,  0.41868168,  0.8937702 ,  0.21526366,  0.48193038,
       -0.635199  ,  1.3892021 , -0.7057326 ,  0.41512582, -0.19966726,
        0.91938317, -0.69361055,  0.2523994 ,  0.33905455, -2.5985143 ,
        0.85690194,  0.5459583 ,  0.34257257,  0.01718077, -0.51665187,
       -0.69949925, -0.14731999, -0.46116993,  0.58202285, -0.23700841,
       -0.7670063 ,  0.08025296,  0.33924827, -0.45737845, -0.7951688 ,
       -0.51602703,  0.5054371 ,  0.53024817, -0.90056646, -0.03129069,
       -0.40291408,  0.43689084, -0.99245226,  1.1860844 , -0.8199448 ,
       -0.35487548, -1.3389012 ,  0.98210317,  0.07150035,  1.58

In [126]:
len(tx_model.wv.vocab)

9993

In [127]:
len(ny_model.wv.vocab)

10661

Running WEAT tests on these two sets of embeddings

In [143]:
df_tx = weat_tables.output_table(model=tx_model,
                                filepath = '../../targets_attributes_data/covid related/covid pleasant unpleasant.csv',
                                algorithm = 'wefat')

Reading files...

Finished.


In [144]:
df_ny = weat_tables.output_table(model=ny_model,
                                filepath = '../../targets_attributes_data/covid related/covid pleasant unpleasant.csv',
                                algorithm = 'wefat')

Reading files...

Finished.


In [145]:
df_tx

Unnamed: 0,Target,Effect Size,P-Value
0,gatherings,-0.2,0.658578
1,mask,0.74,0.071157
2,distancing,-0.11,0.585854
3,restrictions,-0.34,0.752556
4,quarantine,0.14,0.40002
5,parties,-0.23,0.680815
6,covid,-0.59,0.8801
7,vaccine,-0.95,0.970741
8,indoors,0.4,0.220365


In [146]:
df_ny

Unnamed: 0,Target,Effect Size,P-Value
0,gatherings,-0.38,0.772922
1,mask,0.62,0.109307
2,distancing,0.0,0.498338
3,restrictions,-0.56,0.864817
4,quarantine,0.12,0.405224
5,parties,-0.65,0.906243
6,covid,-0.84,0.954958
7,vaccine,-1.22,0.99258
8,indoors,-0.25,0.694034
