## Getting frequency distribution of words in each COP and FFF dataset


In [1]:
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('omw-1.4')
#nltk.download('punkt')

import os
import pickle
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from numpy import asarray
from scipy.spatial.distance import jensenshannon

In [2]:
# Root folder holding one sub-folder of pickled tweet documents per dataset.
# The default is the original machine-specific location; set the
# UKRI_TWEET_DATA_DIR environment variable to run the notebook elsewhere.
# os.path.join(..., "") guarantees the trailing separator that the string
# concatenations built from `path` (e.g. path + version + "/" ...) rely on.
path = os.path.join(
    os.environ.get(
        "UKRI_TWEET_DATA_DIR",
        "/Users/ipinni/Library/CloudStorage/OneDrive-UniversityofLeeds/UKRI_Tweet_Data/completed/",
    ),
    "",
)

First get the documents, word frequencies and lemmas from each dataset

In [3]:
def preprocess(documents):
    """Tokenise, clean and lemmatise a collection of tweet texts.

    Steps, in order: lower-casing tweet tokenisation, removal of English and
    Twitter-specific stopwords, removal of punctuation tokens, WordNet
    lemmatisation, and frequency counting.

    Parameters
    ----------
    documents : sequence of str
        The texts to process; passed to ``TweetTokenizer.tokenize_sents``.
        An empty sequence is accepted and yields empty results.

    Returns
    -------
    lemmas : list of str
        The lemma of every surviving token, in document order.
    freqs : nltk.FreqDist
        Number of occurrences of each lemma in ``lemmas``.
    """
    tknzr = TweetTokenizer(preserve_case=False)
    tokenised_docs = tknzr.tokenize_sents(documents)
    # Flatten in plain Python: np.concatenate raises ValueError when there are
    # no documents, and iterating its result yields NumPy string scalars
    # instead of ordinary str objects.
    tokens = [token for doc_tokens in tokenised_docs for token in doc_tokens]

    # Tokens to drop: NLTK English stopwords, Twitter/URL boilerplate and punctuation
    stop_wordsNltk = stopwords.words('english')
    my_stopwords = ["rt","RT", "&", "amp", "&amp", "http","https", "http://", "https://", "fav", "FAV"]
    punc = list(string.punctuation) + [' ','’','“', '”', '...']
    # One set gives constant-time membership tests instead of scanning two lists per token
    excluded = set(my_stopwords) | set(stop_wordsNltk) | set(punc)
    clean = [token for token in tokens if token not in excluded]

    # Lemmatise with WordNet's default part of speech
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token) for token in clean]

    # Count how often each lemma occurs
    freqs = nltk.FreqDist(lemmas)

    return lemmas, freqs

In [4]:
def get_data(version):
    """Load one dataset's pickled documents and preprocess them.

    The documents are read from ``<path><version>/<version>docs.list`` and
    handed to ``preprocess``.

    Parameters
    ----------
    version : str
        Dataset name, used both as the sub-folder and as the file-name prefix
        (for example "COP20").

    Returns
    -------
    tuple
        ``(lemmas, freqs)`` exactly as returned by ``preprocess``.
    """
    docs_location = path + version + "/" + version + "docs.list"
    # NOTE: pickle.load can execute arbitrary code; only load trusted files
    with open(docs_location, 'rb') as docs_handle:
        documents = pickle.load(docs_handle)

    doc_lemmas, doc_freqs = preprocess(documents)
    return doc_lemmas, doc_freqs

In [5]:
# Lemmas and lemma frequency distribution for the COP20 dataset
cop20Lemmas, cop20Freqs = get_data("COP20")


In [6]:
# Load the remaining datasets in the same order as before; each get_data call
# returns a (lemmas, frequency distribution) pair unpacked into its own names.
(
    (cop21Lemmas, cop21Freqs),
    (cop22Lemmas, cop22Freqs),
    (cop23Lemmas, cop23Freqs),
    (cop24Lemmas, cop24Freqs),
    (cop25Lemmas, cop25Freqs),
    (cop26Lemmas, cop26Freqs),
    (FFF2018Lemmas, FFF2018Freqs),
    (FFF2019Lemmas, FFF2019Freqs),
    (FFF2020Lemmas, FFF2020Freqs),
    (FFF2021Lemmas, FFF2021Freqs),
) = [
    get_data(dataset_name)
    for dataset_name in (
        "COP21", "COP22", "COP23", "COP24", "COP25", "COP26",
        "FFF2018", "FFF2019", "FFF2020", "FFF2021",
    )
]

Create shared vocabulary from unique words in each dataset

In [7]:
# Iterating a frequency distribution yields its keys (the lemmas), so
# set().union() can consume the distributions directly.
COP_word_set = set().union(
    cop20Freqs, cop21Freqs, cop22Freqs, cop23Freqs,
    cop24Freqs, cop25Freqs, cop26Freqs,
)

FFF_word_set = set().union(FFF2018Freqs, FFF2019Freqs, FFF2020Freqs, FFF2021Freqs)

# Shared vocabulary across every COP and FFF dataset
all_word_set = COP_word_set.union(FFF_word_set)

Save all word set to use in network analysis

In [8]:
# Persist the shared vocabulary set as a pickle directly under the data folder
vocab_location = path + 'vocab.list'
with open(vocab_location, 'wb') as vocab_file:
    pickle.dump(all_word_set, vocab_file)

Create a dictionary mapping each word in the shared vocabulary to its number of occurrences in each dataset

In [8]:
# Frequency distributions in the fixed column order used downstream:
# positions 0-6 are COP20..COP26, positions 7-10 are FFF2018..FFF2021.
ordered_freqs = [
    cop20Freqs, cop21Freqs, cop22Freqs, cop23Freqs,
    cop24Freqs, cop25Freqs, cop26Freqs,
    FFF2018Freqs, FFF2019Freqs, FFF2020Freqs, FFF2021Freqs,
]

# Map every vocabulary word to its count in each dataset (0 when absent)
ALLcombined_dictionary = {}
for word in all_word_set:
    ALLcombined_dictionary[word] = [
        dataset_freqs[word] if word in dataset_freqs else 0
        for dataset_freqs in ordered_freqs
    ]

Convert to a dataframe and filter out common words

In [9]:
# One row per word (the dictionary keys become the index), one count column per dataset
dataset_columns = [
    "COP20", "COP21", "COP22", 'COP23', "COP24", "COP25", "COP26",
    'FFF2018', 'FFF2019', 'FFF2020', 'FFF2021',
]
ALLdata = pd.DataFrame.from_dict(
    ALLcombined_dictionary,
    orient='index',
    columns=dataset_columns,
)

In [10]:
# Tokens dropped from the COP side
filters = [
    "cop",
    "@cop20", "@cop21", "@cop22", "@cop23", "@cop24", "@cop25", "@cop26",
    "#cop20", "#cop21", "#cop22", "#cop23", "#cop24", "#cop25", "#cop26",
    "climate",
    "20", "21", "22", "23", "24", "25", "26",
    "u",
]
# Tokens dropped from the FFF side
FFFfilters = [
    "cop", "climate",
    "18", "19", "20", "21",
    "fff", "#fridaysforfuture",
    "u",
]

ALLfilters = filters + FFFfilters
# Keep only the rows whose word (the index label) is not in the filter list
is_filtered_word = ALLdata.index.isin(ALLfilters)
ALLdata_filtered = ALLdata[~is_filtered_word]

Can save this dataframe (or load dataframe here)

In [None]:
#ALLdata_filtered.to_csv(path + "ALLwordCounts.csv") 

In [60]:
# Reload the saved word counts. No index column is requested, so the first CSV
# column arrives as 'Unnamed: 0' and is renamed to 'index'; it stays an ordinary
# column rather than becoming the frame index.
ALLdata_filtered = (
    pd.read_csv(path + "ALLwordCounts.csv")
    .rename(columns={'Unnamed: 0': 'index'})
)

Create a new dataframe with the values as percentages of occurrences in each dataset. Each column (dataset) should sum to 100

In [19]:
# ALLdata_filtered carries the words either in its index (when built in memory)
# or in an 'index' column (when reloaded from CSV). Move them into the index in
# both cases so that only the count columns are normalised; summing the word
# column itself raises a TypeError.
if 'index' in ALLdata_filtered.columns:
    ALLcounts = ALLdata_filtered.set_index('index')
else:
    ALLcounts = ALLdata_filtered

# Column-wise normalisation: each dataset column is expressed as a percentage
# of that dataset's total count, so every column sums to 100.
ALLdata_pc = ALLcounts / ALLcounts.sum() * 100

# Expose the words as an 'index' column for plotting and column slicing
ALLdata_pc = ALLdata_pc.reset_index()

In [22]:
# Display columns 1-11, i.e. everything after the first column of ALLdata_pc
ALLdata_pc.columns[1:12]

Index(['COP20', 'COP21', 'COP22', 'COP23', 'COP24', 'COP25', 'COP26',
       'FFF2018', 'FFF2019', 'FFF2020', 'FFF2021'],
      dtype='object')

Plot the most commonly occurring words in each dataset

In [None]:
# Label the bars with the words themselves: after reset_index() the words live
# in the 'index' column, so plotting the bare COP26 column would label each bar
# with its row number instead.
ALLdata_pc.set_index('index')['COP26'].sort_values(ascending = False).head(20).plot(kind = 'bar')

Sort normalised dataset based on one of the COPs and get just the top 100 rows (easier for plotting etc)

In [None]:
# Rank words by their COP23 percentage (highest first) and keep the first 100 rows
ALLdatapc_sorted = ALLdata_pc.sort_values(by='COP23', ascending=False).iloc[:100]

In [None]:
# Grouped bars: the 25 top-ranked words on the x axis, one bar per COP dataset
cop_columns = ["COP20", "COP21", "COP22", 'COP23', "COP24", "COP25", "COP26"]
top_words_ax = ALLdatapc_sorted.head(25).plot(
    x='index',
    y=cop_columns,
    figsize=(17, 7),
    kind='bar',
)
top_words_ax.set_ylabel("Percentage")

### Jensen Shannon Divergence (JSD)
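Calculate the JSD for each pair of datasets using the percentages of occurrences in each. Note that `scipy.spatial.distance.jensenshannon` returns the Jensen-Shannon *distance* (the square root of the divergence), so the values below are distances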

In [23]:
# Pairwise comparison of every dataset column with every other (and itself):
# one inner list per row dataset, one jensenshannon value per column dataset.
jsd_columns = ALLdata_pc.columns[1:12]
ALLjsdL = [
    [
        jensenshannon(asarray(ALLdata_pc[row_name]), asarray(ALLdata_pc[col_name]))
        for col_name in jsd_columns
    ]
    for row_name in jsd_columns
]

# Label rows and columns with the dataset names and round to two decimals
dataset_labels = [
    "COP20", "COP21", "COP22", 'COP23', "COP24", "COP25", "COP26",
    'FFF2018', 'FFF2019', 'FFF2020', 'FFF2021',
]
ALLjsdM = pd.DataFrame(ALLjsdL, index=dataset_labels, columns=dataset_labels).round(2)

In [27]:
# Keep the upper triangle (diagonal included) of the symmetric matrix and blank
# the rest with NaN. The builtin bool replaces the np.bool alias, which was
# deprecated in NumPy 1.20 and later removed.
df = ALLjsdM.where(np.triu(np.ones(ALLjsdM.shape)).astype(bool))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  """Entry point for launching an IPython kernel.


In [30]:
# Colour every cell on one shared scale (axis=None) and display two decimals
(
    ALLjsdM.style
    .background_gradient(cmap='coolwarm', axis=None)
    .format(precision=2)
)

Unnamed: 0,COP20,COP21,COP22,COP23,COP24,COP25,COP26,FFF2018,FFF2019,FFF2020,FFF2021
COP20,0.0,0.41,0.44,0.44,0.44,0.46,0.48,0.55,0.55,0.58,0.56
COP21,0.41,0.0,0.39,0.39,0.39,0.41,0.41,0.5,0.5,0.55,0.52
COP22,0.44,0.39,0.0,0.36,0.39,0.41,0.46,0.54,0.53,0.57,0.55
COP23,0.44,0.39,0.36,0.0,0.35,0.37,0.42,0.52,0.51,0.55,0.53
COP24,0.44,0.39,0.39,0.35,0.0,0.35,0.4,0.48,0.48,0.54,0.51
COP25,0.46,0.41,0.41,0.37,0.35,0.0,0.39,0.49,0.47,0.52,0.49
COP26,0.48,0.41,0.46,0.42,0.4,0.39,0.0,0.47,0.44,0.51,0.44
FFF2018,0.55,0.5,0.54,0.52,0.48,0.49,0.47,0.0,0.37,0.5,0.44
FFF2019,0.55,0.5,0.53,0.51,0.48,0.47,0.44,0.37,0.0,0.47,0.41
FFF2020,0.58,0.55,0.57,0.55,0.54,0.52,0.51,0.5,0.47,0.0,0.45
