In [1]:
!pip install tweet-preprocessor
!pip install gensim
!pip install python-Levenshtein
!pip install pycorenlp
!pip install sentistrength
!pip install wordsegment
!pip install autocorrect
!pip install truecase
#java -mx6g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -timeout 5000

Collecting tweet-preprocessor
  Downloading tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0


In [2]:
import pandas as pd
import numpy as np
import preprocessor as p
from gensim.parsing.preprocessing import remove_stopwords
import os
from pycorenlp import StanfordCoreNLP
import pickle
from sentistrength import PySentiStr
from tqdm import tqdm
import re
from wordsegment import load, segment
load()
from autocorrect import Speller
spell = Speller()
import truecase



In [6]:
def cleaning_tweet_senti(row):
    text = row['text']
    #Removes RT from the tweet. 
    text = re.sub('RT @[\w_]+: ', '', text)
    text = re.sub(r"\s+$", "", text)
    return text

In [8]:
def preprocessing_senti(doc_name):
    df = pd.read_csv(doc_name, lineterminator='\n')
    
    #dropping rows that contain na values
    df = df.dropna()
    
    #selecting tweets where language is english
    df = df[df['language'] == 'en']
    
    #selecting tweets that are job/intern offers
    df = df[~df.text.str.contains('Offre')]
    df = df[~df.text.str.contains('hiring')]
    df = df[~df.text.str.contains('hire')]
    df = df[~df.text.str.contains('Apply')]
    df = df[~df.text.str.contains('apply')]
    df = df[~df.text.str.contains('recruit')]
    
    #applying the preprocessing to the tweets
    df['text'] = df.apply(cleaning_tweet_senti, axis = 1)
    
    #dropping duplicates from the data
    df = df.drop_duplicates()
    
    return df
    

In [9]:
filenames = []
years = ['2011', '2012', '2013', '2014', '2015']
for year in years:
    for subdir, dirs, files in os.walk(r'Data Thesis/Oil companies/2011-2015/' + year):
        for filename in files:
            filepath = subdir + os.sep + filename
            #print(filepath)
            if filepath.endswith(".csv"):
                filenames.append(filepath)

In [10]:
filenames = sorted(filenames)

In [11]:
for name in filenames:
    print(name)

Data Thesis/Oil companies/2011-2015/2011/2011-01-03.csv
Data Thesis/Oil companies/2011-2015/2011/2011-01-10.csv
Data Thesis/Oil companies/2011-2015/2011/2011-01-17.csv
Data Thesis/Oil companies/2011-2015/2011/2011-01-24.csv
Data Thesis/Oil companies/2011-2015/2011/2011-01-31.csv
Data Thesis/Oil companies/2011-2015/2011/2011-02-07.csv
Data Thesis/Oil companies/2011-2015/2011/2011-02-14.csv
Data Thesis/Oil companies/2011-2015/2011/2011-02-21.csv
Data Thesis/Oil companies/2011-2015/2011/2011-02-28.csv
Data Thesis/Oil companies/2011-2015/2011/2011-03-07.csv
Data Thesis/Oil companies/2011-2015/2011/2011-03-14.csv
Data Thesis/Oil companies/2011-2015/2011/2011-03-21.csv
Data Thesis/Oil companies/2011-2015/2011/2011-03-28.csv
Data Thesis/Oil companies/2011-2015/2011/2011-04-04.csv
Data Thesis/Oil companies/2011-2015/2011/2011-04-11.csv
Data Thesis/Oil companies/2011-2015/2011/2011-04-18.csv
Data Thesis/Oil companies/2011-2015/2011/2011-04-25.csv
Data Thesis/Oil companies/2011-2015/2011/2011-05

In [12]:
print(filenames[53])

Data Thesis/Oil companies/2011-2015/2012/2012-01-09.csv


In [13]:
print(len(filenames))

218


In [14]:
nlp = StanfordCoreNLP('http://localhost:9000')
senti = PySentiStr()
senti.setSentiStrengthPath('/Users/Thimo/Desktop/Data Science and Society/Thesis/SentiStrengthCom.jar') # Note: Provide absolute path instead of relative path
senti.setSentiStrengthLanguageFolderPath('/Users/Thimo/Desktop/Data Science and Society/Thesis/SentStrength_Data_Sept2011/') # Note: Provide absolute path instead of relative path

## SentiStrength oil companies 2011-2015

In [334]:
s_oil_companies_senti = []
for file_name in tqdm(filenames):
    tweets = preprocessing_senti(file_name)
    week_sentiment = []
    for tweet in tweets['text']:
        #Sentiment using SentiStrength
        result_senti = senti.getSentiment(tweet, score = 'scale')
        week_sentiment.append(int(result_senti[0]))
    
    avg_sentiment = sum(week_sentiment)/len(week_sentiment)
    s_oil_companies_senti.append(avg_sentiment)
    
    

100%|██████████| 218/218 [3:34:54<00:00, 59.15s/it]  


In [335]:
print(min(s_oil_companies_senti), max(s_oil_companies_senti))

-0.11267605633802817 0.5540540540540541


In [336]:
with open("Data Thesis/Oil companies/2011-2015/week_sentiment_senti.txt", "wb") as fp:   #Pickling
    pickle.dump(s_oil_companies_senti, fp)

## Filenames think tanks 2011-2015

In [15]:
filenames_tt = []
years = ['2011', '2012', '2013', '2014', '2015']
for year in years:
    for subdir, dirs, files in os.walk(r'Data Thesis/Think Tanks/2011-2015/' + year):
        for filename in files:
            filepath = subdir + os.sep + filename
            #print(filepath)
            if filepath.endswith(".csv"):
                filenames_tt.append(filepath)

In [16]:
filenames_tt = sorted(filenames_tt)

In [17]:
for name in filenames_tt[146:147]:
    print(name)

Data Thesis/Think Tanks/2011-2015/2013/2013-10-21.csv


## SentiStrength think tanks 2011-2015

In [26]:
s_think_tanks_senti = []
r_opec_members2 = []
r_oil_terms2 = []
for file_name in tqdm(filenames_tt):
    tweets = preprocessing_senti(file_name)
    week_sentiment = []
    week_opec = 0
    week_oil = 0
    for tweet in tweets['text']:
        #references to oil
        if any(word in tweet for word in oil_terms):
            week_oil += 1
            
        #reference to opec members
        if any(word in tweet for word in opec_members):
            week_opec += 1
        
        #Sentiment using SentiStrength
        result_senti = senti.getSentiment(tweet, score = 'scale')
        week_sentiment.append(int(result_senti[0]))
    
    avg_sentiment = sum(week_sentiment)/len(week_sentiment)
    s_think_tanks_senti.append(avg_sentiment)
    r_opec_members2.append(week_opec)
    r_oil_terms2.append(week_oil)

100%|██████████| 218/218 [5:45:25<00:00, 95.07s/it]   


In [27]:
print(min(s_think_tanks_senti), max(s_think_tanks_senti))

-0.4713216957605985 0.06504065040650407


In [28]:
print(min(r_opec_members2), max(r_opec_members2))

0 8


In [29]:
print(min(r_oil_terms2), max(r_oil_terms2))

0 15


In [34]:
with open("Data Thesis/Think Tanks/2011-2015/week_sentiment_senti.txt", "wb") as fp:   #Pickling
    pickle.dump(s_think_tanks_senti, fp)

## Filenames extension dataset

In [35]:
filenames_ext = []
years = ['2016', '2017', '2018', '2019', '2020']
for year in years:
    for subdir, dirs, files in os.walk(r'Data Thesis/Oil Companies/2016-2020/' + year):
        for filename in files:
            filepath = subdir + os.sep + filename
            #print(filepath)
            if filepath.endswith(".csv"):
                filenames_ext.append(filepath)

In [36]:
filenames_ext = sorted(filenames_ext)

In [37]:
for name in filenames_ext:
    print(name)

Data Thesis/Oil Companies/2016-2020/2016/2015-03-09.csv
Data Thesis/Oil Companies/2016-2020/2016/2015-03-16.csv
Data Thesis/Oil Companies/2016-2020/2016/2015-03-23.csv
Data Thesis/Oil Companies/2016-2020/2016/2015-03-30.csv
Data Thesis/Oil Companies/2016-2020/2016/2015-04-06.csv
Data Thesis/Oil Companies/2016-2020/2016/2015-04-13.csv
Data Thesis/Oil Companies/2016-2020/2016/2015-04-20.csv
Data Thesis/Oil Companies/2016-2020/2016/2015-04-27.csv
Data Thesis/Oil Companies/2016-2020/2016/2015-05-04.csv
Data Thesis/Oil Companies/2016-2020/2016/2015-05-11.csv
Data Thesis/Oil Companies/2016-2020/2016/2015-05-18.csv
Data Thesis/Oil Companies/2016-2020/2016/2015-05-25.csv
Data Thesis/Oil Companies/2016-2020/2016/2015-06-01.csv
Data Thesis/Oil Companies/2016-2020/2016/2015-06-08.csv
Data Thesis/Oil Companies/2016-2020/2016/2015-06-15.csv
Data Thesis/Oil Companies/2016-2020/2016/2015-06-22.csv
Data Thesis/Oil Companies/2016-2020/2016/2015-06-29.csv
Data Thesis/Oil Companies/2016-2020/2016/2015-07

## SentiStrength oil companies 2016-2020

In [42]:
s_oil_companies_senti_ext = []
for file_name in tqdm(filenames_ext):
    tweets = preprocessing_senti(file_name)
    week_sentiment = []
    for tweet in tweets['text']:
        #Sentiment using SentiStrength
        result_senti = senti.getSentiment(tweet, score = 'scale')
        week_sentiment.append(int(result_senti[0]))
    
    avg_sentiment = sum(week_sentiment)/len(week_sentiment)
    s_oil_companies_senti_ext.append(avg_sentiment)

100%|██████████| 304/304 [6:48:57<00:00, 80.72s/it]   


In [49]:
print(min(s_oil_companies_senti_ext), max(s_oil_companies_senti_ext))

0.005747126436781609 0.7310924369747899


In [50]:
with open("Data Thesis/Oil companies/2016-2020/week_sentiment_senti_ext.txt", "wb") as fp:   #Pickling
    pickle.dump(s_oil_companies_senti_ext, fp)

## Filenames think tanks

In [43]:
filenames_ext_tt = []
years = ['2016', '2017', '2018', '2019', '2020']
for year in years:
    for subdir, dirs, files in os.walk(r'Data Thesis/Think Tanks/2016-2020/' + year):
        for filename in files:
            filepath = subdir + os.sep + filename
            #print(filepath)
            if filepath.endswith(".csv"):
                filenames_ext_tt.append(filepath)

In [44]:
filenames_ext_tt = sorted(filenames_ext_tt)

In [46]:
for name in filenames_ext_tt:
    print(name)

Data Thesis/Think Tanks/2016-2020/2016/2015-03-09.csv
Data Thesis/Think Tanks/2016-2020/2016/2015-03-16.csv
Data Thesis/Think Tanks/2016-2020/2016/2015-03-23.csv
Data Thesis/Think Tanks/2016-2020/2016/2015-03-30.csv
Data Thesis/Think Tanks/2016-2020/2016/2015-04-06.csv
Data Thesis/Think Tanks/2016-2020/2016/2015-04-13.csv
Data Thesis/Think Tanks/2016-2020/2016/2015-04-20.csv
Data Thesis/Think Tanks/2016-2020/2016/2015-04-27.csv
Data Thesis/Think Tanks/2016-2020/2016/2015-05-04.csv
Data Thesis/Think Tanks/2016-2020/2016/2015-05-11.csv
Data Thesis/Think Tanks/2016-2020/2016/2015-05-18.csv
Data Thesis/Think Tanks/2016-2020/2016/2015-05-25.csv
Data Thesis/Think Tanks/2016-2020/2016/2015-06-01.csv
Data Thesis/Think Tanks/2016-2020/2016/2015-06-08.csv
Data Thesis/Think Tanks/2016-2020/2016/2015-06-15.csv
Data Thesis/Think Tanks/2016-2020/2016/2015-06-22.csv
Data Thesis/Think Tanks/2016-2020/2016/2015-06-29.csv
Data Thesis/Think Tanks/2016-2020/2016/2015-07-06.csv
Data Thesis/Think Tanks/2016

## SentiStrength Think Tanks 2016-2020

In [56]:
s_think_tanks_senti_ext = []
r_opec_members2_ext = []
r_oil_terms2_ext = []
for file_name in tqdm(filenames_ext_tt):
    tweets = preprocessing_senti(file_name)
    week_sentiment = []
    week_opec = 0
    week_oil = 0
    for tweet in tweets['text']:
        #references to oil
        if any(word in tweet for word in oil_terms):
            week_oil += 1
            
        #reference to opec members
        if any(word in tweet for word in opec_members):
            week_opec += 1
        
        #Sentiment using SentiStrength
        result_senti = senti.getSentiment(tweet, score = 'scale')
        week_sentiment.append(int(result_senti[0]))
    
    avg_sentiment = sum(week_sentiment)/len(week_sentiment)
    s_think_tanks_senti_ext.append(avg_sentiment)
    r_opec_members2_ext.append(week_opec)
    r_oil_terms2_ext.append(week_oil)

100%|██████████| 304/304 [18:01:27<00:00, 213.45s/it]  


In [57]:
print(min(s_think_tanks_senti_ext), max(s_think_tanks_senti_ext))

-0.5844298245614035 -0.0945054945054945


In [58]:
print(min(r_opec_members2_ext), max(r_opec_members2_ext))

0 11


In [63]:
print(min(r_oil_terms_ext), max(r_oil_terms_ext))

0 41


In [62]:
with open("Data Thesis/Think Tanks/2016-2020/week_sentiment_senti_ext.txt", "wb") as fp:   #Pickling
    pickle.dump(s_think_tanks_senti_ext, fp)


# Preprocessor of CoreNLP

In [250]:
def process_raw_tweet(tweet):
    tweet = tweet['text']
    # Convert to lower case
    tweet = tweet.lower()
    # Convert www.* or https?://* to ''
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', '', tweet)
    # Convert @username to ''
    tweet = re.sub('@[^\s]+', '', tweet)
    
    # Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    # Replace #word with word
    # Need more detail sub, #SoUgly => so ugly
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    # Replace punctuation
    tweet = re.sub('[^\w\s]', '', tweet)
    # Remove numbers
    tweet = re.sub(r"\d", "", tweet)
    
    split_tweet = tweet.split()
    for word in split_tweet:
        if (any(vowel in word for vowel in 'aeiou')) == False:
            tweet = re.sub(word, '', tweet)
        if word == 'mmbtu':
            tweet = re.sub(word, '', tweet)
        if word == 'mmbblday':
            tweet = re.sub(word, '', tweet)
        if (any(day in word for day in ['mon', 'thurs', 'fri'])) == True:
            tweet = re.sub(word, word + 'day', tweet)
        if word == 'wed':
            tweet = re.sub(word, 'wednesday', tweet)
        if word == 'sat':
            tweet = re.sub(word, 'saturday', tweet)
        if word == 'tue' or word == 'tues':
            tweet = re.sub(word, 'tuesday', tweet)
        if word == 'natgas':
            tweet = re.sub(word, 'natural gas', tweet)
        if word == 'avg':
            tweet = re.sub(word, 'average', tweet)
        if word == 'prev':
            tweet = re.sub(word, 'previous', tweet)
        if word == 'stat' or word == 'stats':
            tweet = re.sub(word, 'statistics', tweet)
        if word == 'jan':
            tweet = re.sub(word, 'january', tweet)
        if word == 'feb':
            tweet = re.sub(word, 'february', tweet)
        if word == 'aug':
            tweet = re.sub(word, 'august', tweet)
        if word == 'sept':
            tweet = re.sub(word, 'september', tweet)
        if word == 'oct':
            tweet = re.sub(word, 'october', tweet)
        if word == 'nov':
            tweet = re.sub(word, 'november', tweet)
        if word == 'dec':
            tweet = re.sub(word, 'december', tweet)
    tweet = truecase.get_true_case(tweet)
        
    # trim
    tweet = tweet.strip('\'"')
    return tweet

In [210]:
#preprocessor 1
def preprocessing_corenlp2(doc_name):
    df = pd.read_csv(doc_name, lineterminator='\n')
    
    #dropping rows that contain na values
    df = df.dropna()
    
    #dropping duplicates from the data
    df = df.drop_duplicates()
    
    #selecting tweets where language is english
    df = df[df['language'] == 'en']
    
    #selecting tweets that are job/intern offers
    df = df[~df.text.str.contains('Offre')]
    
    #applying the preprocessing to the tweets
    df['text'] = df.apply(process_raw_tweet, axis = 1)
    
    df = df[~df.text.str.contains('hiring')]
    df = df[~df.text.str.contains('hire')]
    df = df[~df.text.str.contains('apply')]
    df = df[~df.text.str.contains('recruit')]
    df = df[~df.text.str.contains('new jobs')]
    df = df[~df.text.str.contains('career')]
    
    #dropping duplicates from the data
    df = df.drop_duplicates()
    
    return df

## Core NLP Oil Companies 2011-2015

In [None]:
opec_members = ['iran', 'iraq', 'kuwait', 'saudi arabia', 'venezuela', 'qatar', 'indonesia', 'libya', 'uae', 'united arab emirates', 
               'algeria', 'nigeria', 'ecuador', 'gabon', 'angola', 'guinea', 'equatorial guinea', 'congo']

oil_terms = ['oil', 'brent', 'petroleum', 'fuel', 'gasoline', 'kerosene', 'petrol', 'grease', 'lubricant', 'black gold']

In [262]:
s_oil_companies_nlp2 = []
for file_name in tqdm(filenames):
    tweets = preprocessing_corenlp2(file_name)
    week_sentiment = []
    for tweet in tweets['text']:
        #Sentiment using Stanford CoreNLP
        result_nlp = nlp.annotate(tweet,
                   properties={
                       'annotators': 'sentiment',
                       'outputFormat': 'json',
                       'timeout': 10000,
                   })
        for s in result_nlp["sentences"]:
            week_sentiment.append(int(s['sentimentValue']))
        
    avg_sentiment = sum(week_sentiment)/len(week_sentiment)
    s_oil_companies_nlp2.append(avg_sentiment)

100%|██████████| 218/218 [1:10:56<00:00, 19.53s/it]


In [264]:
with open("Data Thesis/Oil Companies/2011-2015/week_sentiment_corenlp_new.txt", "wb") as fp:   #Pickling
    pickle.dump(s_oil_companies_nlp2, fp)

In [263]:
print(min(s_oil_companies_nlp2), max(s_oil_companies_nlp2))

1.98 2.293051359516616


## Core NLP Think Tanks 2011-2015

In [267]:
s_think_tanks_nlp2 = []
r_opec_members3 = []
r_oil_terms3 = []
for file_name in tqdm(filenames_tt):
    tweets = preprocessing_corenlp2(file_name)
    week_sentiment = []
    week_opec = 0
    week_oil = 0
    for tweet in tweets['text']:
        #references to oil
        if any(word in tweet.lower() for word in oil_terms):
            week_oil += 1
            
        #reference to opec members
        if any(word in tweet.lower() for word in opec_members):
            week_opec += 1
        
        #Sentiment using Stanford CoreNLP
        result_nlp = nlp.annotate(tweet,
                   properties={
                       'annotators': 'sentiment',
                       'outputFormat': 'json',
                       'timeout': 10000,
                   })
        for s in result_nlp["sentences"]:
            week_sentiment.append(int(s['sentimentValue']))
        
    avg_sentiment = sum(week_sentiment)/len(week_sentiment)
    s_think_tanks_nlp2.append(avg_sentiment)
    r_opec_members3.append(week_opec)
    r_oil_terms3.append(week_oil)

100%|██████████| 218/218 [1:53:32<00:00, 31.25s/it]


In [280]:
with open("Data Thesis/Think Tanks/2011-2015/week_sentiment_corenlp_new.txt", "wb") as fp:   #Pickling
    pickle.dump(s_think_tanks_nlp2, fp)

In [268]:
print(min(s_think_tanks_nlp2), max(s_think_tanks_nlp2))

1.8402777777777777 2.0671140939597317


In [163]:
with open("Data Thesis/Think Tanks/2011-2015/week_sentiment_corenlp.txt", "rb") as fp:   # Unpickling
    eufp_nlp = pickle.load(fp)

In [164]:
print(min(eufp_nlp), max(eufp_nlp))

1.8486486486486486 2.1320754716981134


In [269]:
print(min(r_oil_terms3), max(r_oil_terms3))

0 15


In [270]:
print(min(r_opec_members3), max(r_opec_members3))

1 76


## Core NLP Oil Companies 2016-2020

In [279]:
s_oil_companies_nlp3 = []
for file_name in tqdm(filenames_ext):
    tweets = preprocessing_corenlp2(file_name)
    week_sentiment = []
    for tweet in tweets['text']:
        #Sentiment using Stanford CoreNLP
        result_nlp = nlp.annotate(tweet,
                   properties={
                       'annotators': 'sentiment',
                       'outputFormat': 'json',
                       'timeout': 10000,
                   })
        for s in result_nlp["sentences"]:
            week_sentiment.append(int(s['sentimentValue']))
        
    avg_sentiment = sum(week_sentiment)/len(week_sentiment)
    s_oil_companies_nlp3.append(avg_sentiment)

100%|██████████| 304/304 [5:35:09<00:00, 66.15s/it]   


In [281]:
print(min(s_oil_companies_nlp3), max(s_oil_companies_nlp3))

1.7897959183673469 2.4219409282700424


In [285]:
with open("Data Thesis/Oil Companies/2016-2020/week_sentiment_corenlp_ext_new.txt", "wb") as fp:   #Pickling
    pickle.dump(s_oil_companies_nlp3, fp)

In [282]:
with open("Data Thesis/Oil companies/2016-2020/week_sentiment_corenlp_ext.txt", "rb") as fp:   # Unpickling
    oil_nlp = pickle.load(fp)

In [283]:
print(min(oil_nlp), max(oil_nlp))

2.0045454545454544 2.5411585365853657


## Core NLP Think Tanks 2016-2020

In [287]:
s_think_tanks_nlp3 = []
r_opec_members4 = []
r_oil_terms4 = []
for file_name in tqdm(filenames_ext_tt):
    tweets = preprocessing_corenlp2(file_name)
    week_sentiment = []
    week_opec = 0
    week_oil = 0
    for tweet in tweets['text']:
        #references to oil
        if any(word in tweet.lower() for word in oil_terms):
            week_oil += 1
            
        #reference to opec members
        if any(word in tweet.lower() for word in opec_members):
            week_opec += 1
        
        #Sentiment using Stanford CoreNLP
        result_nlp = nlp.annotate(tweet,
                   properties={
                       'annotators': 'sentiment',
                       'outputFormat': 'json',
                       'timeout': 10000,
                   })
        for s in result_nlp["sentences"]:
            week_sentiment.append(int(s['sentimentValue']))
        
    avg_sentiment = sum(week_sentiment)/len(week_sentiment)
    s_think_tanks_nlp3.append(avg_sentiment)
    r_opec_members4.append(week_opec)
    r_oil_terms4.append(week_oil)

100%|██████████| 304/304 [12:00:31<00:00, 142.21s/it]  


In [290]:
print(min(s_think_tanks_nlp3), max(s_think_tanks_nlp3))

1.7977207977207976 1.9967741935483871


In [293]:
with open("Data Thesis/Think Tanks/2016-2020/week_sentiment_corenlp_ext_new.txt", "wb") as fp:   #Pickling
    pickle.dump(s_oil_companies_nlp3, fp)

In [291]:
print(min(r_opec_members4), max(r_opec_members4))

8 124


In [292]:
print(min(r_oil_terms4), max(r_oil_terms4))

0 41


## Converting absolute frequencies to percentages 

In [171]:
no_tweets_tt = []
for file_name in tqdm(filenames_tt):
    df = pd.read_csv(file_name, lineterminator='\n')
    no_tweets_tt.append(len(df))

100%|██████████| 218/218 [00:01<00:00, 195.75it/s]


In [294]:
no_tweets_tt_ext = []
for file_name in tqdm(filenames_ext_tt):
    df = pd.read_csv(file_name, lineterminator='\n')
    no_tweets_tt_ext.append(len(df))

100%|██████████| 304/304 [00:03<00:00, 94.65it/s] 


In [186]:
print(no_tweets_tt)

[122, 134, 185, 157, 248, 185, 174, 180, 204, 250, 255, 310, 233, 198, 216, 184, 129, 194, 324, 279, 371, 285, 268, 298, 197, 520, 171, 284, 216, 295, 187, 188, 121, 164, 125, 223, 220, 324, 293, 275, 286, 264, 501, 233, 233, 324, 326, 302, 349, 366, 219, 73, 139, 243, 289, 289, 328, 276, 281, 329, 371, 435, 322, 374, 279, 278, 230, 570, 425, 348, 435, 474, 371, 546, 358, 343, 531, 494, 371, 364, 311, 402, 357, 256, 239, 258, 255, 353, 474, 457, 393, 455, 466, 727, 395, 358, 466, 491, 467, 540, 519, 455, 386, 135, 174, 363, 404, 482, 665, 484, 467, 361, 477, 493, 468, 620, 545, 432, 494, 515, 549, 434, 521, 431, 403, 574, 743, 540, 564, 497, 469, 423, 394, 311, 336, 334, 311, 294, 353, 417, 506, 606, 515, 511, 521, 646, 681, 640, 639, 633, 681, 806, 763, 641, 623, 160, 174, 479, 560, 709, 746, 620, 622, 834, 717, 759, 780, 747, 731, 1004, 706, 586, 476, 516, 650, 759, 631, 691, 916, 618, 621, 783, 598, 721, 631, 597, 439, 443, 383, 448, 466, 693, 825, 758, 634, 766, 968, 840, 825, 674,

In [295]:
print(no_tweets_tt_ext)

[860, 951, 794, 662, 608, 786, 995, 777, 1125, 899, 1026, 926, 969, 880, 1023, 942, 958, 794, 921, 698, 600, 543, 472, 469, 621, 706, 965, 1050, 1019, 1081, 1061, 959, 1113, 1134, 1077, 1198, 1047, 934, 1119, 897, 980, 387, 323, 629, 801, 842, 1053, 897, 962, 1021, 1025, 855, 820, 965, 632, 690, 800, 945, 1128, 1077, 831, 980, 912, 1211, 1276, 1077, 1454, 1274, 1427, 1593, 1076, 840, 693, 675, 622, 533, 606, 626, 1011, 964, 979, 1121, 1119, 1129, 1066, 1020, 1163, 1279, 1172, 1220, 1368, 1463, 1227, 878, 557, 722, 1033, 1027, 1115, 1046, 1191, 1215, 1286, 1151, 1186, 1251, 1378, 1333, 1229, 1081, 1183, 1331, 1137, 1269, 1232, 1192, 1437, 1277, 1265, 1523, 1542, 1397, 1162, 1086, 1023, 938, 882, 776, 923, 990, 1164, 1289, 1431, 1168, 1294, 1481, 1579, 1272, 1156, 1545, 1429, 1676, 1591, 1487, 1225, 1108, 670, 846, 1176, 1198, 1385, 1177, 1064, 1004, 1143, 1056, 1082, 1013, 1040, 941, 701, 1092, 1056, 957, 848, 904, 1046, 1003, 1128, 1132, 1067, 1253, 1157, 930, 1081, 833, 744, 771, 697,

In [271]:
arr_oil_terms = np.asarray(r_oil_terms3)
arr_no_tweets_tt = np.asarray(no_tweets_tt)

In [272]:
per_oil_terms = arr_oil_terms / arr_no_tweets_tt

In [273]:
per_oil_terms_list = per_oil_terms.tolist()

In [274]:
with open("Data Thesis/Think Tanks/2011-2015/percentage_references_oil_new.txt", "wb") as fp:   #Pickling
    pickle.dump(per_oil_terms_list, fp)

In [275]:
arr_opec_members = np.asarray(r_opec_members3)

In [276]:
per_opec_members = arr_opec_members / arr_no_tweets_tt

In [277]:
per_opec_members_list = per_opec_members.tolist()

In [278]:
with open("Data Thesis/Think Tanks/2011-2015/percentage_references_opec_new.txt", "wb") as fp:   #Pickling
    pickle.dump(per_opec_members_list, fp)

In [296]:
arr_oil_terms_ext = np.asarray(r_oil_terms4)
arr_no_tweets_tt_ext = np.asarray(no_tweets_tt_ext)

In [297]:
per_oil_terms_ext = arr_oil_terms_ext / arr_no_tweets_tt_ext

In [299]:
per_oil_terms_ext_list = per_oil_terms_ext.tolist()

In [300]:
with open("Data Thesis/Think Tanks/2016-2020/percentage_references_oil_new.txt", "wb") as fp:   #Pickling
    pickle.dump(per_oil_terms_ext_list, fp)

In [301]:
arr_opec_members_ext = np.asarray(r_opec_members4)

In [302]:
per_opec_members_ext = arr_opec_members_ext / arr_no_tweets_tt_ext

In [303]:
per_opec_members_ext_list = per_opec_members_ext.tolist()

In [304]:
with open("Data Thesis/Think Tanks/2016-2020/percentage_references_opec_new.txt", "wb") as fp:   #Pickling
    pickle.dump(per_opec_members_ext_list, fp)