In [12]:
"""
General textual tone analysis procedure using the Loughran and McDonald (L&M) dictionary.
This program is based on the official code provided by L&M.

Author: Yuting
3 February 2020
"""

import pandas as pd
import numpy as np
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from Load_MasterDictionary import load_masterdictionary # from Generic_Parser.py
import string
from datetime import datetime
import matplotlib.pyplot as plt


In [14]:
%%time

test_type = 'history'

# load processed file
path_temp = "Temp/"
# impoort your data
df = pd.read_pickle(path_temp+test_type+'_Processed.pkl')
df = df[['date','CONTENT']]

if test_type == 'tweet':
    start_date = datetime.strptime("01/01/2020", '%d/%m/%Y')
#     end_date = datetime.strptime("26/05/2020", '%d/%m/%Y')
    df = df.loc[df['date'] >= start_date]
    df = df.reset_index()
else:
#     df.columns = ['time','source','text','country']
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

# preview data
print(df.shape)

print("The first piece of text \n",df["CONTENT"][0])

df.head()


Using the Mac data path
(92542, 2)
The first piece of text 
 ['what', 'no', 'load', 'money', 'market', 'mutual', 'fund', 'pays', 'percent', 'since', 'this', 'inquiry', 'was', 'recieved', 'yields', 'on', 'an', 'annualized', 'basis', 'have', 'increased', 'for', 'money', 'market', 'mutual', 'funds', 'up', 'to', 'as', 'high', 'as', 'percent', 'for', 'some', 'according', 'to', 'donoghues', 'money', 'fund', 'report', 'box', 'holliston', 'mass', 'another', 'source', 'of', 'information', 'on', 'yields', 'is', 'fund', 'watch', 'regular', 'feature', 'of', 'money', 'magazine', 'about', 'money', 'market', 'funds', 'are', 'now', 'in', 'operation', 'for', 'more', 'information', 'on', 'features', 'such', 'as', 'free', 'check', 'writing', 'minimum', 'initial', 'deposits', 'etc', 'write', 'no', 'load', 'mutual', 'fund', 'association', 'inc', 'valley', 'forge', 'pa', 'few', 'of', 'the', 'major', 'money', 'market', 'funds', 'are', 'rowe', 'price', 'prime', 'reserve', 'fund', 'inc', 'delaware', 'cash', 'r

In [15]:
# Define the tone analyzers.

# For L&M: the master dictionary is loaded once here and kept in the
# module-level name `lm_dictionary`, which lm_get_data reads for every token.
MASTER_DICTIONARY_FILE = 'LoughranMcDonald_MasterDictionary_2018.csv'
# NOTE(review): the second positional argument (True) presumably switches on
# the loader's progress printout -- confirm against Load_MasterDictionary.
lm_dictionary = load_masterdictionary(MASTER_DICTIONARY_FILE, True)

def lm_get_data(doc): # from "Generic_Parser.py"
    """Count Loughran-McDonald tone words in a single document.

    Parameters
    ----------
    doc : iterable of str, or str
        The document as a sequence of word tokens (the pre-processed form
        used in this notebook). A raw string is accepted as well and is
        split into runs of word characters first; note that this splits
        hyphenated words.

    Returns
    -------
    list of int
        Nine raw counts (not percentages), in this order:
        total dictionary words, positive, negative, uncertainty, litigious,
        weak_modal, moderate_modal, strong_modal, constraining.

    Raises
    ------
    TypeError
        If ``doc`` is not iterable (e.g. ``None`` or ``NaN``).
    AttributeError
        If a token is not a string.

    Notes
    -----
    Tokens are upper-cased before lookup in the module-level
    ``lm_dictionary``. Purely numeric tokens, single-character tokens and
    tokens missing from the dictionary are ignored entirely (they do not
    contribute to the total either).
    """
    categories = ('positive', 'negative', 'uncertainty', 'litigious',
                  'weak_modal', 'moderate_modal', 'strong_modal',
                  'constraining')
    # Slot 0 is the word count; slots 1..8 follow `categories`.
    counts = [0] * (1 + len(categories))

    # Iterating over a raw string would yield single characters, all of which
    # fail the length filter below and silently produce an all-zero result,
    # so tokenize strings explicitly.
    if isinstance(doc, str):
        tokens = re.findall(r'\w+', doc)
    else:
        tokens = doc

    for token in tokens:
        token = token.upper()
        if token.isdigit() or len(token) <= 1 or token not in lm_dictionary:
            continue
        entry = lm_dictionary[token]  # single lookup per token
        counts[0] += 1
        for slot, category in enumerate(categories, start=1):
            # A category flag is treated as set when it is truthy.
            if getattr(entry, category):
                counts[slot] += 1

    return counts

def lm_sentiment_analyzer(text):
    """Best-effort wrapper around ``lm_get_data``.

    Parameters
    ----------
    text : iterable of str
        Tokenized document, passed straight through to ``lm_get_data``.

    Returns
    -------
    list
        The nine tone counts from ``lm_get_data``, or nine ``np.nan`` values
        when the document could not be scored (for example a missing value
        instead of a token list).
    """
    try:
        return lm_get_data(text)
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop a long scoring loop
        # instead of being turned into a NaN row.
        return [np.nan] * 9

# for vader
def vader_sentiment_analyzer(text):
    """Return the VADER ``compound`` score for ``text``.

    The analyzer object is created on the first call and then reused, rather
    than being rebuilt for every document.

    NOTE(review): ``SentimentIntensityAnalyzer`` is not imported in the cells
    shown here; it must be imported before this function is called.
    """
    analyzer = getattr(vader_sentiment_analyzer, "_analyzer", None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        # Cache on the function object so later calls skip construction.
        vader_sentiment_analyzer._analyzer = analyzer
    return analyzer.polarity_scores(text)['compound']

# for textblob
def textblob_sentiment_analyzer(text):
    """Return the polarity component of TextBlob's sentiment for ``text``.

    NOTE(review): ``TextBlob`` is not imported in the cells shown here; it
    must be imported before this function is called.
    """
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return sentiment.polarity

 ...Loading Master Dictionary 85000
Master Dictionary loaded from file: 
  LoughranMcDonald_MasterDictionary_2018.csv
  86,486 words loaded in master_dictionary.



In [16]:
%%time
data = []
for i in df['CONTENT']: ### text or text_first
    odata = lm_sentiment_analyzer(i)
    data.append(odata)
df_tones = pd.DataFrame(data,columns = ['total','positive','negative','uncertainty','litigious','weak_modal','moderate_modal','strong_modal','constraining'])
df_tones['date'] = df["date"]
print(df_tones.head())

   total  positive  negative  uncertainty  litigious  weak_modal  \
0    202         0         2            2          0           2   
1    807         5         3            3          0           3   
2    155         1         5            1          0           1   
3    626         6        22            7          0           5   
4    517         9        14            4          1           2   

   moderate_modal  strong_modal  constraining       date  
0               0             1             1 1980-01-02  
1               2             2             1 1980-01-02  
2               0             1             0 1980-01-03  
3               2             3             0 1980-01-04  
4               6             4             1 1980-01-07  
CPU times: user 2min 17s, sys: 30.4 s, total: 2min 48s
Wall time: 3min 8s


In [17]:
# Write the tone counts to a CSV file; the name is built from the data folder
# and the corpus type.
tones_csv_path = path_temp + test_type + '_LM_tones.csv'
df_tones.to_csv(tones_csv_path)