# Paper L Text analysis

## Import from Excel

In [1]:
import pandas as pd

In [2]:
data = pd.read_excel('input-file.xlsx', sheetname='Sheet3')

In [3]:
data.head(3)

Unnamed: 0,ISIN,year,All notes,empty
0,AT0000741053,5,"A provision of TEUR 19,493.0 (previous year: T...",
1,AT0000741053,6,Waste disposal or land restoration requirement...,
2,AT0000741053,7,Waste disposal or land restoration requirement...,


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 993 entries, 0 to 992
Data columns (total 4 columns):
ISIN         993 non-null object
year         993 non-null int64
All notes    993 non-null object
empty        0 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 31.1+ KB


In [5]:
import numpy as np

In [6]:
def concat(policy,note):
    text = 'NA'
    if(type(policy) == str):
        text = policy
        
    if(type(note) == str):
        
        if(text != 'NA'):
            text += '. ' + note
        else:
            text = note
    return text

In [7]:
data['Text']= data.apply(lambda row: concat(row['All notes'], row['empty']), axis=1)

In [8]:
data.Text.describe()

count                                                   993
unique                                                  974
top        Provisions for environmental restoration, res...
freq                                                      4
Name: Text, dtype: object

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 993 entries, 0 to 992
Data columns (total 5 columns):
ISIN         993 non-null object
year         993 non-null int64
All notes    993 non-null object
empty        0 non-null float64
Text         993 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 38.9+ KB


In [10]:
data.Text.head()

0    A provision of TEUR 19,493.0 (previous year: T...
1    Waste disposal or land restoration requirement...
2    Waste disposal or land restoration requirement...
3    Waste disposal or land restoration requirement...
4    Waste disposal or land restoration requirement...
Name: Text, dtype: object

## NLTK

### Tokenization

In [11]:
import nltk
#nltk.download()

In [12]:
nltk.sent_tokenize(data.Text.head()[0])

['A provision of TEUR 19,493.0 (previous year: TEUR 16,259.5) was made for environmental and hazardous waste risks.']

In [13]:
def sentenceTokenize(paragraph):
    
    l_s = nltk.sent_tokenize(paragraph.lower())
    
    d_s =  {}
    for sent in l_s:
        
        if ('rate' in sent):
            
            count = sent.count('rate')
            if(sent not in d_s):
                d_s[sent] = count
            else:
                d_s[sent] += count

    sent_count = 0
    sent_string = ''
    
    for key in d_s:
        
        sent_count += d_s[key]
        sent_string += key
        
    return d_s,sent_string,sent_count

In [14]:
data[['rate_d','rate_sentences','rate_count']] = data.Text.apply(lambda paragraph: pd.Series(sentenceTokenize(paragraph)))

In [15]:
data['rate_count'].describe()

count    993.000000
mean       2.604230
std        4.465534
min        0.000000
25%        0.000000
50%        1.000000
75%        3.000000
max       37.000000
Name: rate_count, dtype: float64

In [16]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
import string

stop = set(stopwords.words('english'))
wordnet_lemmatizer = WordNetLemmatizer()

In [17]:
def tokenize(fragment):
    
    fragments = nltk.sent_tokenize(fragment)
    tokens = fragments
    #Removes punctuation, paranthesis etc.
    #tokens = re.sub(r'[^\w\s]', ' ', tokens)
    #Makes lower case
    #tokens = tokens.lower()
    #Makes each word into a token in the sentece
    tokens = [word_tokenize(fragment) for fragment in fragments]
    #Removes english stopwords
    #tokens = list(set(tokens) - stop)
    #Lemmatizes each word
    return tokens

In [18]:
data['Tokens'] = data.Text.apply(lambda text: tokenize(str(text)))

In [19]:
data.Tokens.head()

0    [[A, provision, of, TEUR, 19,493.0, (, previou...
1    [[Waste, disposal, or, land, restoration, requ...
2    [[Waste, disposal, or, land, restoration, requ...
3    [[Waste, disposal, or, land, restoration, requ...
4    [[Waste, disposal, or, land, restoration, requ...
Name: Tokens, dtype: object

In [20]:
data.Text.head()

0    A provision of TEUR 19,493.0 (previous year: T...
1    Waste disposal or land restoration requirement...
2    Waste disposal or land restoration requirement...
3    Waste disposal or land restoration requirement...
4    Waste disposal or land restoration requirement...
Name: Text, dtype: object

## Creating Sentiment

In [21]:
lmPanda = pd.read_csv('LoughranMcDonald_MasterDictionary_2016.csv')
sentCols = ['Negative', 'Positive', 'Uncertainty', 'Litigious', 'Constraining', 'Superfluous', 'Interesting', 'Modal']
sentDict = {}

In [22]:
for col in sentCols:
    sentDict[col] = set([x.lower() for x in lmPanda[lmPanda[col] != 0]['Word'].tolist()])
all_lists = data.Tokens.tolist()
listofTokens = [val for sublist in all_lists for val in sublist]
listofTokens = [val for sublist in listofTokens for val in sublist]

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Creates a table with the idf scores with each word
def idf(fragments):
    vectorizer = TfidfVectorizer(min_df=1)
    X = vectorizer.fit_transform(fragments)
    idf = vectorizer.idf_
    idfDict = dict(zip(vectorizer.get_feature_names(), idf))
    return  idfDict

In [24]:
#Creats the table with the idf scores
idfDict = idf(listofTokens)

In [25]:
from collections import Counter
def LMsent(tokens):
    #Rules
    
    tokens = [val for sublist in tokens for val in sublist]
    
    tokenslen = len(tokens)
    removeIndexes = []
    if 'effective' in set(tokens):
        for index in range(tokenslen - 1):
            if ( tokens[index] == 'effective') & (tokens[index + 1] in ('income', 'tax', 'rate') ):
                removeIndexes.append(index)
                print('rule 1 applied')
    if 'efficiency' in set(tokens):
        for index in range(tokenslen - 1):
            if ( tokens[index] == 'efficiency' ) & ( tokens[index + 1] in ('ratio') ):
                removeIndexes.append(index)
                print('rule 2 applied')

    #Removes the words by indexes
    if not removeIndexes:
        #print tokens
        for index in removeIndexes:
            tokens.pop(index)
        #print tokens

    returnDict = {}
    for key in sentDict.keys():
        returnDict[key] = 0.0

    #Makes the tokens into a dict of counts
    counts = Counter(tokens)

    if len(counts) != 0:  # List of tokens might be empty
        # The token might be in several sets
        tokensCount = 0
        for token in counts:
            token = token.lower()
            for key in sentDict.keys():
                # The token might be in many sets
                if token in sentDict[key]:
                    try:
                        #Fetches the already calculated idfscore
                        idfScore = idfDict[token]
                        #Calculates the tfidfScore
                        tfidf = idfScore*counts[token]/len(counts)
                        returnDict[key] += tfidf
                    except Exception as e:
                        print(e)
                        print(token)
    return returnDict

In [26]:
#Returns the results ordered
def LMs(tokens):
    sentDict = LMsent(tokens)
    sentList = []
    for key in sentCols:
        sentList.append(sentDict[key])
    return sentList

In [27]:
data.Tokens.head().apply(lambda note: LMs(note))

0    [0.562314729628, 0.0, 0.419383576649, 0.0, 0.0...
1    [0.774004424554, 0.0, 0.0, 0.323948061017, 0.7...
2    [0.741754240198, 0.0, 0.0, 0.310450225142, 0.7...
3    [0.741754240198, 0.0, 0.0, 0.310450225142, 0.7...
4    [0.741754240198, 0.0, 0.0, 0.310450225142, 0.7...
Name: Tokens, dtype: object

In [28]:
sentCol = [col  for col in sentCols]

In [29]:
data[sentCol] = data.Tokens.apply(lambda note_tokens: pd.Series(LMs(note_tokens)))

rule 1 applied
rule 1 applied
rule 1 applied
rule 1 applied


In [30]:
data[sentCol].head()

Unnamed: 0,Negative,Positive,Uncertainty,Litigious,Constraining,Superfluous,Interesting,Modal
0,0.562315,0.0,0.419384,0.0,0.0,0.0,0.0,0.0
1,0.774004,0.0,0.0,0.323948,0.764072,0.0,0.0,0.0
2,0.741754,0.0,0.0,0.31045,0.732236,0.0,0.0,0.0
3,0.741754,0.0,0.0,0.31045,0.732236,0.0,0.0,0.0
4,0.741754,0.0,0.0,0.31045,0.732236,0.0,0.0,0.0
