# Testing Sentiment Predictability via Political Language
This is a updated version of an old project I worked on testing the idea that the Language of different Political Voices had different language usage. The idea for this was inspired by **The Righteous Mind** by *Johnathan Haidt* which I read quite a few years ago. Part of the book included the claim that one could detect the political alignment based on the words used and this was my attempt to replicate it by concept alone.

This updated version includes a proper statistical test as opposed to the simple graphing done in the previous version.
Unsurprising, images are useful for many things but justifying a hypothesis is not one of them.

In [23]:
import re
from pathlib import Path

import tika
from tika import parser

import pandas as pd
import scipy as sp

In [3]:
# start the tika VM to pull data from the pdfs
tika.initVM()


# init pathing.
dataPath = Path('__data')
burkePath = dataPath/'burke'
voltairePath = dataPath/'voltaire'
painePath = dataPath/'paine'

# This was pulled from the R programming language since this is where I know it existed.
nrcFrame = dataPath/'ncr.csv'

In [4]:
def extractFromPdfs(inputPath, outFile):
    with open(outFile, 'a+') as out:
        glob = inputPath.glob("*.pdf")

        for f in glob:
            try:
                # get the data from the server
                print(f"Trying to parse file {f}.")
                parsed = parser.from_file( str(f) )

                # files are broken into metadata, content
                # filter for usable terms
                print(f"Pull the Words out of {f}.")
                terms = re.findall('\w+', parsed['content'])

                # write them to the file for usage
                print(f"Writing words to output file.")
                for term in terms:
                    out.write( term + ',\n')
            except:
                print( "The file " + f + " encountered an error.")

In [47]:
# Create the word lists for each thinker
extractFromPdfs(burkePath, 'burke.outputs')
extractFromPdfs(painePath, 'paine.outputs')
extractFromPdfs(voltairePath, 'voltaire.outputs')

Trying to parse file __data/voltaire/worksofvoltairec10volt.pdf.
Pull the Words out of __data/voltaire/worksofvoltairec10volt.pdf.
Writing words to output file.
Trying to parse file __data/voltaire/worksofvoltairec14volt.pdf.
Pull the Words out of __data/voltaire/worksofvoltairec14volt.pdf.
Writing words to output file.
Trying to parse file __data/voltaire/worksofvoltairec07volt.pdf.
Pull the Words out of __data/voltaire/worksofvoltairec07volt.pdf.
Writing words to output file.
Trying to parse file __data/voltaire/worksofvoltairec04volt.pdf.
Pull the Words out of __data/voltaire/worksofvoltairec04volt.pdf.
Writing words to output file.


In [5]:
# Mark each word in the sentiment for counting
ncr = pd.read_csv(nrcFrame)
ncr['count'] = 1
ncr


Unnamed: 0,word,sentiment,count
0,abacus,trust,1
1,abandon,fear,1
2,abandon,negative,1
3,abandon,sadness,1
4,abandoned,anger,1
...,...,...,...
13867,zest,anticipation,1
13868,zest,joy,1
13869,zest,positive,1
13870,zest,trust,1


In [16]:
# Merge burke and ncr for the test
burke = pd.read_csv('burke.outputs', names=['word', 'drop']).drop('drop', axis=1)
burkeCounted = burke.merge(ncr, how='inner')\
    .groupby('sentiment')\
    .sum('count')\
    .sort_values('count', ascending=False)\
    .reset_index().T # I just like transposing since it looks nicer

In [17]:
# same but for voltaire
voltaire = pd.read_csv('voltaire.outputs', names=['word', 'drop']).drop('drop', axis=1)
voltaireCounted = voltaire.merge(ncr, how='inner')\
    .groupby('sentiment')\
    .sum('count')\
    .sort_values('count', ascending=False)\
    .reset_index().T

In [18]:
# same but for paine
paine = pd.read_csv('paine.outputs', names=['word', 'drop']).drop('drop', axis=1)
paineCounted = paine.merge(ncr, how='inner')\
    .groupby('sentiment')\
    .sum('count')\
    .sort_values('count', ascending=False)\
    .reset_index().T

In [168]:
# 


In [22]:
# collect all the data for the test
colnames = paineCounted.iloc[0].tolist()
ct = pd.DataFrame([burkeCounted.iloc[1], voltaireCounted.iloc[1], paineCounted.iloc[1]])
ct.columns=colnames
ct.index = ['burke', 'voltaire', 'paine']

ct

Unnamed: 0,positive,negative,trust,anticipation,fear,anger,joy,sadness,disgust,surprise
burke,30511,21607,20445,12585,12337,9715,9368,9119,5932,5024
voltaire,32530,21227,20682,13517,12629,11914,10535,10098,7056,5716
paine,18375,13770,12851,8439,8121,6372,6348,6256,3767,3573


In [26]:
# Test if the language sentiment is detectibly different
result = sp.stats.chisquare(ct)

In [27]:
print((
    f"The Chi-Squared statistic is {round(result.statistic[0],2)}"
    f" and the p-value is {result.pvalue[0]:.3E}."
))

The Chi-Squared statistic is 4320.07 and the p-value is 0.000E+00.


In [28]:
# with these values, we fail to reject the null hypothesis

## Does this change when I randomly sample from the data frame?
We shouldn't do this after the fact and should in fact sample as a matter of methodology. But, oh well.
This is a personal project and not a submitted paper so let's play around.

In [29]:
sampleCount = 30000
burkeSampled = burke.sample(sampleCount)
voltaireSampled = voltaire.sample(sampleCount)
paineSampled = paine.sample(sampleCount)

In [30]:
burkeCounted = burkeSampled.merge(ncr, how='inner').groupby('sentiment').sum('count').sort_values('count', ascending=False).reset_index().T
voltaireCounted = voltaireSampled.merge(ncr, how='inner').groupby('sentiment').sum('count').sort_values('count', ascending=False).reset_index().T
paineCounted = paineSampled.merge(ncr, how='inner').groupby('sentiment').sum('count').sort_values('count', ascending=False).reset_index().T

In [32]:
# just to sanity check the lengths some;
# words get dropped since they are not in our sentiment dictionary
len(burkeSampled.merge(ncr, how='inner')), len(voltaireSampled.merge(ncr, how='inner')), len(paineSampled.merge(ncr, how='inner'))

(6952, 7129, 6673)

In [33]:
ct = pd.DataFrame([burkeCounted.iloc[1], voltaireCounted.iloc[1], paineCounted.iloc[1]])
ct.columns=colnames
ct.index = ['burke', 'voltaire', 'paine']

result = sp.stats.chisquare(ct)

In [34]:
print((
    f"The Chi-Squared statistic is {round(result.statistic[0],2)}"
    f" and the p-value is {result.pvalue[0]:.3E}."
))

The Chi-Squared statistic is 17.02 and the p-value is 2.013E-04.


In [35]:
# Check only the positive, negative dimension
result = sp.stats.chisquare( ct[['positive', 'negative']] )
print((
    f"The Chi-Squared statistic is {round(result.statistic[0],2)}"
    f" and the p-value is {result.pvalue[0]:.3E}."
))

The Chi-Squared statistic is 17.02 and the p-value is 2.013E-04.


In [36]:
# check the non-positive,negative dimensions
result = sp.stats.chisquare( ct.drop(['positive', 'negative'], axis=1) )
print((
    f"The Chi-Squared statistic is {round(result.statistic[0],2)}"
    f" and the p-value is {result.pvalue[0]:.3E}."
))

The Chi-Squared statistic is 2.98 and the p-value is 2.252E-01.


None of these are strong enough to grant the claim.