In [1]:
#Special module written for this class
#This provides access to data and to helper functions from previous weeks
#Make sure you update it before starting this notebook
import lucem_illud #pip install -U git+git://github.com/Computational-Content-Analysis-2018/lucem_illud.git

#All these packages need to be installed from pip
#For NLP
import nltk

import numpy as np #For arrays
import pandas as pd #Gives us DataFrames
import matplotlib.pyplot as plt #For graphics
import seaborn #Makes the graphics look nicer

#Displays the graphs
import graphviz #You also need to install the command line graphviz

#These are from the standard library
import os.path
import zipfile
import subprocess
import io
import tempfile

%matplotlib inline

import sklearn

# Fetch the Stanford NLP components used below. Per this cell's output, the
# parser, NER, POS-tagger and core packages are placed under ..\stanford-NLP
# and any component that already exists is skipped.
lucem_illud.setupStanfordNLP()

# NOTE(review): imported after the setup call above, presumably because the
# module needs the Stanford files to be present at import time -- confirm.
import lucem_illud.stanford as stanford

Starting downloads, this will take 5-10 minutes
..\stanford-NLP\parser already exists, skipping download
..\stanford-NLP\ner already exists, skipping download
..\stanford-NLP\postagger already exists, skipping download
..\stanford-NLP\core already exists, skipping download
[100%]Done setting up the Stanford NLP collection


The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordNERTagger, self).__init__(*args, **kwargs)
The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordPOSTagger, self).__init__(*args, **kwargs)


In [2]:
# Locations of the pickled input frames. These are absolute, machine-specific
# paths: adjust them before running the notebook anywhere else.
# NOTE: pickle files can execute code on load -- only read trusted files.
soc_pickle_path = r'C:\Users\Timot\Documents\final_project\soc_data\mechanisms\mech_soc_df.pk1'
cog_pickle_path = r'C:\Users\Timot\Documents\final_project\psych_data\mechanisms\mech_cog_df.pk1'
pos_pickle_path = r'C:\Users\Timot\Documents\final_project\pos_df.pk1'

mech_soc_df = pd.read_pickle(soc_pickle_path)
mech_cog_df = pd.read_pickle(cog_pickle_path)
full_pos_df = pd.read_pickle(pos_pickle_path)

In [None]:
# Stack the two "mechanism" frames into one, keeping each row's original index
# label (no re-indexing), exactly as DataFrame.append did.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent and also works on older versions.
mech_soc_cog_df = mech_soc_df  # alias (not a copy), kept for any later cell that uses this name
full_df = pd.concat([mech_soc_cog_df, mech_cog_df])

In [None]:
# Preview only the first rows instead of dumping the whole combined frame,
# which keeps the saved notebook output small and readable.
full_df.head()

In [3]:
def _split_into_token_lists(text):
    """Return one list of word tokens for every sentence found in ``text``."""
    return [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)]


# Add a 'sentences' column (list of token lists) to both frames, in place.
for frame in (mech_soc_df, mech_cog_df):
    frame['sentences'] = frame['contents'].apply(_split_into_token_lists)


In [4]:
# Tag every tokenized sentence of every document with the Stanford POS tagger
# and store the result in a new 'POS_sents' column on each frame.
# This step is slow: the recorded run of this cell ended in a KeyboardInterrupt.
tag_document = stanford.postTagger.tag_sents
mech_soc_df['POS_sents'] = mech_soc_df['sentences'].apply(tag_document)
mech_cog_df['POS_sents'] = mech_cog_df['sentences'].apply(tag_document)


KeyboardInterrupt: 

In [None]:
# Persist the frames (including the 'POS_sents' column when the tagging cell
# has completed) so the tagging step does not have to be repeated.
# The destinations are absolute, machine-specific paths.
soc_tagged_path = r'C:\Users\Timot\Documents\final_project\soc_data\mechanisms\mech_pos_soc_df.pk1'
cog_tagged_path = r'C:\Users\Timot\Documents\final_project\psych_data\mechanisms\mech_pos_cog_df.pk1'

mech_soc_df.to_pickle(soc_tagged_path)
mech_cog_df.to_pickle(cog_tagged_path)

In [5]:
# Reload the previously saved frames, replacing the in-memory versions.
# NOTE: pickle files can execute code on load -- only read trusted files.
soc_tagged_path = r'C:\Users\Timot\Documents\final_project\soc_data\mechanisms\mech_pos_soc_df.pk1'
cog_tagged_path = r'C:\Users\Timot\Documents\final_project\psych_data\mechanisms\mech_pos_cog_df.pk1'

mech_soc_df = pd.read_pickle(soc_tagged_path)
mech_cog_df = pd.read_pickle(cog_tagged_path)

In [None]:
# Sanity check: number of tagged sentences in the first (positional) document.
first_doc_tagged = mech_soc_df['POS_sents'].iloc[0]
len(first_doc_tagged)

In [None]:
# Count how often each token carries the POS tag `countTarget` across all
# documents in mech_soc_df. With 'JJ' this counts adjectives; change the tag
# (e.g. to 'NN') to count another part of speech.
countTarget = 'JJ'
targetCounts = {}
for tagged_doc in mech_soc_df['POS_sents']:
    for tagged_sentence in tagged_doc:
        for token, tag in tagged_sentence:
            if tag == countTarget:
                targetCounts[token] = targetCounts.get(token, 0) + 1

# Most frequent first; ties keep first-seen order because the sort is stable.
sortedTargets = sorted(targetCounts.items(), key=lambda pair: pair[1], reverse=True)
sortedTargets[:20]

In [7]:
# Collect every token tagged `NTarget` (adjective, 'JJ') that immediately
# precedes the noun `Word` (compared case-insensitively) in mech_cog_df.
NTarget = 'JJ'
Word = 'mechanism'
NResults = set()
for tagged_doc in mech_cog_df['POS_sents']:
    for tagged_sentence in tagged_doc:
        # Pair each (token, tag) with its right-hand neighbour.
        neighbours = zip(tagged_sentence, tagged_sentence[1:])
        NResults.update(
            left_token
            for (left_token, left_tag), (right_token, _right_tag) in neighbours
            if (left_tag, right_token.lower()) == (NTarget, Word)
        )

print(NResults) 

{'putative', 'underlying', 'alternative'}


In [None]:
def treeRelation(parsetree, relationType, *targets):
    """Find constituents of a given type that contain all target words.

    Parameters
    ----------
    parsetree : tree-like object exposing ``leaves()``, ``subtrees()`` and
        ``label()``, or a list whose first element is such an object (only
        that first element is used).
    relationType : label a subtree must have to be reported (e.g. ``'NP'``).
    *targets : leaf tokens that must all occur inside a reported subtree.

    Returns
    -------
    list
        One single-element list ``[(label, text)]`` per matching subtree, in
        ``subtrees()`` order, where ``text`` is the subtree's leaves joined by
        spaces. Empty if any target is missing from the tree.
    """
    # NOTE(review): a tree type that itself subclasses list would also take
    # this branch and be replaced by its first child -- confirm intended.
    if isinstance(parsetree, list):
        parsetree = parsetree[0]

    wanted = set(targets)
    # Bail out early when the sentence does not contain every target word.
    if not wanted.issubset(parsetree.leaves()):
        return []

    return [
        [(subT.label(), ' '.join(subT.leaves()))]
        for subT in parsetree.subtrees()
        if subT.label() == relationType and wanted.issubset(subT.leaves())
    ]

In [None]:
def treeSubRelation(parsetree, relationTypeScope, relationTypeTarget, *targets):
    """Collect target-type phrases inside scope-type constituents holding all targets.

    Parameters
    ----------
    parsetree : tree-like object exposing ``leaves()``, ``subtrees()`` and
        ``label()``, or a list whose first element is such an object (only
        that first element is used).
    relationTypeScope : label of the enclosing subtree to search within.
    relationTypeTarget : label of the nested subtrees to report.
    *targets : leaf tokens that must all occur inside the scope subtree.

    Returns
    -------
    set of str, or an empty list
        The space-joined leaves of every ``relationTypeTarget`` subtree found
        inside a matching scope subtree. When a target word is missing from
        the whole tree an empty *list* is returned instead of a set.
    """
    if isinstance(parsetree, list):
        parsetree = parsetree[0]

    wanted = set(targets)
    if not wanted.issubset(parsetree.leaves()):
        return []

    return {
        ' '.join(inner.leaves())
        for scope in parsetree.subtrees()
        if wanted.issubset(scope.leaves()) and scope.label() == relationTypeScope
        for inner in scope.subtrees()
        if inner.label() == relationTypeTarget
    }

In [None]:
# Dependency-parse every tokenized sentence of the mech_cog_df document
# selected by [3], and materialize the parser's results as a list so that
# individual sentences can be looked up by position.
doc_sentences = mech_cog_df['sentences'][3]
cogDepParse = list(stanford.depParser.parse_sents(doc_sentences))

In [None]:
# Position of the sentence to inspect within the document selected by [3].
targetSentence = 187
target_tokens = mech_cog_df['sentences'][3][targetSentence]
print(' '.join(target_tokens))

In [None]:
# Build a graphviz graph from the dependency parse of the target sentence and
# write/open it on disk.
# NOTE(review): each cogDepParse entry is presumably a one-shot iterator, so
# list(...) comes back empty on a second run -- hence the IndexError hint.
try:
    graph = graphviz.Source(list(cogDepParse[targetSentence])[0].to_dot())
except IndexError:
    print("You likely have to rerun the depParses")
    raise
except Exception:
    # Catch Exception rather than using a bare except so that
    # KeyboardInterrupt / SystemExit still propagate.
    graph = None
    print("There was a problem with graphviz, likely your missing the program, https://www.graphviz.org/download/")

# Only render when a graph was actually built; calling .render on None would
# otherwise hide the real problem behind an AttributeError.
graph.render(r'C:\Users\Timot\Documents\final_project\sentence3index187.gv', view=True) if graph is not None else None

In [7]:
# Print every sentence of the document selected by [3] that contains the exact
# token 'mechanism', followed by that sentence's position in the document.
count = 0
for tokens in mech_cog_df['sentences'][3]:
    if 'mechanism' in tokens:
        print(tokens, count, sep='\n')
    count += 1

['However', ',', 'they', 'are', 'not', 'sufficient', 'for', 'the', 'conclusion', 'that', 'the', 'training', 'impacts', 'drinking', 'behaviour', 'through', 'the', 'theoretically', 'specified', 'putative', 'mechanism', 'of', 'changing', 'underlying', 'alcohol', 'action', 'tendency', '.']
14
['Evidence', 'for', 'the', 'mechanism', 'of', 'change', 'is', 'important', 'as', 'it', 'provides', 'a', 'basis', 'for', 'optimising', 'of', 'treatment', 'effects', 'and', 'for', 'ensuring', 'that', 'the', 'critical', 'features', 'of', 'the', 'procedure', 'are', 'maintained', 'in', 'clinical', 'practice', '[', '17', ']', '.Several', 'recent', 'reviews', 'have', 'highlighted', 'criteria', 'that', 'should', 'be', 'adopted', 'when', 'seeking', 'to', 'establish', 'the', 'mechanisms', 'of', 'change', 'that', 'underpin', 'treatment', 'effects', 'generally', '[', '17', ']', '–', '[', '19', ']', ',', 'and', 'for', 'training', 'paradigms', 'specifically', '[', '20', ']', '–', '[', '22', ']', '.']
15
['Second', 