### Gruppo 4 Text Analytics 2022/23
- Simona Sette
- Giulio Canapa
- Sara Quattrone
- Diego Borsetto

# Text Exploration

Investigating the contents and nature of the newspaper paragraphs texts. 

### Import

In [None]:
import pandas as pd
import csv
import spacy
from itertools import *
from spacy import displacy
from collections import Counter
import eng_spacysentiment

In [None]:
df = pd.read_csv("Multiclass_problem_7Classes.csv" ,sep=',', header=0)
df

Generation of dataframes containing persuasion techniques (y) and paragraph textual content along with other context data (X).

In [180]:
y_total=df['Technique']

In [181]:
#Select all columns except target classes
X_total=df.loc[:, df.columns != 'Technique']
X_total

Unnamed: 0,Article,Paragraph,Text,ID
0,111111111,3,Geneva - The World Health Organisation chief o...,0
1,111111111,13,"But Tedros voiced alarm that ""plague in Madaga...",1
2,111111111,17,He also pointed to the presence of the pneumon...,2
3,111111111,19,He praised the rapid response from WHO and Mad...,3
4,111111111,25,That means that Madagascar could be affected m...,4
...,...,...,...,...
1873,999001621,41,The story was completely false and the Guardia...,1873
1874,999001970,3,Andy Warhol was only half-right. In the future...,1874
1875,999001970,5,Saturday Night Live writer and comedian Nimesh...,1875
1876,999001970,6,That's what Columbia snowflakes thought was of...,1876


In [None]:
#Convert texts into lists 
X_total_lista = list()
for x in df['Text']:
    X_total_lista.append(str(x))  

--------------------------------------------------------------------

## Content Word Analysis

As part of our analysis we performed the count of the total content words within the paragraph, the total and percentage of unique content words, the total and percentage of names, verbs and adjectives (and unique ones) separately. 

This first section will focus on the extraction of these information regardless of the persuasion technique, while the following will consider each class separately (and will follow the same data preparation done for the general one).

In [183]:
# Load English tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_sm")

In [184]:
total = "".join(X_total_lista)
total_text = nlp(total)
print("Total of tokens: ", len(total_text))

Total of tokens:  104659


In [185]:
# Counting the total of Content Words simultaneously and separately
content_words_total = [(token.lemma_,token.pos_) for token in total_text if token.pos_ in ["VERB","ADJ","NOUN"]]
noun_total = [(token.lemma_,token.pos_) for token in total_text if token.pos_ in ["NOUN"]]
verb_total = [(token.lemma_,token.pos_) for token in total_text if token.pos_ in ["VERB"]]
adj_total = [(token.lemma_,token.pos_) for token in total_text if token.pos_ in ["ADJ"]]

# Printing the total, percentage and unique Content Words simultaneously and separately
print("ALL CONTENT WORDS")
print("Content word total: ", len(content_words_total))
print("Percentage of content words: ", (len(content_words_total)/len(total_text)*100))
print("Unique content word total: ", len(set(content_words_total)))
print("Percentage of unique content words: ", len(set(content_words_total))/len(content_words_total)*100)
print("\nNOUNS")
print("Noun total: ", len(noun_total))
print("Percentage of nouns: ", (len(noun_total)/len(total_text)*100))
print("Unique noun total: ", len(set(noun_total)))
print("Percentage of unique nouns: ", len(set(noun_total))/len(noun_total)*100)
print("\nVERBS")
print("Verb total: ", len(verb_total))
print("Percentage of verbs: ", (len(verb_total)/len(total_text)*100))
print("Unique verb total: ", len(set(verb_total)))
print("Percentage of unique nouns: ", len(set(verb_total))/len(verb_total)*100)
print("\nADJECTIVES")
print("Adjective total: ", len(adj_total))
print("Percentage of adjectives: ", (len(adj_total)/len(total_text)*100))
print("Unique adjective total: ", len(set(adj_total)))
print("Percentage of unique adjectives: ", len(set(adj_total))/len(adj_total)*100)

ALL CONTENT WORDS
Content word total:  34960
Percentage of content words:  33.403720654697636
Unique content word total:  6745
Percentage of unique content words:  19.293478260869566

NOUNS
Noun total:  16940
Percentage of nouns:  16.185898967121794
Unique noun total:  3466
Percentage of unique nouns:  20.460448642266822

VERBS
Verb total:  11234
Percentage of verbs:  10.73390726072292
Unique verb total:  1643
Percentage of unique nouns:  14.62524479259391

ADJECTIVES
Adjective total:  6786
Percentage of adjectives:  6.483914426852923
Unique adjective total:  1636
Percentage of unique adjectives:  24.10845859121721


In [186]:
# Counting and printing the most common Content Words simultaneously and separately
count_content_words_total = Counter(content_words_total)
count_noun_total = Counter(noun_total)
count_verb_total = Counter(verb_total)
count_adj_total = Counter(adj_total)

print("MOST COMMON CONTENT WORDS")
print(count_content_words_total.most_common(31))
print("\nMOST COMMON NOUNS")
print(count_noun_total.most_common(20))
print("\nMOST COMMON VERBS")
print(count_verb_total.most_common(20))
print("\nMOST COMMON ADJECTIVES")
print(count_adj_total.most_common(21))

MOST COMMON CONTENT WORDS
[(('say', 'VERB'), 506), (('have', 'VERB'), 301), (('make', 'VERB'), 185), (('do', 'VERB'), 180), (('go', 'VERB'), 173), (('year', 'NOUN'), 170), (('take', 'VERB'), 161), (('people', 'NOUN'), 155), (('know', 'VERB'), 145), (('time', 'NOUN'), 138), (('tell', 'VERB'), 129), (('be', 'VERB'), 127), (('other', 'ADJ'), 125), (('come', 'VERB'), 120), (('man', 'NOUN'), 118), (('give', 'VERB'), 117), (('see', 'VERB'), 109), (('think', 'VERB'), 108), (('get', 'VERB'), 102), (('-', 'ADJ'), 101), (('case', 'NOUN'), 96), (('many', 'ADJ'), 95), (('report', 'NOUN'), 94), (('fact', 'NOUN'), 89), (('american', 'ADJ'), 88), (('former', 'ADJ'), 86), (('more', 'ADJ'), 83), (('call', 'VERB'), 82), (('’', 'VERB'), 82), (('include', 'VERB'), 82), (('write', 'VERB'), 81)]

MOST COMMON NOUNS
[(('year', 'NOUN'), 170), (('people', 'NOUN'), 155), (('time', 'NOUN'), 138), (('man', 'NOUN'), 118), (('case', 'NOUN'), 96), (('report', 'NOUN'), 94), (('fact', 'NOUN'), 89), (('law', 'NOUN'), 80

### Loaded_Language

This first section will focus on the extraction of the information described above focusing on the Loaded_Language class.

In [187]:
df_loaded_language = df.loc[(df['Technique'] == "Loaded_Language")]
df_loaded_language

Unnamed: 0,Article,Paragraph,Technique,Text,ID
5,111111112,15,Loaded_Language,On both of their blogs the pair called their b...,5
9,111111112,30,Loaded_Language,"A researcher with the organisation, Matthew Co...",9
10,111111113,10,Loaded_Language,Lead attorney Matt Gonzalez has argued that th...,10
13,111111115,9,Loaded_Language,"Leeann Tweeden, a radio news anchor, says Fran...",13
14,111111115,13,Loaded_Language,"A woman described as a ""former elected officia...",14
...,...,...,...,...,...
1845,999001619,8,Loaded_Language,"Nonetheless, this unverified allegation has be...",1845
1848,999001619,16,Loaded_Language,"For the best part of a decade, any claims by A...",1848
1857,999001619,37,Loaded_Language,"Glenn Greenwald, who once had an influential c...",1857
1867,999001621,16,Loaded_Language,The new sensational claim was immediately pick...,1867


In [188]:
X_loaded_language = list()
for x in df_loaded_language['Text']:
    X_loaded_language.append(str(x))  

In [189]:
doc_loaded_language = list()
for i in X_loaded_language:
    doc_loaded_language.append(nlp(i))

In [190]:
total_loaded = "".join(X_loaded_language)
total_loaded_text = nlp(total_loaded)
print("Total of tokens: ", len(total_loaded_text))

Total of tokens:  46782


In [191]:
content_words_total_loaded = [(token.lemma_,token.pos_) for token in total_loaded_text if token.pos_ in ["VERB","ADJ","NOUN"]]
noun_total_loaded = [(token.lemma_,token.pos_) for token in total_loaded_text if token.pos_ in ["NOUN"]]
verb_total_loaded = [(token.lemma_,token.pos_) for token in total_loaded_text if token.pos_ in ["VERB"]]
adj_total_loaded = [(token.lemma_,token.pos_) for token in total_loaded_text if token.pos_ in ["ADJ"]]

print("ALL CONTENT WORDS")
print("Content word total: ", len(content_words_total_loaded))
print("Percentage of content words: ", (len(content_words_total_loaded)/len(total_loaded_text)*100))
print("Unique content word total: ", len(set(content_words_total_loaded)))
print("Percentage of unique content words: ", len(set(content_words_total_loaded))/len(content_words_total_loaded)*100)
print("\nNOUNS")
print("Noun total: ", len(noun_total_loaded))
print("Percentage of nouns: ", (len(noun_total_loaded)/len(total_loaded_text)*100))
print("Unique noun total: ", len(set(noun_total_loaded)))
print("Percentage of unique nouns: ", len(set(noun_total_loaded))/len(noun_total_loaded)*100)
print("\nVERBS")
print("Verb total: ", len(verb_total_loaded))
print("Percentage of verbs: ", (len(verb_total_loaded)/len(total_loaded_text)*100))
print("Unique verb total: ", len(set(verb_total_loaded)))
print("Percentage of unique nouns: ", len(set(verb_total_loaded))/len(verb_total_loaded)*100)
print("\nADJECTIVES")
print("Adjective total: ", len(adj_total_loaded))
print("Percentage of adjectives: ", (len(adj_total_loaded)/len(total_loaded_text)*100))
print("Unique adjective total: ", len(set(adj_total_loaded)))
print("Percentage of unique adjectives: ", len(set(adj_total_loaded))/len(adj_total_loaded)*100)

ALL CONTENT WORDS
Content word total:  15701
Percentage of content words:  33.5620537813689
Unique content word total:  4661
Percentage of unique content words:  29.686007260684033

NOUNS
Noun total:  7554
Percentage of nouns:  16.147236116455048
Unique noun total:  2325
Percentage of unique nouns:  30.778395552025415

VERBS
Verb total:  4956
Percentage of verbs:  10.593818135180197
Unique verb total:  1213
Percentage of unique nouns:  24.47538337368846

ADJECTIVES
Adjective total:  3191
Percentage of adjectives:  6.820999529733658
Unique adjective total:  1123
Percentage of unique adjectives:  35.19272955186462


In [192]:
count_content_words_total_loaded = Counter(content_words_total_loaded)
count_noun_total_loaded = Counter(noun_total_loaded)
count_verb_total_loaded = Counter(verb_total_loaded)
count_adj_total_loaded = Counter(adj_total_loaded)

print("MOST COMMON CONTENT WORDS")
print(count_content_words_total_loaded.most_common(31))
print("\nMOST COMMON NOUNS")
print(count_noun_total_loaded.most_common(20))
print("\nMOST COMMON VERBS")
print(count_verb_total_loaded.most_common(20))
print("\nMOST COMMON ADJECTIVES")
print(count_adj_total_loaded.most_common(21))

MOST COMMON CONTENT WORDS
[(('say', 'VERB'), 211), (('have', 'VERB'), 124), (('make', 'VERB'), 87), (('year', 'NOUN'), 76), (('time', 'NOUN'), 75), (('do', 'VERB'), 73), (('go', 'VERB'), 72), (('take', 'VERB'), 66), (('be', 'VERB'), 62), (('people', 'NOUN'), 61), (('other', 'ADJ'), 58), (('give', 'VERB'), 54), (('tell', 'VERB'), 54), (('know', 'VERB'), 53), (('come', 'VERB'), 53), (('man', 'NOUN'), 50), (('see', 'VERB'), 46), (('-', 'ADJ'), 45), (('law', 'NOUN'), 44), (('former', 'ADJ'), 43), (('fact', 'NOUN'), 43), (('public', 'ADJ'), 43), (('new', 'ADJ'), 41), (('continue', 'VERB'), 40), (('get', 'VERB'), 40), (('investigation', 'NOUN'), 40), (('abuse', 'NOUN'), 39), (('think', 'VERB'), 38), (('many', 'ADJ'), 38), (('part', 'NOUN'), 38), (('write', 'VERB'), 37)]

MOST COMMON NOUNS
[(('year', 'NOUN'), 76), (('time', 'NOUN'), 75), (('people', 'NOUN'), 61), (('man', 'NOUN'), 50), (('law', 'NOUN'), 44), (('fact', 'NOUN'), 43), (('investigation', 'NOUN'), 40), (('abuse', 'NOUN'), 39), (('

### Name_Calling-Labeling

This first section will focus on the extraction of the information described above focusing on the Name_Calling-Labeling class.

In [193]:
df_name_calling = df.loc[(df['Technique'] == "Name_Calling-Labeling")]
df_name_calling

Unnamed: 0,Article,Paragraph,Technique,Text,ID
7,111111112,23,Name_Calling-Labeling,It's embarrassing for this so-called land of d...,7
8,111111112,25,Name_Calling-Labeling,How many hate preachers are living in this cou...,8
11,111111113,19,Name_Calling-Labeling,"San Francisco prosecutors, who had long ago de...",11
20,111111122,31,Name_Calling-Labeling,Some have speculated that Kavanaugh would be e...,20
24,111111124,5,Name_Calling-Labeling,The documents Trump ordered declassified invol...,24
...,...,...,...,...,...
1862,999001619,57,Name_Calling-Labeling,"That clear partisanship should be no surprise,...",1862
1863,999001619,62,Name_Calling-Labeling,UPDATE: Excellent background from investigativ...,1863
1871,999001621,34,Name_Calling-Labeling,In a series of tweets WikiLeaks said Assange a...,1871
1872,999001621,40,Name_Calling-Labeling,A day after the Guardian smear piece the Washi...,1872


In [194]:
X_name_calling = list()
for x in df_name_calling['Text']:
    X_name_calling.append(str(x))  

In [195]:
doc_name_calling = list()
for i in X_name_calling:
    doc_name_calling.append(nlp(i))

In [196]:
total_calling = "".join(X_name_calling)
total_calling_text = nlp(total_calling)
print("Total of tokens: ", len(total_calling_text))

Total of tokens:  15846


In [197]:
content_words_total_calling = [(token.lemma_,token.pos_) for token in total_calling_text if token.pos_ in ["VERB","ADJ","NOUN"]]
noun_total_calling = [(token.lemma_,token.pos_) for token in total_calling_text if token.pos_ in ["NOUN"]]
verb_total_calling = [(token.lemma_,token.pos_) for token in total_calling_text if token.pos_ in ["VERB"]]
adj_total_calling = [(token.lemma_,token.pos_) for token in total_calling_text if token.pos_ in ["ADJ"]]

print("ALL CONTENT WORDS")
print("Content word total: ", len(content_words_total_calling))
print("Percentage of content words: ", (len(content_words_total_calling)/len(total_calling_text)*100))
print("Unique content word total: ", len(set(content_words_total_calling)))
print("Percentage of unique content words: ", len(set(content_words_total_calling))/len(content_words_total_calling)*100)
print("\nNOUNS")
print("Noun total: ", len(noun_total_calling))
print("Percentage of nouns: ", (len(noun_total_calling)/len(total_calling_text)*100))
print("Unique noun total: ", len(set(noun_total_calling)))
print("Percentage of unique nouns: ", len(set(noun_total_calling))/len(noun_total_calling)*100)
print("\nVERBS")
print("Verb total: ", len(verb_total_calling))
print("Percentage of verbs: ", (len(verb_total_calling)/len(total_calling_text)*100))
print("Unique verb total: ", len(set(verb_total_calling)))
print("Percentage of unique nouns: ", len(set(verb_total_calling))/len(verb_total_calling)*100)
print("\nADJECTIVES")
print("Adjective total: ", len(adj_total_calling))
print("Percentage of adjectives: ", (len(adj_total_calling)/len(total_calling_text)*100))
print("Unique adjective total: ", len(set(adj_total_calling)))
print("Percentage of unique adjectives: ", len(set(adj_total_calling))/len(adj_total_calling)*100)

ALL CONTENT WORDS
Content word total:  5165
Percentage of content words:  32.59497665025874
Unique content word total:  2251
Percentage of unique content words:  43.58180058083253

NOUNS
Noun total:  2504
Percentage of nouns:  15.802095165972485
Unique noun total:  1141
Percentage of unique nouns:  45.56709265175719

VERBS
Verb total:  1637
Percentage of verbs:  10.330682822163322
Unique verb total:  612
Percentage of unique nouns:  37.38546120952962

ADJECTIVES
Adjective total:  1024
Percentage of adjectives:  6.462198662122934
Unique adjective total:  498
Percentage of unique adjectives:  48.6328125


In [198]:
count_content_words_total_calling = Counter(content_words_total_calling)
count_noun_total_calling = Counter(noun_total_calling)
count_verb_total_calling = Counter(verb_total_calling)
count_adj_total_calling = Counter(adj_total_calling)

print("MOST COMMON CONTENT WORDS")
print(count_content_words_total_calling.most_common(31))
print("\nMOST COMMON NOUNS")
print(count_noun_total_calling.most_common(20))
print("\nMOST COMMON VERBS")
print(count_verb_total_calling.most_common(20))
print("\nMOST COMMON ADJECTIVES")
print(count_adj_total_calling.most_common(21))

MOST COMMON CONTENT WORDS
[(('say', 'VERB'), 54), (('have', 'VERB'), 36), (('year', 'NOUN'), 30), (('-', 'ADJ'), 27), (('make', 'VERB'), 26), (('do', 'VERB'), 25), (('former', 'ADJ'), 24), (('man', 'NOUN'), 23), (('take', 'VERB'), 23), (('think', 'VERB'), 21), (('other', 'ADJ'), 20), (('go', 'VERB'), 20), (('call', 'VERB'), 18), (('know', 'VERB'), 18), (('member', 'NOUN'), 18), (('priest', 'NOUN'), 18), (('time', 'NOUN'), 17), (('document', 'NOUN'), 16), (('tell', 'VERB'), 16), (('authority', 'NOUN'), 15), (('come', 'VERB'), 15), (('sexual', 'ADJ'), 15), (('find', 'VERB'), 15), (('child', 'NOUN'), 15), (('message', 'NOUN'), 14), (('give', 'VERB'), 14), (('medium', 'NOUN'), 14), (('try', 'VERB'), 14), (('president', 'NOUN'), 13), (('believe', 'VERB'), 13), (('include', 'VERB'), 13)]

MOST COMMON NOUNS
[(('year', 'NOUN'), 30), (('man', 'NOUN'), 23), (('member', 'NOUN'), 18), (('priest', 'NOUN'), 18), (('time', 'NOUN'), 17), (('document', 'NOUN'), 16), (('authority', 'NOUN'), 15), (('chil

### Repetition

This first section will focus on the extraction of the information described above focusing on the Repetition class.

In [199]:
df_repetition = df.loc[(df['Technique'] == "Repetition")]
df_repetition

Unnamed: 0,Article,Paragraph,Technique,Text,ID
1,111111111,13,Repetition,"But Tedros voiced alarm that ""plague in Madaga...",1
39,111111131,26,Repetition,"He is very talented, Trump said, citing Kim's ...",39
40,111111131,27,Repetition,Kim assumed power after his father Kim Jong Il...,40
43,111111131,42,Repetition,Kim was cheered by onlookers who caught sight ...,43
61,111111134,11,Repetition,“I have never met Julian Assange or anyone con...,61
...,...,...,...,...,...
1813,999001280,16,Repetition,Every media outlet from the Associated Press t...,1813
1814,999001280,17,Repetition,"But it's more than that, self-proclaimed ""fact...",1814
1825,999001290,21,Repetition,In speaking about the rules and regulations th...,1825
1834,999001297,13,Repetition,The first group of the migrant caravan arrived...,1834


In [200]:
X_repetition = list()
for x in df_repetition['Text']:
    X_repetition.append(str(x))

In [201]:
doc_repetition = list()
for i in X_repetition:
    doc_repetition.append(nlp(i))

In [202]:
total_repetition = "".join(X_repetition)
total_repetition_text = nlp(total_repetition)
print("Total of tokens: ", len(total_repetition_text))

Total of tokens:  12820


In [203]:
content_words_total_repetition = [(token.lemma_,token.pos_) for token in total_repetition_text if token.pos_ in ["VERB","ADJ","NOUN"]]
noun_total_repetition = [(token.lemma_,token.pos_) for token in total_repetition_text if token.pos_ in ["NOUN"]]
verb_total_repetition = [(token.lemma_,token.pos_) for token in total_repetition_text if token.pos_ in ["VERB"]]
adj_total_repetition = [(token.lemma_,token.pos_) for token in total_repetition_text if token.pos_ in ["ADJ"]]

print("ALL CONTENT WORDS")
print("Content word total: ", len(content_words_total_repetition))
print("Percentage of content words: ", (len(content_words_total_repetition)/len(total_repetition_text)*100))
print("Unique content word total: ", len(set(content_words_total_repetition)))
print("Percentage of unique content words: ", len(set(content_words_total_repetition))/len(content_words_total_repetition)*100)
print("\nNOUNS")
print("Noun total: ", len(noun_total_repetition))
print("Percentage of nouns: ", (len(noun_total_repetition)/len(total_repetition_text)*100))
print("Unique noun total: ", len(set(noun_total_repetition)))
print("Percentage of unique nouns: ", len(set(noun_total_repetition))/len(noun_total_repetition)*100)
print("\nVERBS")
print("Verb total: ", len(verb_total_repetition))
print("Percentage of verbs: ", (len(verb_total_repetition)/len(total_repetition_text)*100))
print("Unique verb total: ", len(set(verb_total_repetition)))
print("Percentage of unique nouns: ", len(set(verb_total_repetition))/len(verb_total_repetition)*100)
print("\nADJECTIVES")
print("Adjective total: ", len(adj_total_repetition))
print("Percentage of adjectives: ", (len(adj_total_repetition)/len(total_repetition_text)*100))
print("Unique adjective total: ", len(set(adj_total_repetition)))
print("Percentage of unique adjectives: ", len(set(adj_total_repetition))/len(adj_total_repetition)*100)

ALL CONTENT WORDS
Content word total:  4336
Percentage of content words:  33.82215288611544
Unique content word total:  1811
Percentage of unique content words:  41.76660516605166

NOUNS
Noun total:  2148
Percentage of nouns:  16.755070202808113
Unique noun total:  936
Percentage of unique nouns:  43.575418994413404

VERBS
Verb total:  1352
Percentage of verbs:  10.546021840873635
Unique verb total:  482
Percentage of unique nouns:  35.650887573964496

ADJECTIVES
Adjective total:  836
Percentage of adjectives:  6.521060842433697
Unique adjective total:  393
Percentage of unique adjectives:  47.00956937799043


In [204]:
count_content_words_total_repetition = Counter(content_words_total_repetition)
count_noun_total_repetition = Counter(noun_total_repetition)
count_verb_total_repetition = Counter(verb_total_repetition)
count_adj_total_repetition = Counter(adj_total_repetition)

print("MOST COMMON CONTENT WORDS")
print(count_content_words_total_repetition.most_common(31))
print("\nMOST COMMON NOUNS")
print(count_noun_total_repetition.most_common(20))
print("\nMOST COMMON VERBS")
print(count_verb_total_repetition.most_common(20))
print("\nMOST COMMON ADJECTIVES")
print(count_adj_total_repetition.most_common(21))

MOST COMMON CONTENT WORDS
[(('say', 'VERB'), 68), (('have', 'VERB'), 45), (('year', 'NOUN'), 27), (('do', 'VERB'), 25), (('people', 'NOUN'), 24), (('go', 'VERB'), 24), (('make', 'VERB'), 23), (('know', 'VERB'), 23), (('be', 'VERB'), 21), (('other', 'ADJ'), 21), (('autonomy', 'NOUN'), 21), (('think', 'VERB'), 20), (('man', 'NOUN'), 19), (('school', 'NOUN'), 19), (('monastery', 'NOUN'), 19), (('life', 'NOUN'), 18), (('parent', 'NOUN'), 18), (('take', 'VERB'), 17), (('come', 'VERB'), 17), (('get', 'VERB'), 16), (('case', 'NOUN'), 16), (('see', 'VERB'), 16), (('illegal', 'ADJ'), 16), (('alien', 'NOUN'), 16), (('give', 'VERB'), 16), (('priest', 'NOUN'), 16), (('-', 'ADJ'), 15), (('tell', 'VERB'), 15), (('last', 'ADJ'), 14), (('fact', 'NOUN'), 14), (('evidence', 'NOUN'), 14)]

MOST COMMON NOUNS
[(('year', 'NOUN'), 27), (('people', 'NOUN'), 24), (('autonomy', 'NOUN'), 21), (('man', 'NOUN'), 19), (('school', 'NOUN'), 19), (('monastery', 'NOUN'), 19), (('life', 'NOUN'), 18), (('parent', 'NOUN')

### Doubt

This first section will focus on the extraction of the information described above focusing on the Doubt class.

In [205]:
df_doubt = df.loc[(df['Technique'] == "Doubt")]
df_doubt

Unnamed: 0,Article,Paragraph,Technique,Text,ID
0,111111111,3,Doubt,Geneva - The World Health Organisation chief o...,0
18,111111122,21,Doubt,It’s also the case that the more negative info...,18
66,111111135,28,Doubt,Why were Tamika Mallory and Danny Davis reluct...,66
87,694356862,11,Doubt,Whether the Trump administration follows throu...,87
88,694811415,1,Doubt,“It’s gotta be a set-up”: Neighbor of Las Vega...,88
...,...,...,...,...,...
1861,999001619,47,Doubt,"Similarly, the Guardian worked tirelessly to p...",1861
1864,999001621,1,Doubt,This Guardian Fake News Story Proves That The ...,1864
1865,999001621,4,Doubt,This Guardian Fake News Story Proves That The ...,1865
1868,999001621,18,Doubt,The story was weakly sourced and included some...,1868


In [206]:
X_doubt = list()
for x in df_doubt['Text']:
    X_doubt.append(str(x))

In [207]:
doc_doubt = list()
for i in X_doubt:
    doc_doubt.append(nlp(i))

In [208]:
total_doubt = "".join(X_doubt)
total_doubt_text = nlp(total_doubt)
print("Total of tokens: ", len(total_doubt_text))

Total of tokens:  11722


In [209]:
content_words_total_doubt = [(token.lemma_,token.pos_) for token in total_doubt_text if token.pos_ in ["VERB","ADJ","NOUN"]]
noun_total_doubt = [(token.lemma_,token.pos_) for token in total_doubt_text if token.pos_ in ["NOUN"]]
verb_total_doubt = [(token.lemma_,token.pos_) for token in total_doubt_text if token.pos_ in ["VERB"]]
adj_total_doubt = [(token.lemma_,token.pos_) for token in total_doubt_text if token.pos_ in ["ADJ"]]

print("ALL CONTENT WORDS")
print("Content word total: ", len(content_words_total_doubt))
print("Percentage of content words: ", (len(content_words_total_doubt)/len(total_doubt_text)*100))
print("Unique content word total: ", len(set(content_words_total_doubt)))
print("Percentage of unique content words: ", len(set(content_words_total_doubt))/len(content_words_total_doubt)*100)
print("\nNOUNS")
print("Noun total: ", len(noun_total_doubt))
print("Percentage of nouns: ", (len(noun_total_doubt)/len(total_doubt_text)*100))
print("Unique noun total: ", len(set(noun_total_doubt)))
print("Percentage of unique nouns: ", len(set(noun_total_doubt))/len(noun_total_doubt)*100)
print("\nVERBS")
print("Verb total: ", len(verb_total_doubt))
print("Percentage of verbs: ", (len(verb_total_doubt)/len(total_doubt_text)*100))
print("Unique verb total: ", len(set(verb_total_doubt)))
print("Percentage of unique nouns: ", len(set(verb_total_doubt))/len(verb_total_doubt)*100)
print("\nADJECTIVES")
print("Adjective total: ", len(adj_total_doubt))
print("Percentage of adjectives: ", (len(adj_total_doubt)/len(total_doubt_text)*100))
print("Unique adjective total: ", len(set(adj_total_doubt)))
print("Percentage of unique adjectives: ", len(set(adj_total_doubt))/len(adj_total_doubt)*100)

ALL CONTENT WORDS
Content word total:  3929
Percentage of content words:  33.51817096058693
Unique content word total:  1601
Percentage of unique content words:  40.74828200559939

NOUNS
Noun total:  1933
Percentage of nouns:  16.490360006824776
Unique noun total:  799
Percentage of unique nouns:  41.3347128815313

VERBS
Verb total:  1376
Percentage of verbs:  11.738611158505375
Unique verb total:  475
Percentage of unique nouns:  34.520348837209305

ADJECTIVES
Adjective total:  620
Percentage of adjectives:  5.289199795256782
Unique adjective total:  327
Percentage of unique adjectives:  52.741935483870975


In [210]:
count_content_words_total_doubt = Counter(content_words_total_doubt)
count_noun_total_doubt = Counter(noun_total_doubt)
count_verb_total_doubt = Counter(verb_total_doubt)
count_adj_total_doubt = Counter(adj_total_doubt)

print("MOST COMMON CONTENT WORDS")
print(count_content_words_total_doubt.most_common(31))
print("\nMOST COMMON NOUNS")
print(count_noun_total_doubt.most_common(20))
print("\nMOST COMMON VERBS")
print(count_verb_total_doubt.most_common(20))
print("\nMOST COMMON ADJECTIVES")
print(count_adj_total_doubt.most_common(21))

MOST COMMON CONTENT WORDS
[(('say', 'VERB'), 76), (('have', 'VERB'), 41), (('police', 'NOUN'), 28), (('know', 'VERB'), 27), (('report', 'NOUN'), 25), (('do', 'VERB'), 22), (('shooting', 'NOUN'), 22), (('tell', 'VERB'), 22), (('take', 'VERB'), 21), (('go', 'VERB'), 18), (('see', 'VERB'), 17), (('question', 'NOUN'), 17), (('p.m.', 'NOUN'), 17), (('time', 'NOUN'), 17), (('case', 'NOUN'), 16), (('make', 'VERB'), 16), (('give', 'VERB'), 16), (('fail', 'VERB'), 15), (('guard', 'NOUN'), 15), (('be', 'VERB'), 15), (('ask', 'VERB'), 15), (('’', 'VERB'), 14), (('information', 'NOUN'), 14), (('day', 'NOUN'), 14), (('investigation', 'NOUN'), 14), (('evidence', 'NOUN'), 14), (('hotel', 'NOUN'), 14), (('security', 'NOUN'), 14), (('provide', 'VERB'), 14), (('get', 'VERB'), 14), (('medium', 'NOUN'), 13)]

MOST COMMON NOUNS
[(('police', 'NOUN'), 28), (('report', 'NOUN'), 25), (('shooting', 'NOUN'), 22), (('question', 'NOUN'), 17), (('p.m.', 'NOUN'), 17), (('time', 'NOUN'), 17), (('case', 'NOUN'), 16), 

### Appeal_to_Fear-Prejudice

This first section will focus on the extraction of the information described above focusing on the Appeal_to_Fear-Prejudice class.

In [211]:
df_prejudice = df.loc[(df['Technique'] == "Appeal_to_Fear-Prejudice")]
df_prejudice

Unnamed: 0,Article,Paragraph,Technique,Text,ID
2,111111111,17,Appeal_to_Fear-Prejudice,He also pointed to the presence of the pneumon...,2
3,111111111,19,Appeal_to_Fear-Prejudice,He praised the rapid response from WHO and Mad...,3
4,111111111,25,Appeal_to_Fear-Prejudice,That means that Madagascar could be affected m...,4
12,111111114,25,Appeal_to_Fear-Prejudice,Members of the group and advocates say they fe...,12
82,694327499,37,Appeal_to_Fear-Prejudice,"In a sense, what Bergoglio is doing is worse t...",82
...,...,...,...,...,...
1730,999000159,12,Appeal_to_Fear-Prejudice,So it is crucial that you and your pro-gun fri...,1730
1738,999000565,6,Appeal_to_Fear-Prejudice,Watch this leftist loon pour her beverage on F...,1738
1739,999000565,7,Appeal_to_Fear-Prejudice,WATCH: Angry leftist pours her beverage on FSU...,1739
1767,999000874,15,Appeal_to_Fear-Prejudice,"“If left unchallenged, the actions of the Whit...",1767


In [212]:
X_prejudice = list()
for x in df_prejudice['Text']:
    X_prejudice.append(str(x))

In [213]:
doc_prejudice = list()
for i in X_prejudice:
    doc_prejudice.append(nlp(i))

In [214]:
total_prejudice = "".join(X_prejudice)
total_prejudice_text = nlp(total_prejudice)
print("Total of tokens: ", len(total_prejudice_text))

Total of tokens:  6748


In [215]:
content_words_total_prejudice = [(token.lemma_,token.pos_) for token in total_prejudice_text if token.pos_ in ["VERB","ADJ","NOUN"]]
noun_total_prejudice = [(token.lemma_,token.pos_) for token in total_prejudice_text if token.pos_ in ["NOUN"]]
verb_total_prejudice = [(token.lemma_,token.pos_) for token in total_prejudice_text if token.pos_ in ["VERB"]]
adj_total_prejudice = [(token.lemma_,token.pos_) for token in total_prejudice_text if token.pos_ in ["ADJ"]]

print("ALL CONTENT WORDS")
print("Content word total: ", len(content_words_total_prejudice))
print("Percentage of content words: ", (len(content_words_total_prejudice)/len(total_prejudice_text)*100))
print("Unique content word total: ", len(set(content_words_total_prejudice)))
print("Percentage of unique content words: ", len(set(content_words_total_prejudice))/len(content_words_total_prejudice)*100)
print("\nNOUNS")
print("Noun total: ", len(noun_total_prejudice))
print("Percentage of nouns: ", (len(noun_total_prejudice)/len(total_prejudice_text)*100))
print("Unique noun total: ", len(set(noun_total_prejudice)))
print("Percentage of unique nouns: ", len(set(noun_total_prejudice))/len(noun_total_prejudice)*100)
print("\nVERBS")
print("Verb total: ", len(verb_total_prejudice))
print("Percentage of verbs: ", (len(verb_total_prejudice)/len(total_prejudice_text)*100))
print("Unique verb total: ", len(set(verb_total_prejudice)))
print("Percentage of unique nouns: ", len(set(verb_total_prejudice))/len(verb_total_prejudice)*100)
print("\nADJECTIVES")
print("Adjective total: ", len(adj_total_prejudice))
print("Percentage of adjectives: ", (len(adj_total_prejudice)/len(total_prejudice_text)*100))
print("Unique adjective total: ", len(set(adj_total_prejudice)))
print("Percentage of unique adjectives: ", len(set(adj_total_prejudice))/len(adj_total_prejudice)*100)

ALL CONTENT WORDS
Content word total:  2231
Percentage of content words:  33.06164789567279
Unique content word total:  1171
Percentage of unique content words:  52.48767368892872

NOUNS
Noun total:  1043
Percentage of nouns:  15.45643153526971
Unique noun total:  597
Percentage of unique nouns:  57.238734419942475

VERBS
Verb total:  749
Percentage of verbs:  11.099585062240664
Unique verb total:  341
Percentage of unique nouns:  45.52736982643525

ADJECTIVES
Adjective total:  439
Percentage of adjectives:  6.505631298162419
Unique adjective total:  233
Percentage of unique adjectives:  53.075170842824605


In [216]:
count_content_words_total_prejudice = Counter(content_words_total_prejudice)
count_noun_total_prejudice = Counter(noun_total_prejudice)
count_verb_total_prejudice = Counter(verb_total_prejudice)
count_adj_total_prejudice = Counter(adj_total_prejudice)

print("MOST COMMON CONTENT WORDS")
print(count_content_words_total_prejudice.most_common(31))
print("\nMOST COMMON NOUNS")
print(count_noun_total_prejudice.most_common(20))
print("\nMOST COMMON VERBS")
print(count_verb_total_prejudice.most_common(20))
print("\nMOST COMMON ADJECTIVES")
print(count_adj_total_prejudice.most_common(21))

MOST COMMON CONTENT WORDS
[(('say', 'VERB'), 36), (('have', 'VERB'), 19), (('do', 'VERB'), 18), (('go', 'VERB'), 18), (('many', 'ADJ'), 15), (('disease', 'NOUN'), 13), (('spread', 'VERB'), 12), (('country', 'NOUN'), 12), (('world', 'NOUN'), 11), (('ally', 'NOUN'), 11), (('take', 'VERB'), 11), (('people', 'NOUN'), 11), (('get', 'VERB'), 10), (('nuclear', 'ADJ'), 10), (('year', 'NOUN'), 10), (('continue', 'VERB'), 9), (('be', 'VERB'), 9), (('know', 'VERB'), 9), (('kill', 'VERB'), 9), (('enter', 'VERB'), 9), (('want', 'VERB'), 8), (('write', 'VERB'), 8), (('outbreak', 'NOUN'), 7), (('large', 'ADJ'), 7), (('deal', 'NOUN'), 7), (('military', 'ADJ'), 7), (('case', 'NOUN'), 7), (('warn', 'VERB'), 6), (('heresy', 'NOUN'), 6), (('call', 'VERB'), 6), (('issue', 'VERB'), 6)]

MOST COMMON NOUNS
[(('disease', 'NOUN'), 13), (('country', 'NOUN'), 12), (('world', 'NOUN'), 11), (('ally', 'NOUN'), 11), (('people', 'NOUN'), 11), (('year', 'NOUN'), 10), (('outbreak', 'NOUN'), 7), (('deal', 'NOUN'), 7), ((

### Exaggeration-Minimisation 

This first section will focus on the extraction of the information described above focusing on the Exaggeration-Minimisation class.

In [217]:
df_ex_min = df.loc[(df['Technique'] == "Exaggeration-Minimisation")]
df_ex_min

Unnamed: 0,Article,Paragraph,Technique,Text,ID
17,111111122,19,Exaggeration-Minimisation,"For one thing, the president has made this an ...",17
27,111111124,24,Exaggeration-Minimisation,The source added that the Justice Department i...,27
62,111111134,15,Exaggeration-Minimisation,And WikiLeaks said on Twitter that it was “wil...,62
69,111111135,48,Exaggeration-Minimisation,"Representative Davis, who was interviewed by T...",69
73,111111137,3,Exaggeration-Minimisation,New gun control measures for Florida have pass...,73
...,...,...,...,...,...
1828,999001290,27,Exaggeration-Minimisation,"Well, I can't wait to see the next press confe...",1828
1866,999001621,13,Exaggeration-Minimisation,Manafort held secret talks with Assange in Ecu...,1866
1870,999001621,32,Exaggeration-Minimisation,"Manafort, 69, denies involvement in the hack a...",1870
1875,999001970,5,Exaggeration-Minimisation,Saturday Night Live writer and comedian Nimesh...,1875


In [218]:
X_ex_min = list()
for x in df_ex_min['Text']:
    X_ex_min.append(str(x))

In [219]:
doc_ex_min = list()
for i in X_ex_min:
    doc_ex_min.append(nlp(i))

In [220]:
total_ex_min = "".join(X_ex_min)
total_ex_min_text = nlp(total_ex_min)
print("Total of tokens: ", len(total_ex_min_text))

Total of tokens:  5613


In [221]:
content_words_total_ex_min = [(token.lemma_,token.pos_) for token in total_ex_min_text if token.pos_ in ["VERB","ADJ","NOUN"]]
noun_total_ex_min = [(token.lemma_,token.pos_) for token in total_ex_min_text if token.pos_ in ["NOUN"]]
verb_total_ex_min = [(token.lemma_,token.pos_) for token in total_ex_min_text if token.pos_ in ["VERB"]]
adj_total_ex_min = [(token.lemma_,token.pos_) for token in total_ex_min_text if token.pos_ in ["ADJ"]]

print("ALL CONTENT WORDS")
print("Content word total: ", len(content_words_total_ex_min))
print("Percentage of content words: ", (len(content_words_total_ex_min)/len(total_ex_min_text)*100))
print("Unique content word total: ", len(set(content_words_total_ex_min)))
print("Percentage of unique content words: ", len(set(content_words_total_ex_min))/len(content_words_total_ex_min)*100)
print("\nNOUNS")
print("Noun total: ", len(noun_total_ex_min))
print("Percentage of nouns: ", (len(noun_total_ex_min)/len(total_ex_min_text)*100))
print("Unique noun total: ", len(set(noun_total_ex_min)))
print("Percentage of unique nouns: ", len(set(noun_total_ex_min))/len(noun_total_ex_min)*100)
print("\nVERBS")
print("Verb total: ", len(verb_total_ex_min))
print("Percentage of verbs: ", (len(verb_total_ex_min)/len(total_ex_min_text)*100))
print("Unique verb total: ", len(set(verb_total_ex_min)))
print("Percentage of unique nouns: ", len(set(verb_total_ex_min))/len(verb_total_ex_min)*100)
print("\nADJECTIVES")
print("Adjective total: ", len(adj_total_ex_min))
print("Percentage of adjectives: ", (len(adj_total_ex_min)/len(total_ex_min_text)*100))
print("Unique adjective total: ", len(set(adj_total_ex_min)))
print("Percentage of unique adjectives: ", len(set(adj_total_ex_min))/len(adj_total_ex_min)*100)

ALL CONTENT WORDS
Content word total:  1888
Percentage of content words:  33.63620167468377
Unique content word total:  1044
Percentage of unique content words:  55.29661016949152

NOUNS
Noun total:  907
Percentage of nouns:  16.158916800285052
Unique noun total:  524
Percentage of unique nouns:  57.7728776185226

VERBS
Verb total:  598
Percentage of verbs:  10.653839301621236
Unique verb total:  294
Percentage of unique nouns:  49.163879598662206

ADJECTIVES
Adjective total:  383
Percentage of adjectives:  6.823445572777481
Unique adjective total:  226
Percentage of unique adjectives:  59.00783289817232


In [222]:
count_content_words_total_ex_min = Counter(content_words_total_ex_min)
count_noun_total_ex_min = Counter(noun_total_ex_min)
count_verb_total_ex_min = Counter(verb_total_ex_min)
count_adj_total_ex_min = Counter(adj_total_ex_min)

print("MOST COMMON CONTENT WORDS")
print(count_content_words_total_ex_min.most_common(31))
print("\nMOST COMMON NOUNS")
print(count_noun_total_ex_min.most_common(20))
print("\nMOST COMMON VERBS")
print(count_verb_total_ex_min.most_common(20))
print("\nMOST COMMON ADJECTIVES")
print(count_adj_total_ex_min.most_common(21))

MOST COMMON CONTENT WORDS
[(('say', 'VERB'), 30), (('make', 'VERB'), 16), (('go', 'VERB'), 14), (('come', 'VERB'), 11), (('have', 'VERB'), 10), (('history', 'NOUN'), 10), (('tell', 'VERB'), 10), (('do', 'VERB'), 10), (('take', 'VERB'), 9), (('government', 'NOUN'), 9), (('evidence', 'NOUN'), 9), (('give', 'VERB'), 9), (('man', 'NOUN'), 9), (('world', 'NOUN'), 9), (('see', 'VERB'), 9), (('gun', 'NOUN'), 8), (('year', 'NOUN'), 8), (('official', 'NOUN'), 8), (('life', 'NOUN'), 8), (('thing', 'NOUN'), 7), (('happen', 'VERB'), 7), (('time', 'NOUN'), 7), (('be', 'VERB'), 7), (('case', 'NOUN'), 7), (('month', 'NOUN'), 7), (('think', 'VERB'), 7), (('gay', 'ADJ'), 7), (('president', 'NOUN'), 6), (('high', 'ADJ'), 6), (('other', 'ADJ'), 6), (('know', 'VERB'), 6)]

MOST COMMON NOUNS
[(('history', 'NOUN'), 10), (('government', 'NOUN'), 9), (('evidence', 'NOUN'), 9), (('man', 'NOUN'), 9), (('world', 'NOUN'), 9), (('gun', 'NOUN'), 8), (('year', 'NOUN'), 8), (('official', 'NOUN'), 8), (('life', 'NOUN'

### Flag_Waving

This first section will focus on the extraction of the information described above focusing on the Flag_Waving class.

In [None]:
df_flag_waving = df.loc[(df['Technique'] == "Flag_Waving")]
df_flag_waving

In [224]:
X_flag_waving = list()
for x in df_flag_waving['Text']:
    X_flag_waving.append(str(x))

In [225]:
doc_flag_waving = list()
for i in X_flag_waving:
    doc_flag_waving.append(nlp(i))

In [226]:
total_waving = "".join(X_flag_waving)
total_waving_text = nlp(total_waving)
print("Total of tokens: ", len(total_waving_text))

Total of tokens:  5133


In [227]:
content_words_total_waving = [(token.lemma_,token.pos_) for token in total_waving_text if token.pos_ in ["VERB","ADJ","NOUN"]]
noun_total_waving = [(token.lemma_,token.pos_) for token in total_waving_text if token.pos_ in ["NOUN"]]
verb_total_waving = [(token.lemma_,token.pos_) for token in total_waving_text if token.pos_ in ["VERB"]]
adj_total_waving = [(token.lemma_,token.pos_) for token in total_waving_text if token.pos_ in ["ADJ"]]

print("ALL CONTENT WORDS")
print("Content word total: ", len(content_words_total_waving))
print("Percentage of content words: ", (len(content_words_total_waving)/len(total_waving_text)*100))
print("Unique content word total: ", len(set(content_words_total_waving)))
print("Percentage of unique content words: ", len(set(content_words_total_waving))/len(content_words_total_waving)*100)
print("\nNOUNS")
print("Noun total: ", len(noun_total_waving))
print("Percentage of nouns: ", (len(noun_total_waving)/len(total_waving_text)*100))
print("Unique noun total: ", len(set(noun_total_waving)))
print("Percentage of unique nouns: ", len(set(noun_total_waving))/len(noun_total_waving)*100)
print("\nVERBS")
print("Verb total: ", len(verb_total_waving))
print("Percentage of verbs: ", (len(verb_total_waving)/len(total_waving_text)*100))
print("Unique verb total: ", len(set(verb_total_waving)))
print("Percentage of unique nouns: ", len(set(verb_total_waving))/len(verb_total_waving)*100)
print("\nADJECTIVES")
print("Adjective total: ", len(adj_total_waving))
print("Percentage of adjectives: ", (len(adj_total_waving)/len(total_waving_text)*100))
print("Unique adjective total: ", len(set(adj_total_waving)))
print("Percentage of unique adjectives: ", len(set(adj_total_waving))/len(adj_total_waving)*100)

ALL CONTENT WORDS
Content word total:  1699
Percentage of content words:  33.099551918955775
Unique content word total:  892
Percentage of unique content words:  52.501471453796356

NOUNS
Noun total:  827
Percentage of nouns:  16.111435807519968
Unique noun total:  456
Percentage of unique nouns:  55.139056831922616

VERBS
Verb total:  574
Percentage of verbs:  11.18254432105981
Unique verb total:  274
Percentage of unique nouns:  47.73519163763066

ADJECTIVES
Adjective total:  298
Percentage of adjectives:  5.805571790375998
Unique adjective total:  162
Percentage of unique adjectives:  54.36241610738255


In [228]:
count_content_words_total_waving = Counter(content_words_total_waving)
count_noun_total_waving = Counter(noun_total_waving)
count_verb_total_waving = Counter(verb_total_waving)
count_adj_total_waving = Counter(adj_total_waving)

print("MOST COMMON CONTENT WORDS")
print(count_content_words_total_waving.most_common(31))
print("\nMOST COMMON NOUNS")
print(count_noun_total_waving.most_common(20))
print("\nMOST COMMON VERBS")
print(count_verb_total_waving.most_common(20))
print("\nMOST COMMON ADJECTIVES")
print(count_adj_total_waving.most_common(21))

MOST COMMON CONTENT WORDS
[(('american', 'ADJ'), 45), (('people', 'NOUN'), 32), (('say', 'VERB'), 30), (('have', 'VERB'), 26), (('take', 'VERB'), 13), (('make', 'VERB'), 11), (('call', 'VERB'), 10), (('year', 'NOUN'), 10), (('know', 'VERB'), 10), (('keep', 'VERB'), 9), (('time', 'NOUN'), 9), (('day', 'NOUN'), 9), (('power', 'NOUN'), 8), (('important', 'ADJ'), 8), (('right', 'NOUN'), 8), (('tell', 'VERB'), 8), (('officer', 'NOUN'), 8), (('report', 'NOUN'), 8), (('election', 'NOUN'), 8), (('country', 'NOUN'), 7), (('do', 'VERB'), 7), (('war', 'NOUN'), 7), (('assassination', 'NOUN'), 7), (('go', 'VERB'), 7), (('nation', 'NOUN'), 7), (('want', 'VERB'), 7), (('see', 'VERB'), 7), (('home', 'NOUN'), 6), (('fight', 'VERB'), 6), (('way', 'NOUN'), 6), (('support', 'VERB'), 6)]

MOST COMMON NOUNS
[(('people', 'NOUN'), 32), (('year', 'NOUN'), 10), (('time', 'NOUN'), 9), (('day', 'NOUN'), 9), (('power', 'NOUN'), 8), (('right', 'NOUN'), 8), (('officer', 'NOUN'), 8), (('report', 'NOUN'), 8), (('elect

-------------------------------------------------------------------------------

# Named Entity Recogniton

From every paragraph, different kind of entities and information about them have been extracted. 

As done in the previous section, this first part will focus on the extraction of these information regardless of the persuasion technique, while the following will consider each class separately (and will follow the same data preparation done for the general one).

In [229]:
#Extracting named entities and printing the total of NE and type of NE categories extracted
total_entities_dict = {key: list(g) for key, g in groupby(sorted(total_text.ents, key=lambda x: x.label_), lambda x: x.label_)}
print("ALL ENTITIES")
print("All categories extracted: ", total_entities_dict.keys())
print("Number of categories extracted: ", len(total_entities_dict))

total_entities_value_list = list()
for i in total_entities_dict.values():
    total_entities_value_list.append(i)
    
total_entities= len(sum(total_entities_value_list, []))
print("Total number of entities: ", total_entities)

#Printing total and percentage of unique named entities
print("\nALL UNIQUE ENTITIES")
unique_entities_dict = {key: list(set(map(lambda x: str(x), g))) for key, g in groupby(sorted(total_text.ents, key=lambda x: x.label_), lambda x: x.label_)}

unique_entities_value_list = list()
for i in unique_entities_dict.values():
    unique_entities_value_list.append(i)
    
unique_entities= len(sum(unique_entities_value_list, []))
print("Total number of unique entities: ", unique_entities)
print("Percentage of unique entities: ", (unique_entities/total_entities)*100)

ALL ENTITIES
All categories extracted:  dict_keys(['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART'])
Number of categories extracted:  18
Total number of entities:  8078

ALL UNIQUE ENTITIES
Total number of unique entities:  3390
Percentage of unique entities:  41.965833127011635


In [230]:
# Counting the total occurrences for each label
total_cardinal = len(total_entities_dict['CARDINAL'])
total_date = len(total_entities_dict['DATE'])
total_event = len(total_entities_dict['EVENT'])
total_fac = len(total_entities_dict['FAC'])
total_gpe = len(total_entities_dict['GPE'])
total_language = len(total_entities_dict['LANGUAGE'])
total_law = len(total_entities_dict['LAW'])
total_loc = len(total_entities_dict['LOC'])
total_money = len(total_entities_dict['MONEY'])
total_norp = len(total_entities_dict['NORP'])
total_ordinal = len(total_entities_dict['ORDINAL'])
total_org = len(total_entities_dict['ORG'])
total_percent = len(total_entities_dict['PERCENT'])
total_person = len(total_entities_dict['PERSON'])
total_product = len(total_entities_dict['PRODUCT'])
total_quantity = len(total_entities_dict['QUANTITY'])
total_time = len(total_entities_dict['TIME'])
total_woa = len(total_entities_dict['WORK_OF_ART'])

In [231]:
# Counting the unique occurrences for each label
unique_cardinal = len(unique_entities_dict['CARDINAL'])
unique_date = len(unique_entities_dict['DATE'])
unique_event = len(unique_entities_dict['EVENT'])
unique_fac = len(unique_entities_dict['FAC'])
unique_gpe = len(unique_entities_dict['GPE'])
unique_language = len(unique_entities_dict['LANGUAGE'])
unique_law = len(unique_entities_dict['LAW'])
unique_loc = len(unique_entities_dict['LOC'])
unique_money = len(unique_entities_dict['MONEY'])
unique_norp = len(unique_entities_dict['NORP'])
unique_ordinal = len(unique_entities_dict['ORDINAL'])
unique_org = len(unique_entities_dict['ORG'])
unique_percent = len(unique_entities_dict['PERCENT'])
unique_person = len(unique_entities_dict['PERSON'])
unique_product = len(unique_entities_dict['PRODUCT'])
unique_quantity = len(unique_entities_dict['QUANTITY'])
unique_time = len(unique_entities_dict['TIME'])
unique_woa = len(unique_entities_dict['WORK_OF_ART'])

In [232]:
# Printing total, percentage, unique entities for each persuasion technique
print("CARDINAL")
print("Total of extracted cardinal numbers: ", total_cardinal)
print("Percentage of extracted cardinal numbers: ", (total_cardinal/total_entities)*100)
print("Total of unique cardinal numbers: ", unique_cardinal)
print("Percentage of unique cardinal numbers: ", (unique_cardinal/total_cardinal)*100)
print("\nDATE")
print("Total of extracted dates: ", total_date)
print("Percentage of extracted dates: ", (total_date/total_entities)*100)
print("Total of unique dates: ", unique_date)
print("Percentage of unique dates: ", (unique_date/total_date)*100)
print("\nEVENT")
print("Total of extracted events: ", total_event)
print("Percentage of extracted events: ", (total_event/total_entities)*100)
print("Total of unique events: ", unique_event)
print("Percentage of unique events: ", (unique_event/total_event)*100)
print("\nFAC")
print("Total of extracted facilities: ", total_fac)
print("Percentage of extracted facilities: ", (total_fac/total_entities)*100)
print("Total of unique facilities: ", unique_fac)
print("Percentage of unique facilities: ", (unique_fac/total_fac)*100)
print("\nGPE")
print("Total of extracted countries or cities: ", total_gpe)
print("Percentage of extracted countries or cities: ", (total_gpe/total_entities)*100)
print("Total of unique countries or cities: ", unique_gpe)
print("Percentage of unique countries or cities: ", (unique_gpe/total_gpe)*100)
print("\nLANGUAGE")
print("Total of extracted languages: ", total_language)
print("Percentage of extracted languages: ", (total_language/total_entities)*100)
print("Total of unique languages: ", unique_language)
print("Percentage of unique languages: ", (unique_language/total_language)*100)
print("\nLAW")
print("Total of extracted laws: ", total_law)
print("Percentage of extracted laws: ", (total_law/total_entities)*100)
print("Total of unique laws: ", unique_law)
print("Percentage of unique laws: ", (unique_law/total_law)*100)
print("\nLOC")
print("Total of extracted generic locations: ", total_loc)
print("Percentage of extracted generic locations: ", (total_loc/total_entities)*100)
print("Total of unique generic locations: ", unique_loc)
print("Percentage of unique generic locations: ", (unique_loc/total_loc)*100)
print("\nMONEY")
print("Total of extracted money values: ", total_money)
print("Percentage of extracted money values: ", (total_money/total_entities)*100)
print("Total of unique money values: ", unique_money)
print("Percentage of unique money values: ", (unique_money/total_money)*100)
print("\nNORP")
print("Total of extracted nationalities, religious or political groups: ", total_norp)
print("Percentage of extracted nationalities, religious or political groups: ", (total_norp/total_entities)*100)
print("Total of unique nationalities, religious or political groups: ", unique_norp)
print("Percentage of unique nationalities, religious or political groups: ", (unique_norp/total_norp)*100)
print("\nORDINAL")
print("Total of extracted ordinal numbers: ", total_ordinal)
print("Percentage extracted ordinal numbers: ", (total_ordinal/total_entities)*100)
print("Total of unique ordinal numbers: ", unique_ordinal)
print("Percentage of unique ordinal numbers: ", (unique_ordinal/total_ordinal)*100)
print("\nORG")
print("Total of extracted companies or organizations: ", total_org)
print("Percentage extracted companies or organizations: ", (total_org/total_entities)*100)
print("Total of unique companies or organizations: ", unique_org)
print("Percentage of unique companies or organizations: ", (unique_org/total_org)*100)
print("\nPERCENT")
print("Total of extracted percentages: ", total_percent)
print("Percentage extracted percentages: ", (total_percent/total_entities)*100)
print("Total of unique percentages: ", unique_percent)
print("Percentage of unique percentages: ", (unique_percent/total_percent)*100)
print("\nPERSON")
print("Total of extracted people: ", total_person)
print("Percentage extracted people: ", (total_person/total_entities)*100)
print("Total of unique people: ", unique_person)
print("Percentage of unique people: ", (unique_person/total_person)*100)
print("\nPRODUCT")
print("Total of extracted products: ", total_product)
print("Percentage extracted products: ", (total_product/total_entities)*100)
print("Total of unique products: ", unique_product)
print("Percentage of unique products: ", (unique_product/total_product)*100)
print("\nQUANTITY")
print("Total of extracted measurements: ", total_quantity)
print("Percentage extracted measurements: ", (total_quantity/total_entities)*100)
print("Total of unique measurements: ", unique_quantity)
print("Percentage of unique measurements: ", (unique_quantity/total_quantity)*100)
print("\nTIME")
print("Total of extracted times: ", total_time)
print("Percentage of extracted times: ", (total_time/total_entities)*100)
print("Total of unique times: ", unique_time)
print("Percentage of unique times: ", (unique_time/total_time)*100)
print("\nWORK OF ART")
print("Total of extracted works of art: ", total_woa)
print("Percentage of extracted works of art: ", (total_woa/total_entities)*100)
print("Total of unique works of art: ", unique_woa)
print("Percentage of unique works of art: ", (unique_woa/total_woa)*100)

CARDINAL
Total of extracted cardinal numbers:  457
Percentage of extracted cardinal numbers:  5.657340925971775
Total of unique cardinal numbers:  185
Percentage of unique cardinal numbers:  40.48140043763676

DATE
Total of extracted dates:  907
Percentage of extracted dates:  11.228026739291904
Total of unique dates:  504
Percentage of unique dates:  55.567805953693494

EVENT
Total of extracted events:  38
Percentage of extracted events:  0.4704134686803664
Total of unique events:  29
Percentage of unique events:  76.31578947368422

FAC
Total of extracted facilities:  61
Percentage of extracted facilities:  0.7551374102500619
Total of unique facilities:  22
Percentage of unique facilities:  36.0655737704918

GPE
Total of extracted countries or cities:  1263
Percentage of extracted countries or cities:  15.635058182718495
Total of unique countries or cities:  318
Percentage of unique countries or cities:  25.17814726840855

LANGUAGE
Total of extracted languages:  9
Percentage of extrac

In [233]:
# Printing the 30 most common entities in the total of texts
total_ents_count = Counter()

for ent in total_text.ents:
    total_ents_count[f"{ent.label_}: {ent.text}"] += 1

print("MOST COMMON ENTITIES")
for key, val in total_ents_count.most_common(30):
    print(val, key, sep="\t")

MOST COMMON ENTITIES
116	GPE: Iran
104	ORG: Trump
99	ORG: Church
85	CARDINAL: one
84	ORG: FBI
82	GPE: US
77	NORP: American
72	PERSON: Trump
72	PERSON: Francis
67	GPE: U.S.
59	CARDINAL: two
56	ORDINAL: first
45	GPE: Syria
43	GPE: the United States
40	NORP: Democrats
39	NORP: Muslim
39	GPE: Russia
38	PERSON: Pope
37	ORG: CIA
37	NORP: Islamic
36	GPE: America
33	PERSON: Clinton
33	NORP: Catholic
33	DATE: today
31	ORG: Guardian
31	NORP: Russian
31	PERSON: McCarrick
28	ORG: Assange
28	PERSON: Benedict
27	PERSON: Donald Trump


In [234]:
# Counting each occurrences of NE for each different categories
total_cardinal_count = Counter()
total_date_count = Counter()
total_event_count = Counter()
total_fac_count = Counter()
total_gpe_count = Counter()
total_language_count = Counter()
total_law_count = Counter()
total_loc_count = Counter()
total_money_count = Counter()
total_norp_count = Counter()
total_ordinal_count = Counter()
total_org_count = Counter()
total_percent_count = Counter()
total_person_count = Counter()
total_product_count = Counter()
total_quantity_count = Counter()
total_time_count = Counter()
total_woa_count = Counter()

for ent in total_text.ents:
    if (ent.label_ == "CARDINAL"):
        total_cardinal_count[f"{ent.label_}: {ent.text}"] += 1
    if (ent.label_ == "DATE"):
        total_date_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "EVENT"):
        total_event_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "FAC"):
        total_fac_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "GPE"):
        total_gpe_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "LANGUAGE"):
        total_language_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "LAW"):
        total_law_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "LOC"):
        total_loc_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "MONEY"):
        total_money_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "NORP"):
        total_norp_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "ORDINAL"):
        total_ordinal_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "ORG"):
        total_org_count[f"{ent.label_}: {ent.text}"] += 1
    if (ent.label_ == "PERCENT"):
        total_percent_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "PERSON"):
        total_person_count[f"{ent.label_}: {ent.text}"] += 1
    if (ent.label_ == "PRODUCT"):
        total_product_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "QUANTITY"):
        total_quantity_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "TIME"):
        total_time_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "WORK_OF_ART"):
        total_woa_count[f"{ent.label_}: {ent.text}"] += 1 

In [235]:
# Printing the most common named entities for each types
print("MOST COMMON CARDINALS")        
for key, val in total_cardinal_count.most_common(10):
    print(val, key, sep="\t")
    
print("\nMOST COMMON DATES")        
for key, val in total_date_count.most_common(10):
    print(val, key, sep="\t")

print("\nMOST COMMON EVENTS")        
for key, val in total_event_count.most_common(10):
    print(val, key, sep="\t")  
    
print("\nMOST COMMON FACILITIES")        
for key, val in total_fac_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON COUNTRIES OR CITIES")        
for key, val in total_gpe_count.most_common(15):
    print(val, key, sep="\t")  

print("\nMOST COMMON LANGUAGES")        
for key, val in total_language_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON LAWS")        
for key, val in total_law_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON LOCATIONS")        
for key, val in total_loc_count.most_common(10):
    print(val, key, sep="\t")

print("\nMOST COMMON MONEY VALUES")        
for key, val in total_money_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON NATIONALITIES OR GROUPS")        
for key, val in total_norp_count.most_common(20):
    print(val, key, sep="\t")
    
print("\nMOST COMMON ORDINALS")        
for key, val in total_ordinal_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON ORGANIZATIONS")        
for key, val in total_org_count.most_common(20):
    print(val, key, sep="\t")  

print("\nMOST COMMON PERCENTAGES")        
for key, val in total_percent_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON PEOPLE")        
for key, val in total_person_count.most_common(20):
    print(val, key, sep="\t")
    
print("\nMOST COMMON PRODUCTS")        
for key, val in total_product_count.most_common(10):
    print(val, key, sep="\t")
    
print("\nMOST COMMON QUANTITIES")        
for key, val in total_quantity_count.most_common(10):
    print(val, key, sep="\t")
    
print("\nMOST COMMON TIMES")        
for key, val in total_time_count.most_common(10):
    print(val, key, sep="\t")  
    
print("\nMOST COMMON WORK OF ARTS")        
for key, val in total_woa_count.most_common(10):
    print(val, key, sep="\t") 

MOST COMMON CARDINALS
85	CARDINAL: one
59	CARDINAL: two
23	CARDINAL: One
19	CARDINAL: three
10	CARDINAL: 11
9	CARDINAL: millions
9	CARDINAL: four
6	CARDINAL: five
6	CARDINAL: thousands
4	CARDINAL: Two

MOST COMMON DATES
33	DATE: today
26	DATE: Friday
20	DATE: Thursday
18	DATE: Tuesday
15	DATE: Monday
15	DATE: 2015
13	DATE: Wednesday
13	DATE: 2016
12	DATE: 2009
11	DATE: Sunday

MOST COMMON EVENTS
4	EVENT: the Second Amendment
4	EVENT: the Cold War
3	EVENT: Holocaust
2	EVENT: the Third Secret
1	EVENT: Geneva -
1	EVENT: the New Mass
1	EVENT: Hurricane Maria’s
1	EVENT: Occupation
1	EVENT: the Green Revolution
1	EVENT: WWII

MOST COMMON FACILITIES
26	FAC: Vatican
9	FAC: Vatican II
3	FAC: Metro
2	FAC: Route 91 Harvest
2	FAC: the Oval Office
2	FAC: the White House
2	FAC: the Fourth Amendment
1	FAC: Alexandria’s
1	FAC: St. Cyr
1	FAC: Route 91

MOST COMMON COUNTRIES OR CITIES
116	GPE: Iran
82	GPE: US
67	GPE: U.S.
45	GPE: Syria
43	GPE: the United States
39	GPE: Russia
36	GPE: America
24	GPE: Isr

### Loaded_Language

This first section will focus on the extraction of the information described above focusing on the Loaded_Language class.

In [236]:
# Extracting and counting each NE extracted and the total of NE categories
total_entities_dict_loaded = {key: list(g) for key, g in groupby(sorted(total_loaded_text.ents, key=lambda x: x.label_), lambda x: x.label_)}
print("ALL ENTITIES")
print("All categories extracted: ", total_entities_dict_loaded.keys())
print("Number of categories extracted: ", len(total_entities_dict_loaded))

total_entities_value_list_loaded = list()
for i in total_entities_dict_loaded.values():
    total_entities_value_list_loaded.append(i)
    
total_entities_loaded= len(sum(total_entities_value_list_loaded, []))
print("Total number of entities: ", total_entities_loaded)

# Counting and printing total and percentage of each unique entities
unique_entities_dict_loaded = {key: list(set(map(lambda x: str(x), g))) for key, g in groupby(sorted(total_loaded_text.ents, key=lambda x: x.label_), lambda x: x.label_)}
print("\nUNIQUE ENTITIES")

unique_entities_value_list_loaded = list()
for i in unique_entities_dict_loaded.values():
    unique_entities_value_list_loaded.append(i)
    
unique_entities_loaded= len(sum(unique_entities_value_list_loaded, []))
print("Total number of unique entities: ", unique_entities_loaded)
print("Percentage of unique entities: ", (unique_entities_loaded/total_entities_loaded)*100)

ALL ENTITIES
All categories extracted:  dict_keys(['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART'])
Number of categories extracted:  18
Total number of entities:  3644

UNIQUE ENTITIES
Total number of unique entities:  1785
Percentage of unique entities:  48.98463227222832


In [237]:
# Counting the total occurrences for each label
total_cardinal_loaded = len(total_entities_dict_loaded['CARDINAL'])
total_date_loaded = len(total_entities_dict_loaded['DATE'])
total_event_loaded = len(total_entities_dict_loaded['EVENT'])
total_fac_loaded = len(total_entities_dict_loaded['FAC'])
total_gpe_loaded = len(total_entities_dict_loaded['GPE'])
total_language_loaded = len(total_entities_dict_loaded['LANGUAGE'])
total_law_loaded = len(total_entities_dict_loaded['LAW'])
total_loc_loaded = len(total_entities_dict_loaded['LOC'])
total_money_loaded = len(total_entities_dict_loaded['MONEY'])
total_norp_loaded = len(total_entities_dict_loaded['NORP'])
total_ordinal_loaded = len(total_entities_dict_loaded['ORDINAL'])
total_org_loaded = len(total_entities_dict_loaded['ORG'])
total_percent_loaded = len(total_entities_dict_loaded['PERCENT'])
total_person_loaded = len(total_entities_dict_loaded['PERSON'])
total_product_loaded = len(total_entities_dict_loaded['PRODUCT'])
total_quantity_loaded = len(total_entities_dict_loaded['QUANTITY'])
total_time_loaded = len(total_entities_dict_loaded['TIME'])
total_woa_loaded = len(total_entities_dict_loaded['WORK_OF_ART'])

In [238]:
# Counting the unique occurrences for each label
unique_cardinal_loaded = len(unique_entities_dict_loaded['CARDINAL'])
unique_date_loaded = len(unique_entities_dict_loaded['DATE'])
unique_event_loaded = len(unique_entities_dict_loaded['EVENT'])
unique_fac_loaded = len(unique_entities_dict_loaded['FAC'])
unique_gpe_loaded = len(unique_entities_dict_loaded['GPE'])
unique_language_loaded = len(unique_entities_dict_loaded['LANGUAGE'])
unique_law_loaded = len(unique_entities_dict_loaded['LAW'])
unique_loc_loaded = len(unique_entities_dict_loaded['LOC'])
unique_money_loaded = len(unique_entities_dict_loaded['MONEY'])
unique_norp_loaded = len(unique_entities_dict_loaded['NORP'])
unique_ordinal_loaded = len(unique_entities_dict_loaded['ORDINAL'])
unique_org_loaded = len(unique_entities_dict_loaded['ORG'])
unique_percent_loaded = len(unique_entities_dict_loaded['PERCENT'])
unique_person_loaded = len(unique_entities_dict_loaded['PERSON'])
unique_product_loaded = len(unique_entities_dict_loaded['PRODUCT'])
unique_quantity_loaded = len(unique_entities_dict_loaded['QUANTITY'])
unique_time_loaded = len(unique_entities_dict_loaded['TIME'])
unique_woa_loaded = len(unique_entities_dict_loaded['WORK_OF_ART'])

In [239]:
# Printing total, percentage and unique entities extracted of each categories
print("CARDINAL")
print("Total of extracted cardinal numbers: ", total_cardinal_loaded)
print("Percentage of extracted cardinal numbers: ", (total_cardinal_loaded/total_entities_loaded)*100)
print("Total of unique cardinal numbers: ", unique_cardinal_loaded)
print("Percentage of unique cardinal numbers: ", (unique_cardinal_loaded/total_cardinal_loaded)*100)
print("\nDATE")
print("Total of extracted dates: ", total_date_loaded)
print("Percentage of extracted dates: ", (total_date_loaded/total_entities_loaded)*100)
print("Total of unique dates: ", unique_date_loaded)
print("Percentage of unique dates: ", (unique_date_loaded/total_date_loaded)*100)
print("\nEVENT")
print("Total of extracted events: ", total_event_loaded)
print("Percentage of extracted events: ", (total_event_loaded/total_entities_loaded)*100)
print("Total of unique events: ", unique_event_loaded)
print("Percentage of unique events: ", (unique_event_loaded/total_event_loaded)*100)
print("\nFAC")
print("Total of extracted facilities: ", total_fac_loaded)
print("Percentage of extracted facilities: ", (total_fac_loaded/total_entities_loaded)*100)
print("Total of unique facilities: ", unique_fac_loaded)
print("Percentage of unique facilities: ", (unique_fac_loaded/total_fac_loaded)*100)
print("\nGPE")
print("Total of extracted countries or cities: ", total_gpe_loaded)
print("Percentage of extracted countries or cities: ", (total_gpe_loaded/total_entities_loaded)*100)
print("Total of unique countries or cities: ", unique_gpe_loaded)
print("Percentage of unique countries or cities: ", (unique_gpe_loaded/total_gpe_loaded)*100)
print("\nLANGUAGE")
print("Total of extracted languages: ", total_language_loaded)
print("Percentage of extracted languages: ", (total_language_loaded/total_entities_loaded)*100)
print("Total of unique languages: ", unique_language_loaded)
print("Percentage of unique languages: ", (unique_language_loaded/total_language_loaded)*100)
print("\nLAW")
print("Total of extracted laws: ", total_law_loaded)
print("Percentage of extracted laws: ", (total_law_loaded/total_entities_loaded)*100)
print("Total of unique laws: ", unique_law_loaded)
print("Percentage of unique laws: ", (unique_law_loaded/total_law_loaded)*100)
print("\nLOC")
print("Total of extracted generic locations: ", total_loc_loaded)
print("Percentage of extracted generic locations: ", (total_loc_loaded/total_entities_loaded)*100)
print("Total of unique generic locations: ", unique_loc_loaded)
print("Percentage of unique generic locations: ", (unique_loc_loaded/total_loc_loaded)*100)
print("\nMONEY")
print("Total of extracted money values: ", total_money_loaded)
print("Percentage of extracted money values: ", (total_money_loaded/total_entities_loaded)*100)
print("Total of unique money values: ", unique_money_loaded)
print("Percentage of unique money values: ", (unique_money_loaded/total_money_loaded)*100)
print("\nNORP")
print("Total of extracted nationalities, religious or political groups: ", total_norp_loaded)
print("Percentage of extracted nationalities, religious or political groups: ", (total_norp_loaded/total_entities_loaded)*100)
print("Total of unique nationalities, religious or political groups: ", unique_norp_loaded)
print("Percentage of unique nationalities, religious or political groups: ", (unique_norp_loaded/total_norp_loaded)*100)
print("\nORDINAL")
print("Total of extracted ordinal numbers: ", total_ordinal_loaded)
print("Percentage extracted ordinal numbers: ", (total_ordinal_loaded/total_entities_loaded)*100)
print("Total of unique ordinal numbers: ", unique_ordinal_loaded)
print("Percentage of unique ordinal numbers: ", (unique_ordinal_loaded/total_ordinal_loaded)*100)
print("\nORG")
print("Total of extracted companies or organizations: ", total_org_loaded)
print("Percentage extracted companies or organizations: ", (total_org_loaded/total_entities_loaded)*100)
print("Total of unique companies or organizations: ", unique_org_loaded)
print("Percentage of unique companies or organizations: ", (unique_org_loaded/total_org_loaded)*100)
print("\nPERCENT")
print("Total of extracted percentages: ", total_percent_loaded)
print("Percentage extracted percentages: ", (total_percent_loaded/total_entities_loaded)*100)
print("Total of unique percentages: ", unique_percent_loaded)
print("Percentage of unique percentages: ", (unique_percent_loaded/total_percent_loaded)*100)
print("\nPERSON")
print("Total of extracted people: ", total_person_loaded)
print("Percentage extracted people: ", (total_person_loaded/total_entities_loaded)*100)
print("Total of unique people: ", unique_person_loaded)
print("Percentage of unique people: ", (unique_person_loaded/total_person_loaded)*100)
print("\nPRODUCT")
print("Total of extracted products: ", total_product_loaded)
print("Percentage extracted products: ", (total_product_loaded/total_entities_loaded)*100)
print("Total of unique products: ", unique_product_loaded)
print("Percentage of unique products: ", (unique_product_loaded/total_product_loaded)*100)
print("\nQUANTITY")
print("Total of extracted measurements: ", total_quantity_loaded)
print("Percentage extracted measurements: ", (total_quantity_loaded/total_entities_loaded)*100)
print("Total of unique measurements: ", unique_quantity_loaded)
print("Percentage of unique measurements: ", (unique_quantity_loaded/total_quantity_loaded)*100)
print("\nTIME")
print("Total of extracted times: ", total_time_loaded)
print("Percentage of extracted times: ", (total_time_loaded/total_entities_loaded)*100)
print("Total of unique times: ", unique_time_loaded)
print("Percentage of unique times: ", (unique_time_loaded/total_time_loaded)*100)
print("\nWORK OF ART")
print("Total of extracted works of art: ", total_woa_loaded)
print("Percentage of extracted works of art: ", (total_woa_loaded/total_entities_loaded)*100)
print("Total of unique works of art: ", unique_woa_loaded)
print("Percentage of unique works of art: ", (unique_woa_loaded/total_woa_loaded)*100)

CARDINAL
Total of extracted cardinal numbers:  214
Percentage of extracted cardinal numbers:  5.872667398463227
Total of unique cardinal numbers:  107
Percentage of unique cardinal numbers:  50.0

DATE
Total of extracted dates:  408
Percentage of extracted dates:  11.19648737650933
Total of unique dates:  259
Percentage of unique dates:  63.48039215686274

EVENT
Total of extracted events:  19
Percentage of extracted events:  0.5214050493962679
Total of unique events:  15
Percentage of unique events:  78.94736842105263

FAC
Total of extracted facilities:  30
Percentage of extracted facilities:  0.823271130625686
Total of unique facilities:  11
Percentage of unique facilities:  36.666666666666664

GPE
Total of extracted countries or cities:  512
Percentage of extracted countries or cities:  14.050493962678376
Total of unique countries or cities:  184
Percentage of unique countries or cities:  35.9375

LANGUAGE
Total of extracted languages:  5
Percentage of extracted languages:  0.1372118

In [240]:
# Printing the 30 most common entities in the texts
loaded_ents_count = Counter()

for ent in total_loaded_text.ents:
    loaded_ents_count[f"{ent.label_}: {ent.text}"] += 1

print("MOST COMMON ENTITIES")
for key, val in loaded_ents_count.most_common(30):
    print(val, key, sep="\t")

MOST COMMON ENTITIES
66	ORG: Church
48	ORG: Trump
46	PERSON: Francis
44	GPE: Iran
42	ORG: FBI
39	PERSON: Trump
34	GPE: US
34	CARDINAL: one
30	CARDINAL: two
29	ORDINAL: first
25	NORP: Democrats
25	PERSON: McCarrick
24	PERSON: Pope
22	NORP: Catholic
21	NORP: American
21	PERSON: Clinton
19	PERSON: Muhammad
18	GPE: U.S.
17	GPE: America
17	NORP: Islamic
16	GPE: the United States
15	GPE: Obama
15	ORG: CIA
14	ORG: CNN
14	PERSON: Obama
14	ORG: House
14	FAC: Vatican
14	NORP: Iranian
14	DATE: today
14	CARDINAL: One


In [241]:
# Counting the occurrencies of each NE for each categories
loaded_cardinal_count = Counter()
loaded_date_count = Counter()
loaded_event_count = Counter()
loaded_fac_count = Counter()
loaded_gpe_count = Counter()
loaded_language_count = Counter()
loaded_law_count = Counter()
loaded_loc_count = Counter()
loaded_money_count = Counter()
loaded_norp_count = Counter()
loaded_ordinal_count = Counter()
loaded_org_count = Counter()
loaded_percent_count = Counter()
loaded_person_count = Counter()
loaded_product_count = Counter()
loaded_quantity_count = Counter()
loaded_time_count = Counter()
loaded_woa_count = Counter()

for ent in total_loaded_text.ents:
    if (ent.label_ == "CARDINAL"):
        loaded_cardinal_count[f"{ent.label_}: {ent.text}"] += 1
    if (ent.label_ == "DATE"):
        loaded_date_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "EVENT"):
        loaded_event_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "FAC"):
        loaded_fac_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "GPE"):
        loaded_gpe_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "LANGUAGE"):
        loaded_language_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "LAW"):
        loaded_law_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "LOC"):
        loaded_loc_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "MONEY"):
        loaded_money_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "NORP"):
        loaded_norp_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "ORDINAL"):
        loaded_ordinal_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "ORG"):
        loaded_org_count[f"{ent.label_}: {ent.text}"] += 1
    if (ent.label_ == "PERCENT"):
        loaded_percent_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "PERSON"):
        loaded_person_count[f"{ent.label_}: {ent.text}"] += 1
    if (ent.label_ == "PRODUCT"):
        loaded_product_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "QUANTITY"):
        loaded_quantity_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "TIME"):
        loaded_time_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "WORK_OF_ART"):
        loaded_woa_count[f"{ent.label_}: {ent.text}"] += 1 

In [242]:
# Printing the most common entities for each categories
print("MOST COMMON CARDINALS")        
for key, val in loaded_cardinal_count.most_common(10):
    print(val, key, sep="\t")
    
print("\nMOST COMMON DATES")        
for key, val in loaded_date_count.most_common(10):
    print(val, key, sep="\t")

print("\nMOST COMMON EVENTS")        
for key, val in loaded_event_count.most_common(10):
    print(val, key, sep="\t")  
    
print("\nMOST COMMON FACILITIES")        
for key, val in loaded_fac_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON COUNTRIES OR CITIES")        
for key, val in loaded_gpe_count.most_common(15):
    print(val, key, sep="\t")  

print("\nMOST COMMON LANGUAGES")        
for key, val in loaded_language_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON LAWS")        
for key, val in loaded_law_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON LOCATIONS")        
for key, val in loaded_loc_count.most_common(10):
    print(val, key, sep="\t")

print("\nMOST COMMON MONEY VALUES")        
for key, val in loaded_money_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON NATIONALITIES OR GROUPS")        
for key, val in loaded_norp_count.most_common(20):
    print(val, key, sep="\t")
    
print("\nMOST COMMON ORDINALS")        
for key, val in loaded_ordinal_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON ORGANIZATIONS")        
for key, val in loaded_org_count.most_common(20):
    print(val, key, sep="\t")  

print("\nMOST COMMON PERCENTAGES")        
for key, val in loaded_percent_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON PEOPLE")        
for key, val in loaded_person_count.most_common(20):
    print(val, key, sep="\t")
    
print("\nMOST COMMON PRODUCTS")        
for key, val in loaded_product_count.most_common(10):
    print(val, key, sep="\t")
    
print("\nMOST COMMON QUANTITIES")        
for key, val in loaded_quantity_count.most_common(10):
    print(val, key, sep="\t")
    
print("\nMOST COMMON TIMES")        
for key, val in loaded_time_count.most_common(10):
    print(val, key, sep="\t")  
    
print("\nMOST COMMON WORK OF ARTS")        
for key, val in loaded_woa_count.most_common(10):
    print(val, key, sep="\t")  

MOST COMMON CARDINALS
34	CARDINAL: one
30	CARDINAL: two
14	CARDINAL: One
9	CARDINAL: three
5	CARDINAL: millions
5	CARDINAL: four
4	CARDINAL: thousands
3	CARDINAL: five
2	CARDINAL: 12
2	CARDINAL: Two

MOST COMMON DATES
14	DATE: today
11	DATE: Friday
9	DATE: Tuesday
9	DATE: Thursday
7	DATE: 2009
7	DATE: Wednesday
6	DATE: Monday
6	DATE: 2015
5	DATE: March
5	DATE: Today

MOST COMMON EVENTS
3	EVENT: the Second Amendment
3	EVENT: the Cold War
1	EVENT: the New Mass
1	EVENT: Hurricane Maria’s
1	EVENT: the Green Revolution
1	EVENT: the Congressional Medal of Honor
1	EVENT: the Vicar of Christ
1	EVENT: Hurricane Irma
1	EVENT: the Middle Ages
1	EVENT: the Great Recession

MOST COMMON FACILITIES
14	FAC: Vatican
6	FAC: Vatican II
2	FAC: the Oval Office
1	FAC: Route 91
1	FAC: Garcia Zarate
1	FAC: Cardinal Sandri
1	FAC: the Vatican Curia
1	FAC: Faith
1	FAC: the Fourth Amendment
1	FAC: the Clinton Global Initiative

MOST COMMON COUNTRIES OR CITIES
44	GPE: Iran
34	GPE: US
18	GPE: U.S.
17	GPE: America
1

### Name-Calling-Labeling

This first section will focus on the extraction of the information described above focusing on the Name_Calling-Labeling class.

In [243]:
total_entities_dict_calling = {key: list(g) for key, g in groupby(sorted(total_calling_text.ents, key=lambda x: x.label_), lambda x: x.label_)}
print("ALL ENTITIES")
print("All categories extracted: ", total_entities_dict_calling.keys())
print("Number of categories extracted: ", len(total_entities_dict_calling))

total_entities_value_list_calling = list()
for i in total_entities_dict_calling.values():
    total_entities_value_list_calling.append(i)
    
total_entities_calling= len(sum(total_entities_value_list_calling, []))
print("Total number of entities: ", total_entities_calling)

unique_entities_dict_calling = {key: list(set(map(lambda x: str(x), g))) for key, g in groupby(sorted(total_calling_text.ents, key=lambda x: x.label_), lambda x: x.label_)}

unique_entities_value_list_calling = list()
for i in unique_entities_dict_calling.values():
    unique_entities_value_list_calling.append(i)
    
unique_entities_calling= len(sum(unique_entities_value_list_calling, []))
print("Total number of unique entities: ", unique_entities_calling)
print("Percentage of unique entities: ", (unique_entities_calling/total_entities_calling)*100)

ALL ENTITIES
All categories extracted:  dict_keys(['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'TIME', 'WORK_OF_ART'])
Number of categories extracted:  17
Total number of entities:  1325
Total number of unique entities:  846
Percentage of unique entities:  63.849056603773576


In [244]:
# Counting the total occurrences for each label
total_cardinal_calling = len(total_entities_dict_calling['CARDINAL'])
total_date_calling = len(total_entities_dict_calling['DATE'])
total_event_calling = len(total_entities_dict_calling['EVENT'])
total_fac_calling = len(total_entities_dict_calling['FAC'])
total_gpe_calling = len(total_entities_dict_calling['GPE'])
total_language_calling = len(total_entities_dict_calling['LANGUAGE'])
total_law_calling = len(total_entities_dict_calling['LAW'])
total_loc_calling = len(total_entities_dict_calling['LOC'])
total_money_calling = len(total_entities_dict_calling['MONEY'])
total_norp_calling = len(total_entities_dict_calling['NORP'])
total_ordinal_calling = len(total_entities_dict_calling['ORDINAL'])
total_org_calling = len(total_entities_dict_calling['ORG'])
total_percent_calling = len(total_entities_dict_calling['PERCENT'])
total_person_calling = len(total_entities_dict_calling['PERSON'])
total_product_calling = len(total_entities_dict_calling['PRODUCT'])
total_time_calling = len(total_entities_dict_calling['TIME'])
total_woa_calling = len(total_entities_dict_calling['WORK_OF_ART'])

In [245]:
# Counting the unique occurrences for each label
unique_cardinal_calling = len(unique_entities_dict_calling['CARDINAL'])
unique_date_calling = len(unique_entities_dict_calling['DATE'])
unique_event_calling = len(unique_entities_dict_calling['EVENT'])
unique_fac_calling = len(unique_entities_dict_calling['FAC'])
unique_gpe_calling = len(unique_entities_dict_calling['GPE'])
unique_language_calling = len(unique_entities_dict_calling['LANGUAGE'])
unique_law_calling = len(unique_entities_dict_calling['LAW'])
unique_loc_calling = len(unique_entities_dict_calling['LOC'])
unique_money_calling = len(unique_entities_dict_calling['MONEY'])
unique_norp_calling = len(unique_entities_dict_calling['NORP'])
unique_ordinal_calling = len(unique_entities_dict_calling['ORDINAL'])
unique_org_calling = len(unique_entities_dict_calling['ORG'])
unique_percent_calling = len(unique_entities_dict_calling['PERCENT'])
unique_person_calling = len(unique_entities_dict_calling['PERSON'])
unique_product_calling = len(unique_entities_dict_calling['PRODUCT'])
unique_time_calling = len(unique_entities_dict_calling['TIME'])
unique_woa_calling = len(unique_entities_dict_calling['WORK_OF_ART'])

In [246]:
print("CARDINAL")
print("Total of extracted cardinal numbers: ", total_cardinal_calling)
print("Percentage of extracted cardinal numbers: ", (total_cardinal_calling/total_entities_calling)*100)
print("Total of unique cardinal numbers: ", unique_cardinal_calling)
print("Percentage of unique cardinal numbers: ", (unique_cardinal_calling/total_cardinal_calling)*100)
print("\nDATE")
print("Total of extracted dates: ", total_date_calling)
print("Percentage of extracted dates: ", (total_date_calling/total_entities_calling)*100)
print("Total of unique dates: ", unique_date_calling)
print("Percentage of unique dates: ", (unique_date_calling/total_date_calling)*100)
print("\nEVENT")
print("Total of extracted events: ", total_event_calling)
print("Percentage of extracted events: ", (total_event_calling/total_entities_calling)*100)
print("Total of unique events: ", unique_event_calling)
print("Percentage of unique events: ", (unique_event_calling/total_event_calling)*100)
print("\nFAC")
print("Total of extracted facilities: ", total_fac_calling)
print("Percentage of extracted facilities: ", (total_fac_calling/total_entities_calling)*100)
print("Total of unique facilities: ", unique_fac_calling)
print("Percentage of unique facilities: ", (unique_fac_calling/total_fac_calling)*100)
print("\nGPE")
print("Total of extracted countries or cities: ", total_gpe_calling)
print("Percentage of extracted countries or cities: ", (total_gpe_calling/total_entities_calling)*100)
print("Total of unique countries or cities: ", unique_gpe_calling)
print("Percentage of unique countries or cities: ", (unique_gpe_calling/total_gpe_calling)*100)
print("\nLANGUAGE")
print("Total of extracted languages: ", total_language_calling)
print("Percentage of extracted languages: ", (total_language_calling/total_entities_calling)*100)
print("Total of unique languages: ", unique_language_calling)
print("Percentage of unique languages: ", (unique_language_calling/total_language_calling)*100)
print("\nLAW")
print("Total of extracted laws: ", total_law_calling)
print("Percentage of extracted laws: ", (total_law_calling/total_entities_calling)*100)
print("Total of unique laws: ", unique_law_calling)
print("Percentage of unique laws: ", (unique_law_calling/total_law_calling)*100)
print("\nLOC")
print("Total of extracted generic locations: ", total_loc_calling)
print("Percentage of extracted generic locations: ", (total_loc_calling/total_entities_calling)*100)
print("Total of unique generic locations: ", unique_loc_calling)
print("Percentage of unique generic locations: ", (unique_loc_calling/total_loc_calling)*100)
print("\nMONEY")
print("Total of extracted money values: ", total_money_calling)
print("Percentage of extracted money values: ", (total_money_calling/total_entities_calling)*100)
print("Total of unique money values: ", unique_money_calling)
print("Percentage of unique money values: ", (unique_money_calling/total_money_calling)*100)
print("\nNORP")
print("Total of extracted nationalities, religious or political groups: ", total_norp_calling)
print("Percentage of extracted nationalities, religious or political groups: ", (total_norp_calling/total_entities_calling)*100)
print("Total of unique nationalities, religious or political groups: ", unique_norp_calling)
print("Percentage of unique nationalities, religious or political groups: ", (unique_norp_calling/total_norp_calling)*100)
print("\nORDINAL")
print("Total of extracted ordinal numbers: ", total_ordinal_calling)
print("Percentage extracted ordinal numbers: ", (total_ordinal_loaded/total_entities_calling)*100)
print("Total of unique ordinal numbers: ", unique_ordinal_calling)
print("Percentage of unique ordinal numbers: ", (unique_ordinal_calling/total_ordinal_calling)*100)
print("\nORG")
print("Total of extracted companies or organizations: ", total_org_calling)
print("Percentage extracted companies or organizations: ", (total_org_calling/total_entities_calling)*100)
print("Total of unique companies or organizations: ", unique_org_calling)
print("Percentage of unique companies or organizations: ", (unique_org_calling/total_org_calling)*100)
print("\nPERCENT")
print("Total of extracted percentages: ", total_percent_calling)
print("Percentage extracted percentages: ", (total_percent_calling/total_entities_calling)*100)
print("Total of unique percentages: ", unique_percent_calling)
print("Percentage of unique percentages: ", (unique_percent_calling/total_percent_calling)*100)
print("\nPERSON")
print("Total of extracted people: ", total_person_calling)
print("Percentage extracted people: ", (total_person_calling/total_entities_calling)*100)
print("Total of unique people: ", unique_person_calling)
print("Percentage of unique people: ", (unique_person_calling/total_person_calling)*100)
print("\nPRODUCT")
print("Total of extracted products: ", total_product_calling)
print("Percentage extracted products: ", (total_product_calling/total_entities_calling)*100)
print("Total of unique products: ", unique_product_calling)
print("Percentage of unique products: ", (unique_product_calling/total_product_calling)*100)
print("\nTIME")
print("Total of extracted times: ", total_time_calling)
print("Percentage of extracted times: ", (total_time_calling/total_entities_calling)*100)
print("Total of unique times: ", unique_time_calling)
print("Percentage of unique times: ", (unique_time_calling/total_time_calling)*100)
print("\nWORK OF ART")
print("Total of extracted works of art: ", total_woa_calling)
print("Percentage of extracted works of art: ", (total_woa_calling/total_entities_calling)*100)
print("Total of unique works of art: ", unique_woa_calling)
print("Percentage of unique works of art: ", (unique_woa_calling/total_woa_calling)*100)

CARDINAL
Total of extracted cardinal numbers:  62
Percentage of extracted cardinal numbers:  4.679245283018868
Total of unique cardinal numbers:  30
Percentage of unique cardinal numbers:  48.38709677419355

DATE
Total of extracted dates:  149
Percentage of extracted dates:  11.245283018867925
Total of unique dates:  111
Percentage of unique dates:  74.49664429530202

EVENT
Total of extracted events:  7
Percentage of extracted events:  0.5283018867924528
Total of unique events:  5
Percentage of unique events:  71.42857142857143

FAC
Total of extracted facilities:  8
Percentage of extracted facilities:  0.6037735849056604
Total of unique facilities:  4
Percentage of unique facilities:  50.0

GPE
Total of extracted countries or cities:  165
Percentage of extracted countries or cities:  12.452830188679245
Total of unique countries or cities:  85
Percentage of unique countries or cities:  51.515151515151516

LANGUAGE
Total of extracted languages:  3
Percentage of extracted languages:  0.22

In [247]:
calling_ents_count = Counter()

for ent in total_calling_text.ents:
    calling_ents_count[f"{ent.label_}: {ent.text}"] += 1

print("MOST COMMON ENTITIES")
for key, val in calling_ents_count.most_common(30):
    print(val, key, sep="\t")

MOST COMMON ENTITIES
21	ORG: Trump
15	ORG: FBI
15	CARDINAL: one
12	GPE: Iran
12	PERSON: Francis
10	GPE: US
10	ORG: Church
10	ORG: Assange
9	ORG: Guardian
9	NORP: Muslim
9	NORP: Islamic
8	PERSON: Benedict
7	PERSON: Trump
7	DATE: today
7	ORG: Google
6	PERSON: Peter Strzok
6	PERSON: Pope
6	NORP: Catholic
6	GPE: Russia
5	CARDINAL: two
5	ORG: Kavanaugh
5	PERSON: Lisa Page
5	NORP: Republican
5	NORP: Russian
5	PERSON: Clinton
5	GPE: Syria
5	ORG: CIA
5	NORP: Catholics
5	ORDINAL: third
5	FAC: Vatican


In [248]:
calling_cardinal_count = Counter()
calling_date_count = Counter()
calling_event_count = Counter()
calling_fac_count = Counter()
calling_gpe_count = Counter()
calling_language_count = Counter()
calling_law_count = Counter()
calling_loc_count = Counter()
calling_money_count = Counter()
calling_norp_count = Counter()
calling_ordinal_count = Counter()
calling_org_count = Counter()
calling_percent_count = Counter()
calling_person_count = Counter()
calling_product_count = Counter()
calling_time_count = Counter()
calling_woa_count = Counter()

for ent in total_calling_text.ents:
    if (ent.label_ == "CARDINAL"):
        calling_cardinal_count[f"{ent.label_}: {ent.text}"] += 1
    if (ent.label_ == "DATE"):
        calling_date_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "EVENT"):
        calling_event_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "FAC"):
        calling_fac_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "GPE"):
        calling_gpe_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "LANGUAGE"):
        calling_language_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "LAW"):
        calling_law_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "LOC"):
        calling_loc_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "MONEY"):
        calling_money_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "NORP"):
        calling_norp_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "ORDINAL"):
        calling_ordinal_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "ORG"):
        calling_org_count[f"{ent.label_}: {ent.text}"] += 1
    if (ent.label_ == "PERCENT"):
        calling_percent_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "PERSON"):
        calling_person_count[f"{ent.label_}: {ent.text}"] += 1
    if (ent.label_ == "PRODUCT"):
        calling_product_count[f"{ent.label_}: {ent.text}"] += 1
    if (ent.label_ == "TIME"):
        calling_time_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "WORK_OF_ART"):
        calling_woa_count[f"{ent.label_}: {ent.text}"] += 1 

In [249]:
print("MOST COMMON CARDINALS")        
for key, val in calling_cardinal_count.most_common(10):
    print(val, key, sep="\t")
    
print("\nMOST COMMON DATES")        
for key, val in calling_date_count.most_common(10):
    print(val, key, sep="\t")

print("\nMOST COMMON EVENTS")        
for key, val in calling_event_count.most_common(10):
    print(val, key, sep="\t")  
    
print("\nMOST COMMON FACILITIES")        
for key, val in calling_fac_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON COUNTRIES OR CITIES")        
for key, val in calling_gpe_count.most_common(15):
    print(val, key, sep="\t")  

print("\nMOST COMMON LANGUAGES")        
for key, val in calling_language_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON LAWS")        
for key, val in calling_law_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON LOCATIONS")        
for key, val in calling_loc_count.most_common(10):
    print(val, key, sep="\t")

print("\nMOST COMMON MONEY VALUES")        
for key, val in calling_money_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON NATIONALITIES OR GROUPS")        
for key, val in calling_norp_count.most_common(20):
    print(val, key, sep="\t")
    
print("\nMOST COMMON ORDINALS")        
for key, val in calling_ordinal_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON ORGANIZATIONS")        
for key, val in calling_org_count.most_common(20):
    print(val, key, sep="\t")  

print("\nMOST COMMON PERCENTAGES")        
for key, val in calling_percent_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON PEOPLE")        
for key, val in calling_person_count.most_common(20):
    print(val, key, sep="\t")
    
print("\nMOST COMMON PRODUCTS")        
for key, val in calling_product_count.most_common(10):
    print(val, key, sep="\t")    
    
print("\nMOST COMMON TIMES")        
for key, val in calling_time_count.most_common(10):
    print(val, key, sep="\t")  
    
print("\nMOST COMMON WORK OF ARTS")        
for key, val in calling_woa_count.most_common(10):
    print(val, key, sep="\t")

MOST COMMON CARDINALS
15	CARDINAL: one
5	CARDINAL: two
5	CARDINAL: 11
3	CARDINAL: three
2	CARDINAL: at least one
2	CARDINAL: Two
2	CARDINAL: One
2	CARDINAL: Eleven
2	CARDINAL: 24:31
2	CARDINAL: 33:59

MOST COMMON DATES
7	DATE: today
4	DATE: yesterday
4	DATE: Friday
4	DATE: 2015
4	DATE: Thursday
3	DATE: Sunday
3	DATE: Monday
2	DATE: February 2016
2	DATE: decades
2	DATE: this year

MOST COMMON EVENTS
3	EVENT: Holocaust
1	EVENT: WWII
1	EVENT: World Over
1	EVENT: World War
1	EVENT: the “World Meeting of Families

MOST COMMON FACILITIES
5	FAC: Vatican
1	FAC: La Stampa
1	FAC: the White House
1	FAC: the Fourth Amendment

MOST COMMON COUNTRIES OR CITIES
12	GPE: Iran
10	GPE: US
6	GPE: Russia
5	GPE: Syria
5	GPE: Rome
5	GPE: Georgia
4	GPE: Israel
4	GPE: U.S.
4	GPE: the United States
4	GPE: America
4	GPE: Chile
4	GPE: Dallas
3	GPE: Barros
3	GPE: Orbán
3	GPE: Newark

MOST COMMON LANGUAGES
2	LANGUAGE: English
1	LANGUAGE: Arabic

MOST COMMON LAWS
1	LAW: Chapter 8
1	LAW: Trump’s
1	LAW: First Amendment

### Repetition

This first section will focus on the extraction of the information described above focusing on the Repetition class.

In [250]:
total_entities_dict_repetition = {key: list(g) for key, g in groupby(sorted(total_repetition_text.ents, key=lambda x: x.label_), lambda x: x.label_)}
print("ALL ENTITIES")
print("All categories extracted: ", total_entities_dict_repetition.keys())
print("Number of categories extracted: ", len(total_entities_dict_repetition))

total_entities_value_list_repetition = list()
for i in total_entities_dict_repetition.values():
    total_entities_value_list_repetition.append(i)
    
total_entities_repetition= len(sum(total_entities_value_list_repetition, []))
print("Total number of entities: ", total_entities_repetition)

unique_entities_dict_repetition = {key: list(set(map(lambda x: str(x), g))) for key, g in groupby(sorted(total_repetition_text.ents, key=lambda x: x.label_), lambda x: x.label_)}

unique_entities_value_list_repetition = list()
for i in unique_entities_dict_repetition.values():
    unique_entities_value_list_repetition.append(i)
    
unique_entities_repetition= len(sum(unique_entities_value_list_repetition, []))
print("Total number of unique entities: ", unique_entities_repetition)
print("Percentage of unique entities: ", (unique_entities_repetition/total_entities_repetition)*100)

ALL ENTITIES
All categories extracted:  dict_keys(['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART'])
Number of categories extracted:  18
Total number of entities:  849
Total number of unique entities:  551
Percentage of unique entities:  64.89988221436984


In [251]:
# Counting the total occurrences for each label
total_cardinal_repetition = len(total_entities_dict_repetition['CARDINAL'])
total_date_repetition = len(total_entities_dict_repetition['DATE'])
total_event_repetition = len(total_entities_dict_repetition['EVENT'])
total_fac_repetition = len(total_entities_dict_repetition['FAC'])
total_gpe_repetition = len(total_entities_dict_repetition['GPE'])
total_language_repetition = len(total_entities_dict_repetition['LANGUAGE'])
total_law_repetition = len(total_entities_dict_repetition['LAW'])
total_loc_repetition = len(total_entities_dict_repetition['LOC'])
total_money_repetition = len(total_entities_dict_repetition['MONEY'])
total_norp_repetition = len(total_entities_dict_repetition['NORP'])
total_ordinal_repetition = len(total_entities_dict_repetition['ORDINAL'])
total_org_repetition = len(total_entities_dict_repetition['ORG'])
total_percent_repetition = len(total_entities_dict_repetition['PERCENT'])
total_person_repetition = len(total_entities_dict_repetition['PERSON'])
total_product_repetition = len(total_entities_dict_repetition['PRODUCT'])
total_quantity_repetition = len(total_entities_dict_repetition['QUANTITY'])
total_time_repetition = len(total_entities_dict_repetition['TIME'])
total_woa_repetition = len(total_entities_dict_repetition['WORK_OF_ART'])

In [252]:
# Counting the unique occurrences for each label
unique_cardinal_repetition = len(unique_entities_dict_repetition['CARDINAL'])
unique_date_repetition = len(unique_entities_dict_repetition['DATE'])
unique_event_repetition = len(unique_entities_dict_repetition['EVENT'])
unique_fac_repetition = len(unique_entities_dict_repetition['FAC'])
unique_gpe_repetition = len(unique_entities_dict_repetition['GPE'])
unique_language_repetition = len(unique_entities_dict_repetition['LANGUAGE'])
unique_law_repetition = len(unique_entities_dict_repetition['LAW'])
unique_loc_repetition = len(unique_entities_dict_repetition['LOC'])
unique_money_repetition = len(unique_entities_dict_repetition['MONEY'])
unique_norp_repetition = len(unique_entities_dict_repetition['NORP'])
unique_ordinal_repetition = len(unique_entities_dict_repetition['ORDINAL'])
unique_org_repetition = len(unique_entities_dict_repetition['ORG'])
unique_percent_repetition = len(unique_entities_dict_repetition['PERCENT'])
unique_person_repetition = len(unique_entities_dict_repetition['PERSON'])
unique_product_repetition = len(unique_entities_dict_repetition['PRODUCT'])
unique_quantity_repetition = len(unique_entities_dict_repetition['QUANTITY'])
unique_time_repetition = len(unique_entities_dict_repetition['TIME'])
unique_woa_repetition = len(unique_entities_dict_repetition['WORK_OF_ART'])

In [253]:
print("CARDINAL")
print("Total of extracted cardinal numbers: ", total_cardinal_repetition)
print("Percentage of extracted cardinal numbers: ", (total_cardinal_repetition/total_entities_repetition)*100)
print("Total of unique cardinal numbers: ", unique_cardinal_repetition)
print("Percentage of unique cardinal numbers: ", (unique_cardinal_repetition/total_cardinal_repetition)*100)
print("\nDATE")
print("Total of extracted dates: ", total_date_repetition)
print("Percentage of extracted dates: ", (total_date_repetition/total_entities_repetition)*100)
print("Total of unique dates: ", unique_date_repetition)
print("Percentage of unique dates: ", (unique_date_repetition/total_date_repetition)*100)
print("\nEVENT")
print("Total of extracted events: ", total_event_repetition)
print("Percentage of extracted events: ", (total_event_repetition/total_entities_repetition)*100)
print("Total of unique events: ", unique_event_repetition)
print("Percentage of unique events: ", (unique_event_repetition/total_event_repetition)*100)
print("\nFAC")
print("Total of extracted facilities: ", total_fac_repetition)
print("Percentage of extracted facilities: ", (total_fac_repetition/total_entities_repetition)*100)
print("Total of unique facilities: ", unique_fac_repetition)
print("Percentage of unique facilities: ", (unique_fac_repetition/total_fac_repetition)*100)
print("\nGPE")
print("Total of extracted countries or cities: ", total_gpe_repetition)
print("Percentage of extracted countries or cities: ", (total_gpe_repetition/total_entities_repetition)*100)
print("Total of unique countries or cities: ", unique_gpe_repetition)
print("Percentage of unique countries or cities: ", (unique_gpe_repetition/total_gpe_repetition)*100)
print("\nLANGUAGE")
print("Total of extracted languages: ", total_language_repetition)
print("Percentage of extracted languages: ", (total_language_repetition/total_entities_repetition)*100)
print("Total of unique languages: ", unique_language_repetition)
print("Percentage of unique languages: ", (unique_language_repetition/total_language_repetition)*100)
print("\nLAW")
print("Total of extracted laws: ", total_law_repetition)
print("Percentage of extracted laws: ", (total_law_repetition/total_entities_repetition)*100)
print("Total of unique laws: ", unique_law_repetition)
print("Percentage of unique laws: ", (unique_law_repetition/total_law_repetition)*100)
print("\nLOC")
print("Total of extracted generic locations: ", total_loc_repetition)
print("Percentage of extracted generic locations: ", (total_loc_repetition/total_entities_repetition)*100)
print("Total of unique generic locations: ", unique_loc_repetition)
print("Percentage of unique generic locations: ", (unique_loc_repetition/total_loc_repetition)*100)
print("\nMONEY")
print("Total of extracted money values: ", total_money_repetition)
print("Percentage of extracted money values: ", (total_money_repetition/total_entities_repetition)*100)
print("Total of unique money values: ", unique_money_repetition)
print("Percentage of unique money values: ", (unique_money_repetition/total_money_repetition)*100)
print("\nNORP")
print("Total of extracted nationalities, religious or political groups: ", total_norp_repetition)
print("Percentage of extracted nationalities, religious or political groups: ", (total_norp_repetition/total_entities_repetition)*100)
print("Total of unique nationalities, religious or political groups: ", unique_norp_repetition)
print("Percentage of unique nationalities, religious or political groups: ", (unique_norp_repetition/total_norp_repetition)*100)
print("\nORDINAL")
print("Total of extracted ordinal numbers: ", total_ordinal_repetition)
print("Percentage extracted ordinal numbers: ", (total_ordinal_repetition/total_entities_repetition)*100)
print("Total of unique ordinal numbers: ", unique_ordinal_repetition)
print("Percentage of unique ordinal numbers: ", (unique_ordinal_repetition/total_ordinal_repetition)*100)
print("\nORG")
print("Total of extracted companies or organizations: ", total_org_repetition)
print("Percentage extracted companies or organizations: ", (total_org_repetition/total_entities_repetition)*100)
print("Total of unique companies or organizations: ", unique_org_repetition)
print("Percentage of unique companies or organizations: ", (unique_org_repetition/total_org_repetition)*100)
print("\nPERCENT")
print("Total of extracted percentages: ", total_percent_repetition)
print("Percentage extracted percentages: ", (total_percent_repetition/total_entities_repetition)*100)
print("Total of unique percentages: ", unique_percent_repetition)
print("Percentage of unique percentages: ", (unique_percent_repetition/total_percent_repetition)*100)
print("\nPERSON")
print("Total of extracted people: ", total_person_repetition)
print("Percentage extracted people: ", (total_person_repetition/total_entities_repetition)*100)
print("Total of unique people: ", unique_person_repetition)
print("Percentage of unique people: ", (unique_person_repetition/total_person_repetition)*100)
print("\nPRODUCT")
print("Total of extracted products: ", total_product_repetition)
print("Percentage extracted products: ", (total_product_repetition/total_entities_repetition)*100)
print("Total of unique products: ", unique_product_repetition)
print("Percentage of unique products: ", (unique_product_repetition/total_product_repetition)*100)
print("\nQUANTITY")
print("Total of extracted measurements: ", total_quantity_repetition)
print("Percentage extracted measurements: ", (total_quantity_repetition/total_entities_repetition)*100)
print("Total of unique measurements: ", unique_quantity_repetition)
print("Percentage of unique measurements: ", (unique_quantity_repetition/total_quantity_repetition)*100)
print("\nTIME")
print("Total of extracted times: ", total_time_repetition)
print("Percentage of extracted times: ", (total_time_repetition/total_entities_repetition)*100)
print("Total of unique times: ", unique_time_repetition)
print("Percentage of unique times: ", (unique_time_repetition/total_time_repetition)*100)
print("\nWORK OF ART")
print("Total of extracted works of art: ", total_woa_repetition)
print("Percentage of extracted works of art: ", (total_woa_repetition/total_entities_repetition)*100)
print("Total of unique works of art: ", unique_woa_repetition)
print("Percentage of unique works of art: ", (unique_woa_repetition/total_woa_repetition)*100)

CARDINAL
Total of extracted cardinal numbers:  62
Percentage of extracted cardinal numbers:  7.302709069493522
Total of unique cardinal numbers:  35
Percentage of unique cardinal numbers:  56.451612903225815

DATE
Total of extracted dates:  94
Percentage of extracted dates:  11.071849234393403
Total of unique dates:  77
Percentage of unique dates:  81.91489361702128

EVENT
Total of extracted events:  1
Percentage of extracted events:  0.11778563015312131
Total of unique events:  1
Percentage of unique events:  100.0

FAC
Total of extracted facilities:  12
Percentage of extracted facilities:  1.4134275618374559
Total of unique facilities:  7
Percentage of unique facilities:  58.333333333333336

GPE
Total of extracted countries or cities:  154
Percentage of extracted countries or cities:  18.138987043580684
Total of unique countries or cities:  63
Percentage of unique countries or cities:  40.909090909090914

LANGUAGE
Total of extracted languages:  1
Percentage of extracted languages:  0

In [254]:
repetition_ents_count = Counter()

for ent in total_repetition_text.ents:
    repetition_ents_count[f"{ent.label_}: {ent.text}"] += 1

print("MOST COMMON ENTITIES")
for key, val in repetition_ents_count.most_common(30):
    print(val, key, sep="\t")

MOST COMMON ENTITIES
21	GPE: US
13	CARDINAL: one
11	GPE: Iran
10	ORG: Trump
10	ORG: FBI
10	ORG: Church
9	GPE: Russia
9	ORG: Council
8	NORP: Russian
8	PERSON: Trump
7	ORG: Islam
7	CARDINAL: two
6	GPE: U.S.
6	NORP: Muslim
6	GPE: Rome
6	FAC: Vatican
5	ORG: the Justice Department
5	GPE: Syria
5	LOC: Europe
5	GPE: the United States
5	ORDINAL: first
5	GPE: China
5	GPE: Cuba
5	PERSON: Jean
5	PERSON: Perez
4	DATE: 2016
4	PERSON: Francis
4	PERSON: Saris
4	PERSON: Putin
4	DATE: today


In [255]:
repetition_cardinal_count = Counter()
repetition_date_count = Counter()
repetition_event_count = Counter()
repetition_fac_count = Counter()
repetition_gpe_count = Counter()
repetition_language_count = Counter()
repetition_law_count = Counter()
repetition_loc_count = Counter()
repetition_money_count = Counter()
repetition_norp_count = Counter()
repetition_ordinal_count = Counter()
repetition_org_count = Counter()
repetition_percent_count = Counter()
repetition_person_count = Counter()
repetition_product_count = Counter()
repetition_quantity_count = Counter()
repetition_time_count = Counter()
repetition_woa_count = Counter()

for ent in total_repetition_text.ents:
    if (ent.label_ == "CARDINAL"):
        repetition_cardinal_count[f"{ent.label_}: {ent.text}"] += 1
    if (ent.label_ == "DATE"):
        repetition_date_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "EVENT"):
        repetition_event_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "FAC"):
        repetition_fac_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "GPE"):
        repetition_gpe_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "LANGUAGE"):
        repetition_language_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "LAW"):
        repetition_law_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "LOC"):
        repetition_loc_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "MONEY"):
        repetition_money_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "NORP"):
        repetition_norp_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "ORDINAL"):
        repetition_ordinal_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "ORG"):
        repetition_org_count[f"{ent.label_}: {ent.text}"] += 1
    if (ent.label_ == "PERCENT"):
        repetition_percent_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "PERSON"):
        repetition_person_count[f"{ent.label_}: {ent.text}"] += 1
    if (ent.label_ == "PRODUCT"):
        repetition_product_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "QUANTITY"):
        repetition_quantity_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "TIME"):
        repetition_time_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "WORK_OF_ART"):
        repetition_woa_count[f"{ent.label_}: {ent.text}"] += 1 

In [256]:
print("MOST COMMON CARDINALS")        
for key, val in repetition_cardinal_count.most_common(10):
    print(val, key, sep="\t")
    
print("\nMOST COMMON DATES")        
for key, val in repetition_date_count.most_common(10):
    print(val, key, sep="\t")

print("\nMOST COMMON EVENTS")        
for key, val in repetition_event_count.most_common(10):
    print(val, key, sep="\t")  
    
print("\nMOST COMMON FACILITIES")        
for key, val in repetition_fac_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON COUNTRIES OR CITIES")        
for key, val in repetition_gpe_count.most_common(15):
    print(val, key, sep="\t")  

print("\nMOST COMMON LANGUAGES")        
for key, val in repetition_language_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON LAWS")        
for key, val in repetition_law_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON LOCATIONS")        
for key, val in repetition_loc_count.most_common(10):
    print(val, key, sep="\t")

print("\nMOST COMMON MONEY VALUES")        
for key, val in repetition_money_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON NATIONALITIES OR GROUPS")        
for key, val in repetition_norp_count.most_common(20):
    print(val, key, sep="\t")
    
print("\nMOST COMMON ORDINALS")        
for key, val in repetition_ordinal_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON ORGANIZATIONS")        
for key, val in repetition_org_count.most_common(20):
    print(val, key, sep="\t")  

print("\nMOST COMMON PERCENTAGES")        
for key, val in repetition_percent_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON PEOPLE")        
for key, val in repetition_person_count.most_common(20):
    print(val, key, sep="\t")
    
print("\nMOST COMMON PRODUCTS")        
for key, val in repetition_product_count.most_common(10):
    print(val, key, sep="\t")
    
print("\nMOST COMMON QUANTITIES")        
for key, val in repetition_quantity_count.most_common(10):
    print(val, key, sep="\t")
    
print("\nMOST COMMON TIMES")        
for key, val in repetition_time_count.most_common(10):
    print(val, key, sep="\t")  
    
print("\nMOST COMMON WORK OF ARTS")        
for key, val in repetition_woa_count.most_common(10):
    print(val, key, sep="\t")

MOST COMMON CARDINALS
13	CARDINAL: one
7	CARDINAL: two
4	CARDINAL: three
3	CARDINAL: #
3	CARDINAL: One
2	CARDINAL: 911
2	CARDINAL: six
1	CARDINAL: 2011.Kim
1	CARDINAL: more than 1,300
1	CARDINAL: almost 1 million

MOST COMMON DATES
4	DATE: 2016
4	DATE: today
4	DATE: 2017
3	DATE: last year
3	DATE: 2009
2	DATE: 2010
2	DATE: the years
2	DATE: 5 years
2	DATE: the coming days
1	DATE: this year

MOST COMMON EVENTS
1	EVENT: the US Constitution

MOST COMMON FACILITIES
6	FAC: Vatican
1	FAC: Alexandria’s
1	FAC: St. Cyr
1	FAC: EOIR
1	FAC: Vatican II
1	FAC: Reva Street
1	FAC: the Otay Mesa Port of Entry

MOST COMMON COUNTRIES OR CITIES
21	GPE: US
11	GPE: Iran
9	GPE: Russia
6	GPE: U.S.
6	GPE: Rome
5	GPE: Syria
5	GPE: the United States
5	GPE: China
5	GPE: Cuba
4	GPE: Paris
3	GPE: Israel
3	GPE: Indonesia
3	GPE: Sara
3	GPE: Las Vegas
3	GPE: San Antonio

MOST COMMON LANGUAGES
1	LANGUAGE: English

MOST COMMON LAWS
1	LAW: Sharia

MOST COMMON LOCATIONS
5	LOC: Europe
2	LOC: Earth
1	LOC: Saris
1	LOC: Mass
1

### Doubt

This first section will focus on the extraction of the information described above focusing on the Doubt class.

In [257]:
total_entities_dict_doubt = {key: list(g) for key, g in groupby(sorted(total_doubt_text.ents, key=lambda x: x.label_), lambda x: x.label_)}
print("ALL ENTITIES")
print("All categories extracted: ", total_entities_dict_doubt.keys())
print("Number of categories extracted: ", len(total_entities_dict_doubt))

total_entities_value_list_doubt = list()
for i in total_entities_dict_doubt.values():
    total_entities_value_list_doubt.append(i)
    
total_entities_doubt= len(sum(total_entities_value_list_doubt, []))
print("Total number of entities: ", total_entities_doubt)

unique_entities_dict_doubt = {key: list(set(map(lambda x: str(x), g))) for key, g in groupby(sorted(total_doubt_text.ents, key=lambda x: x.label_), lambda x: x.label_)}

unique_entities_value_list_doubt = list()
for i in unique_entities_dict_doubt.values():
    unique_entities_value_list_doubt.append(i)
    
unique_entities_doubt = len(sum(unique_entities_value_list_doubt, []))
print("Total number of unique entities: ", unique_entities_doubt)
print("Percentage of unique entities: ", (unique_entities_doubt/total_entities_doubt)*100)

ALL ENTITIES
All categories extracted:  dict_keys(['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LAW', 'LOC', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'TIME', 'WORK_OF_ART'])
Number of categories extracted:  15
Total number of entities:  820
Total number of unique entities:  489
Percentage of unique entities:  59.63414634146341


In [258]:
# Counting the total occurrences for each label
total_cardinal_doubt = len(total_entities_dict_doubt['CARDINAL'])
total_date_doubt = len(total_entities_dict_doubt['DATE'])
total_event_doubt = len(total_entities_dict_doubt['EVENT'])
total_fac_doubt = len(total_entities_dict_doubt['FAC'])
total_gpe_doubt = len(total_entities_dict_doubt['GPE'])
total_law_doubt = len(total_entities_dict_doubt['LAW'])
total_loc_doubt = len(total_entities_dict_doubt['LOC'])
total_norp_doubt = len(total_entities_dict_doubt['NORP'])
total_ordinal_doubt = len(total_entities_dict_doubt['ORDINAL'])
total_org_doubt = len(total_entities_dict_doubt['ORG'])
total_percent_doubt = len(total_entities_dict_doubt['PERCENT'])
total_person_doubt = len(total_entities_dict_doubt['PERSON'])
total_product_doubt = len(total_entities_dict_doubt['PRODUCT'])
total_time_doubt = len(total_entities_dict_doubt['TIME'])
total_woa_doubt = len(total_entities_dict_doubt['WORK_OF_ART'])

In [259]:
# Counting the unique occurrences for each label
unique_cardinal_doubt = len(unique_entities_dict_doubt['CARDINAL'])
unique_date_doubt = len(unique_entities_dict_doubt['DATE'])
unique_event_doubt = len(unique_entities_dict_doubt['EVENT'])
unique_fac_doubt = len(unique_entities_dict_doubt['FAC'])
unique_gpe_doubt = len(unique_entities_dict_doubt['GPE'])
unique_law_doubt = len(unique_entities_dict_doubt['LAW'])
unique_loc_doubt = len(unique_entities_dict_doubt['LOC'])
unique_norp_doubt = len(unique_entities_dict_doubt['NORP'])
unique_ordinal_doubt = len(unique_entities_dict_doubt['ORDINAL'])
unique_org_doubt = len(unique_entities_dict_doubt['ORG'])
unique_percent_doubt = len(unique_entities_dict_doubt['PERCENT'])
unique_person_doubt = len(unique_entities_dict_doubt['PERSON'])
unique_product_doubt = len(unique_entities_dict_doubt['PRODUCT'])
unique_time_doubt = len(unique_entities_dict_doubt['TIME'])
unique_woa_doubt = len(unique_entities_dict_doubt['WORK_OF_ART'])

In [260]:
print("CARDINAL")
print("Total of extracted cardinal numbers: ", total_cardinal_doubt)
print("Percentage of extracted cardinal numbers: ", (total_cardinal_doubt/total_entities_doubt)*100)
print("Total of unique cardinal numbers: ", unique_cardinal_doubt)
print("Percentage of unique cardinal numbers: ", (unique_cardinal_doubt/total_cardinal_doubt)*100)
print("\nDATE")
print("Total of extracted dates: ", total_date_doubt)
print("Percentage of extracted dates: ", (total_date_doubt/total_entities_doubt)*100)
print("Total of unique dates: ", unique_date_doubt)
print("Percentage of unique dates: ", (unique_date_doubt/total_date_doubt)*100)
print("\nEVENT")
print("Total of extracted events: ", total_event_doubt)
print("Percentage of extracted events: ", (total_event_doubt/total_entities_doubt)*100)
print("Total of unique events: ", unique_event_doubt)
print("Percentage of unique events: ", (unique_event_doubt/total_event_doubt)*100)
print("\nFAC")
print("Total of extracted facilities: ", total_fac_doubt)
print("Percentage of extracted facilities: ", (total_fac_doubt/total_entities_doubt)*100)
print("Total of unique facilities: ", unique_fac_doubt)
print("Percentage of unique facilities: ", (unique_fac_doubt/total_fac_doubt)*100)
print("\nGPE")
print("Total of extracted countries or cities: ", total_gpe_doubt)
print("Percentage of extracted countries or cities: ", (total_gpe_doubt/total_entities_doubt)*100)
print("Total of unique countries or cities: ", unique_gpe_doubt)
print("Percentage of unique countries or cities: ", (unique_gpe_doubt/total_gpe_doubt)*100)
print("\nLAW")
print("Total of extracted laws: ", total_law_doubt)
print("Percentage of extracted laws: ", (total_law_doubt/total_entities_doubt)*100)
print("Total of unique laws: ", unique_law_doubt)
print("Percentage of unique laws: ", (unique_law_doubt/total_law_doubt)*100)
print("\nLOC")
print("Total of extracted generic locations: ", total_loc_doubt)
print("Percentage of extracted generic locations: ", (total_loc_doubt/total_entities_doubt)*100)
print("Total of unique generic locations: ", unique_loc_doubt)
print("Percentage of unique generic locations: ", (unique_loc_doubt/total_loc_doubt)*100)
print("\nNORP")
print("Total of extracted nationalities, religious or political groups: ", total_norp_doubt)
print("Percentage of extracted nationalities, religious or political groups: ", (total_norp_doubt/total_entities_doubt)*100)
print("Total of unique nationalities, religious or political groups: ", unique_norp_doubt)
print("Percentage of unique nationalities, religious or political groups: ", (unique_norp_doubt/total_norp_doubt)*100)
print("\nORDINAL")
print("Total of extracted ordinal numbers: ", total_ordinal_doubt)
print("Percentage extracted ordinal numbers: ", (total_ordinal_doubt/total_entities_doubt)*100)
print("Total of unique ordinal numbers: ", unique_ordinal_doubt)
print("Percentage of unique ordinal numbers: ", (unique_ordinal_doubt/total_ordinal_doubt)*100)
print("\nORG")
print("Total of extracted companies or organizations: ", total_org_doubt)
print("Percentage extracted companies or organizations: ", (total_org_doubt/total_entities_doubt)*100)
print("Total of unique companies or organizations: ", unique_org_doubt)
print("Percentage of unique companies or organizations: ", (unique_org_doubt/total_org_doubt)*100)
print("\nPERCENT")
print("Total of extracted percentages: ", total_percent_doubt)
print("Percentage extracted percentages: ", (total_percent_doubt/total_entities_doubt)*100)
print("Total of unique percentages: ", unique_percent_doubt)
print("Percentage of unique percentages: ", (unique_percent_doubt/total_percent_doubt)*100)
print("\nPERSON")
print("Total of extracted people: ", total_person_doubt)
print("Percentage extracted people: ", (total_person_doubt/total_entities_doubt)*100)
print("Total of unique people: ", unique_person_doubt)
print("Percentage of unique people: ", (unique_person_doubt/total_person_doubt)*100)
print("\nPRODUCT")
print("Total of extracted products: ", total_product_doubt)
print("Percentage extracted products: ", (total_product_doubt/total_entities_doubt)*100)
print("Total of unique products: ", unique_product_doubt)
print("Percentage of unique products: ", (unique_product_doubt/total_product_doubt)*100)
print("\nTIME")
print("Total of extracted times: ", total_time_doubt)
print("Percentage of extracted times: ", (total_time_doubt/total_entities_doubt)*100)
print("Total of unique times: ", unique_time_doubt)
print("Percentage of unique times: ", (unique_time_doubt/total_time_doubt)*100)
print("\nWORK OF ART")
print("Total of extracted works of art: ", total_woa_doubt)
print("Percentage of extracted works of art: ", (total_woa_doubt/total_entities_doubt)*100)
print("Total of unique works of art: ", unique_woa_doubt)
print("Percentage of unique works of art: ", (unique_woa_doubt/total_woa_doubt)*100)

CARDINAL
Total of extracted cardinal numbers:  51
Percentage of extracted cardinal numbers:  6.219512195121951
Total of unique cardinal numbers:  33
Percentage of unique cardinal numbers:  64.70588235294117

DATE
Total of extracted dates:  99
Percentage of extracted dates:  12.073170731707316
Total of unique dates:  78
Percentage of unique dates:  78.78787878787878

EVENT
Total of extracted events:  4
Percentage of extracted events:  0.4878048780487805
Total of unique events:  4
Percentage of unique events:  100.0

FAC
Total of extracted facilities:  8
Percentage of extracted facilities:  0.975609756097561
Total of unique facilities:  5
Percentage of unique facilities:  62.5

GPE
Total of extracted countries or cities:  112
Percentage of extracted countries or cities:  13.658536585365855
Total of unique countries or cities:  60
Percentage of unique countries or cities:  53.57142857142857

LAW
Total of extracted laws:  7
Percentage of extracted laws:  0.853658536585366
Total of unique l

In [261]:
doubt_ents_count = Counter()

for ent in total_doubt_text.ents:
    doubt_ents_count[f"{ent.label_}: {ent.text}"] += 1

print("MOST COMMON ENTITIES")
for key, val in doubt_ents_count.most_common(30):
    print(val, key, sep="\t")

MOST COMMON ENTITIES
17	PERSON: Campos
12	GPE: Las Vegas
12	ORG: FBI
11	PERSON: Paddock
10	ORG: Trump
10	LOC: Mandalay Bay
9	PERSON: Francis
9	CARDINAL: one
9	ORG: Assange
9	ORG: Guardian
8	GPE: Iran
8	GPE: Syria
8	ORDINAL: first
8	PERSON: Ford
7	CARDINAL: two
7	ORG: Lambert
6	ORG: ISIS
6	PERSON: Lombardo
6	DATE: Friday
6	GPE: U.S.
6	ORG: UN
6	ORG: Ford
6	PERSON: Gillum
5	ORG: Kavanaugh
5	DATE: Thursday
5	PERSON: Stephen Paddock
5	ORG: TENEX
5	GPE: Russia
4	ORG: White House
4	TIME: 9:59 p.m.


In [262]:
doubt_cardinal_count = Counter()
doubt_date_count = Counter()
doubt_event_count = Counter()
doubt_fac_count = Counter()
doubt_gpe_count = Counter()
doubt_law_count = Counter()
doubt_loc_count = Counter()
doubt_norp_count = Counter()
doubt_ordinal_count = Counter()
doubt_org_count = Counter()
doubt_percent_count = Counter()
doubt_person_count = Counter()
doubt_product_count = Counter()
doubt_time_count = Counter()
doubt_woa_count = Counter()

for ent in total_doubt_text.ents:
    if (ent.label_ == "CARDINAL"):
        doubt_cardinal_count[f"{ent.label_}: {ent.text}"] += 1
    if (ent.label_ == "DATE"):
        doubt_date_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "EVENT"):
        doubt_event_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "FAC"):
        doubt_fac_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "GPE"):
        doubt_gpe_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "LAW"):
        doubt_law_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "LOC"):
        doubt_loc_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "NORP"):
        doubt_norp_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "ORDINAL"):
        doubt_ordinal_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "ORG"):
        doubt_org_count[f"{ent.label_}: {ent.text}"] += 1
    if (ent.label_ == "PERCENT"):
        doubt_percent_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "PERSON"):
        doubt_person_count[f"{ent.label_}: {ent.text}"] += 1
    if (ent.label_ == "PRODUCT"):
        doubt_product_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "TIME"):
        doubt_time_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "WORK_OF_ART"):
        doubt_woa_count[f"{ent.label_}: {ent.text}"] += 1 

In [263]:
print("MOST COMMON CARDINALS")        
for key, val in doubt_cardinal_count.most_common(10):
    print(val, key, sep="\t")
    
print("\nMOST COMMON DATES")        
for key, val in doubt_date_count.most_common(10):
    print(val, key, sep="\t")

print("\nMOST COMMON EVENTS")        
for key, val in doubt_event_count.most_common(10):
    print(val, key, sep="\t")  
    
print("\nMOST COMMON FACILITIES")        
for key, val in doubt_fac_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON COUNTRIES OR CITIES")        
for key, val in doubt_gpe_count.most_common(15):
    print(val, key, sep="\t")  

print("\nMOST COMMON LAWS")        
for key, val in doubt_law_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON LOCATIONS")        
for key, val in doubt_loc_count.most_common(10):
    print(val, key, sep="\t") 

print("\nMOST COMMON NATIONALITIES OR GROUPS")        
for key, val in doubt_norp_count.most_common(20):
    print(val, key, sep="\t")
    
print("\nMOST COMMON ORDINALS")        
for key, val in doubt_ordinal_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON ORGANIZATIONS")        
for key, val in doubt_org_count.most_common(20):
    print(val, key, sep="\t")  

print("\nMOST COMMON PERCENTAGES")        
for key, val in doubt_percent_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON PEOPLE")        
for key, val in doubt_person_count.most_common(20):
    print(val, key, sep="\t")
    
print("\nMOST COMMON PRODUCTS")        
for key, val in doubt_product_count.most_common(10):
    print(val, key, sep="\t")
        
print("\nMOST COMMON TIMES")        
for key, val in doubt_time_count.most_common(10):
    print(val, key, sep="\t")  
    
print("\nMOST COMMON WORK OF ARTS")        
for key, val in doubt_woa_count.most_common(10):
    print(val, key, sep="\t")

MOST COMMON CARDINALS
9	CARDINAL: one
7	CARDINAL: two
3	CARDINAL: four
2	CARDINAL: dozens
2	CARDINAL: 11
1	CARDINAL: 59
1	CARDINAL: over 500
1	CARDINAL: hundreds
1	CARDINAL: 32
1	CARDINAL: 911

MOST COMMON DATES
6	DATE: Friday
5	DATE: Thursday
4	DATE: Monday
4	DATE: 2016
3	DATE: today
2	DATE: Tuesday
2	DATE: Oct. 1
2	DATE: October
2	DATE: daily
1	DATE: Wednesday

MOST COMMON EVENTS
1	EVENT: Geneva -
1	EVENT: the Great Western Schism
1	EVENT: the Third Secret
1	EVENT: the Second Amendment

MOST COMMON FACILITIES
3	FAC: Metro
2	FAC: Route 91 Harvest
1	FAC: Noah Lew
1	FAC: Travis Air Force
1	FAC: Vatican

MOST COMMON COUNTRIES OR CITIES
12	GPE: Las Vegas
8	GPE: Iran
8	GPE: Syria
6	GPE: U.S.
5	GPE: Russia
4	GPE: Australia
3	GPE: Baghdad
3	GPE: Damascus
3	GPE: Obama
2	GPE: Israel
2	GPE: Msgr
2	GPE: Afghanistan
2	GPE: Vadim Mikerin
2	GPE: Rome
2	GPE: US

MOST COMMON LAWS
1	LAW: the Real Presence in the Mass - in the Sacrifice of the Mass
1	LAW: the Foreign Corrupt Practices Act
1	LAW: Articl

### Appeal_to_Fear-Prejudice

This first section will focus on the extraction of the information described above focusing on the Appeal_to_Fear-Prejudice class.

In [264]:
total_entities_dict_prejudice = {key: list(g) for key, g in groupby(sorted(total_prejudice_text.ents, key=lambda x: x.label_), lambda x: x.label_)}
print("ALL ENTITIES")
print("All categories extracted: ", total_entities_dict_prejudice.keys())
print("Number of categories extracted: ", len(total_entities_dict_prejudice))

total_entities_value_list_prejudice = list()
for i in total_entities_dict_prejudice.values():
    total_entities_value_list_prejudice.append(i)
    
total_entities_prejudice = len(sum(total_entities_value_list_prejudice, []))
print("Total number of entities: ", total_entities_prejudice)

unique_entities_dict_prejudice = {key: list(set(map(lambda x: str(x), g))) for key, g in groupby(sorted(total_prejudice_text.ents, key=lambda x: x.label_), lambda x: x.label_)}

unique_entities_value_list_prejudice = list()
for i in unique_entities_dict_prejudice.values():
    unique_entities_value_list_prejudice.append(i)
    
unique_entities_prejudice = len(sum(unique_entities_value_list_prejudice, []))
print("Total number of unique entities: ", unique_entities_prejudice)
print("Percentage of unique entities: ", (unique_entities_prejudice/total_entities_prejudice)*100)

ALL ENTITIES
All categories extracted:  dict_keys(['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'TIME', 'WORK_OF_ART'])
Number of categories extracted:  16
Total number of entities:  491
Total number of unique entities:  302
Percentage of unique entities:  61.5071283095723


In [265]:
# Counting the total occurrences for each label
total_cardinal_prejudice = len(total_entities_dict_prejudice['CARDINAL'])
total_date_prejudice = len(total_entities_dict_prejudice['DATE'])
total_event_prejudice = len(total_entities_dict_prejudice['EVENT'])
total_fac_prejudice = len(total_entities_dict_prejudice['FAC'])
total_gpe_prejudice = len(total_entities_dict_prejudice['GPE'])
total_law_prejudice = len(total_entities_dict_prejudice['LAW'])
total_loc_prejudice = len(total_entities_dict_prejudice['LOC'])
total_money_prejudice = len(total_entities_dict_prejudice['MONEY'])
total_norp_prejudice = len(total_entities_dict_prejudice['NORP'])
total_ordinal_prejudice = len(total_entities_dict_prejudice['ORDINAL'])
total_org_prejudice = len(total_entities_dict_prejudice['ORG'])
total_percent_prejudice = len(total_entities_dict_prejudice['PERCENT'])
total_person_prejudice = len(total_entities_dict_prejudice['PERSON'])
total_product_prejudice = len(total_entities_dict_prejudice['PRODUCT'])
total_time_prejudice = len(total_entities_dict_prejudice['TIME'])
total_woa_prejudice = len(total_entities_dict_prejudice['WORK_OF_ART'])

In [266]:
# Counting the unique occurrences for each label
unique_cardinal_prejudice = len(unique_entities_dict_prejudice['CARDINAL'])
unique_date_prejudice = len(unique_entities_dict_prejudice['DATE'])
unique_event_prejudice = len(unique_entities_dict_prejudice['EVENT'])
unique_fac_prejudice = len(unique_entities_dict_prejudice['FAC'])
unique_gpe_prejudice = len(unique_entities_dict_prejudice['GPE'])
unique_law_prejudice = len(unique_entities_dict_prejudice['LAW'])
unique_loc_prejudice = len(unique_entities_dict_prejudice['LOC'])
unique_money_prejudice = len(unique_entities_dict_prejudice['MONEY'])
unique_norp_prejudice = len(unique_entities_dict_prejudice['NORP'])
unique_ordinal_prejudice = len(unique_entities_dict_prejudice['ORDINAL'])
unique_org_prejudice = len(unique_entities_dict_prejudice['ORG'])
unique_percent_prejudice = len(unique_entities_dict_prejudice['PERCENT'])
unique_person_prejudice = len(unique_entities_dict_prejudice['PERSON'])
unique_product_prejudice = len(unique_entities_dict_prejudice['PRODUCT'])
unique_time_prejudice = len(unique_entities_dict_prejudice['TIME'])
unique_woa_prejudice = len(unique_entities_dict_prejudice['WORK_OF_ART'])

In [267]:
print("CARDINAL")
print("Total of extracted cardinal numbers: ", total_cardinal_prejudice)
print("Percentage of extracted cardinal numbers: ", (total_cardinal_prejudice/total_entities_prejudice)*100)
print("Total of unique cardinal numbers: ", unique_cardinal_prejudice)
print("Percentage of unique cardinal numbers: ", (unique_cardinal_prejudice/total_cardinal_prejudice)*100)
print("\nDATE")
print("Total of extracted dates: ", total_date_prejudice)
print("Percentage of extracted dates: ", (total_date_prejudice/total_entities_prejudice)*100)
print("Total of unique dates: ", unique_date_prejudice)
print("Percentage of unique dates: ", (unique_date_prejudice/total_date_prejudice)*100)
print("\nEVENT")
print("Total of extracted events: ", total_event_prejudice)
print("Percentage of extracted events: ", (total_event_prejudice/total_entities_prejudice)*100)
print("Total of unique events: ", unique_event_prejudice)
print("Percentage of unique events: ", (unique_event_prejudice/total_event_prejudice)*100)
print("\nFAC")
print("Total of extracted facilities: ", total_fac_prejudice)
print("Percentage of extracted facilities: ", (total_fac_prejudice/total_entities_prejudice)*100)
print("Total of unique facilities: ", unique_fac_prejudice)
print("Percentage of unique facilities: ", (unique_fac_prejudice/total_fac_prejudice)*100)
print("\nGPE")
print("Total of extracted countries or cities: ", total_gpe_prejudice)
print("Percentage of extracted countries or cities: ", (total_gpe_prejudice/total_entities_prejudice)*100)
print("Total of unique countries or cities: ", unique_gpe_prejudice)
print("Percentage of unique countries or cities: ", (unique_gpe_prejudice/total_gpe_prejudice)*100)
print("\nLAW")
print("Total of extracted laws: ", total_law_prejudice)
print("Percentage of extracted laws: ", (total_law_prejudice/total_entities_prejudice)*100)
print("Total of unique laws: ", unique_law_prejudice)
print("Percentage of unique laws: ", (unique_law_prejudice/total_law_prejudice)*100)
print("\nLOC")
print("Total of extracted generic locations: ", total_loc_prejudice)
print("Percentage of extracted generic locations: ", (total_loc_prejudice/total_entities_prejudice)*100)
print("Total of unique generic locations: ", unique_loc_prejudice)
print("Percentage of unique generic locations: ", (unique_loc_prejudice/total_loc_prejudice)*100)
print("\nMONEY")
print("Total of extracted money values: ", total_money_prejudice)
print("Percentage of extracted money values: ", (total_money_prejudice/total_entities_prejudice)*100)
print("Total of unique money values: ", unique_money_prejudice)
print("Percentage of unique money values: ", (unique_money_prejudice/total_money_prejudice)*100)
print("\nNORP")
print("Total of extracted nationalities, religious or political groups: ", total_norp_prejudice)
print("Percentage of extracted nationalities, religious or political groups: ", (total_norp_prejudice/total_entities_prejudice)*100)
print("Total of unique nationalities, religious or political groups: ", unique_norp_prejudice)
print("Percentage of unique nationalities, religious or political groups: ", (unique_norp_prejudice/total_norp_prejudice)*100)
print("\nORDINAL")
print("Total of extracted ordinal numbers: ", total_ordinal_prejudice)
print("Percentage extracted ordinal numbers: ", (total_ordinal_loaded/total_entities_prejudice)*100)
print("Total of unique ordinal numbers: ", unique_ordinal_prejudice)
print("Percentage of unique ordinal numbers: ", (unique_ordinal_prejudice/total_ordinal_prejudice)*100)
print("\nORG")
print("Total of extracted companies or organizations: ", total_org_prejudice)
print("Percentage extracted companies or organizations: ", (total_org_prejudice/total_entities_prejudice)*100)
print("Total of unique companies or organizations: ", unique_org_prejudice)
print("Percentage of unique companies or organizations: ", (unique_org_prejudice/total_org_prejudice)*100)
print("\nPERCENT")
print("Total of extracted percentages: ", total_percent_prejudice)
print("Percentage extracted percentages: ", (total_percent_prejudice/total_entities_prejudice)*100)
print("Total of unique percentages: ", unique_percent_prejudice)
print("Percentage of unique percentages: ", (unique_percent_prejudice/total_percent_prejudice)*100)
print("\nPERSON")
print("Total of extracted people: ", total_person_prejudice)
print("Percentage extracted people: ", (total_person_prejudice/total_entities_prejudice)*100)
print("Total of unique people: ", unique_person_prejudice)
print("Percentage of unique people: ", (unique_person_prejudice/total_person_prejudice)*100)
print("\nPRODUCT")
print("Total of extracted products: ", total_product_prejudice)
print("Percentage extracted products: ", (total_product_prejudice/total_entities_prejudice)*100)
print("Total of unique products: ", unique_product_prejudice)
print("Percentage of unique products: ", (unique_product_prejudice/total_product_prejudice)*100)
print("\nTIME")
print("Total of extracted times: ", total_time_prejudice)
print("Percentage of extracted times: ", (total_time_prejudice/total_entities_prejudice)*100)
print("Total of unique times: ", unique_time_prejudice)
print("Percentage of unique times: ", (unique_time_prejudice/total_time_prejudice)*100)
print("\nWORK OF ART")
print("Total of extracted works of art: ", total_woa_prejudice)
print("Percentage of extracted works of art: ", (total_woa_prejudice/total_entities_prejudice)*100)
print("Total of unique works of art: ", unique_woa_prejudice)
print("Percentage of unique works of art: ", (unique_woa_prejudice/total_woa_prejudice)*100)

CARDINAL
Total of extracted cardinal numbers:  28
Percentage of extracted cardinal numbers:  5.7026476578411405
Total of unique cardinal numbers:  18
Percentage of unique cardinal numbers:  64.28571428571429

DATE
Total of extracted dates:  43
Percentage of extracted dates:  8.757637474541752
Total of unique dates:  30
Percentage of unique dates:  69.76744186046511

EVENT
Total of extracted events:  3
Percentage of extracted events:  0.6109979633401221
Total of unique events:  3
Percentage of unique events:  100.0

FAC
Total of extracted facilities:  1
Percentage of extracted facilities:  0.20366598778004072
Total of unique facilities:  1
Percentage of unique facilities:  100.0

GPE
Total of extracted countries or cities:  135
Percentage of extracted countries or cities:  27.494908350305497
Total of unique countries or cities:  55
Percentage of unique countries or cities:  40.74074074074074

LAW
Total of extracted laws:  1
Percentage of extracted laws:  0.20366598778004072
Total of uni

In [268]:
prejudice_ents_count = Counter()

for ent in total_prejudice_text.ents:
    prejudice_ents_count[f"{ent.label_}: {ent.text}"] += 1

print("MOST COMMON ENTITIES")
for key, val in prejudice_ents_count.most_common(30):
    print(val, key, sep="\t")

MOST COMMON ENTITIES
22	GPE: Iran
13	GPE: U.S.
10	GPE: Syria
8	ORG: Church
8	GPE: the United States
7	PERSON: Trump
7	CARDINAL: one
6	GPE: Iraq
6	PERSON: Donald Trump
6	GPE: UK
5	NORP: Muslim
5	GPE: US
5	ORG: Trump
4	ORG: ISIS
4	DATE: Tuesday
4	NORP: Syrian
4	ORG: Hezbollah
4	LOC: Africa
4	DATE: May
4	LOC: Europe
4	NORP: Islamic
4	NORP: Muslims
4	GPE: Khashoggi
3	GPE: Madagascar
3	NORP: Iranian
3	NORP: British
3	NORP: German
3	ORDINAL: first
3	PERSON: Bagheri
3	NORP: African


In [269]:
prejudice_cardinal_count = Counter()
prejudice_date_count = Counter()
prejudice_event_count = Counter()
prejudice_fac_count = Counter()
prejudice_gpe_count = Counter()
prejudice_law_count = Counter()
prejudice_loc_count = Counter()
prejudice_money_count = Counter()
prejudice_norp_count = Counter()
prejudice_ordinal_count = Counter()
prejudice_org_count = Counter()
prejudice_percent_count = Counter()
prejudice_person_count = Counter()
prejudice_product_count = Counter()
prejudice_time_count = Counter()
prejudice_woa_count = Counter()

for ent in total_prejudice_text.ents:
    if (ent.label_ == "CARDINAL"):
        prejudice_cardinal_count[f"{ent.label_}: {ent.text}"] += 1
    if (ent.label_ == "DATE"):
        prejudice_date_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "EVENT"):
        prejudice_event_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "FAC"):
        prejudice_fac_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "GPE"):
        prejudice_gpe_count[f"{ent.label_}: {ent.text}"] += 1  
    if (ent.label_ == "LAW"):
        prejudice_law_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "LOC"):
        prejudice_loc_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "MONEY"):
        prejudice_money_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "NORP"):
        prejudice_norp_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "ORDINAL"):
        prejudice_ordinal_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "ORG"):
        prejudice_org_count[f"{ent.label_}: {ent.text}"] += 1
    if (ent.label_ == "PERCENT"):
        prejudice_percent_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "PERSON"):
        prejudice_person_count[f"{ent.label_}: {ent.text}"] += 1
    if (ent.label_ == "PRODUCT"):
        prejudice_product_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "TIME"):
        prejudice_time_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "WORK_OF_ART"):
        prejudice_woa_count[f"{ent.label_}: {ent.text}"] += 1 

In [270]:
print("MOST COMMON CARDINALS")        
for key, val in prejudice_cardinal_count.most_common(10):
    print(val, key, sep="\t")
    
print("\nMOST COMMON DATES")        
for key, val in prejudice_date_count.most_common(10):
    print(val, key, sep="\t")

print("\nMOST COMMON EVENTS")        
for key, val in prejudice_event_count.most_common(10):
    print(val, key, sep="\t")  
    
print("\nMOST COMMON FACILITIES")        
for key, val in prejudice_fac_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON COUNTRIES OR CITIES")        
for key, val in prejudice_gpe_count.most_common(15):
    print(val, key, sep="\t") 

print("\nMOST COMMON LAWS")        
for key, val in prejudice_law_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON LOCATIONS")        
for key, val in prejudice_loc_count.most_common(10):
    print(val, key, sep="\t")

print("\nMOST COMMON MONEY VALUES")        
for key, val in prejudice_money_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON NATIONALITIES OR GROUPS")        
for key, val in prejudice_norp_count.most_common(20):
    print(val, key, sep="\t")
    
print("\nMOST COMMON ORDINALS")        
for key, val in prejudice_ordinal_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON ORGANIZATIONS")        
for key, val in prejudice_org_count.most_common(20):
    print(val, key, sep="\t")  

print("\nMOST COMMON PERCENTAGES")        
for key, val in prejudice_percent_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON PEOPLE")        
for key, val in prejudice_person_count.most_common(20):
    print(val, key, sep="\t")
    
print("\nMOST COMMON PRODUCTS")        
for key, val in prejudice_product_count.most_common(10):
    print(val, key, sep="\t")
    
print("\nMOST COMMON TIMES")        
for key, val in prejudice_time_count.most_common(10):
    print(val, key, sep="\t")  
    
print("\nMOST COMMON WORK OF ARTS")        
for key, val in prejudice_woa_count.most_common(10):
    print(val, key, sep="\t")

MOST COMMON CARDINALS
7	CARDINAL: one
2	CARDINAL: five
2	CARDINAL: more than 1,700
2	CARDINAL: 240,255
2	CARDINAL: two
1	CARDINAL: Ten
1	CARDINAL: half
1	CARDINAL: roughly 50
1	CARDINAL: 3
1	CARDINAL: 6

MOST COMMON DATES
4	DATE: Tuesday
4	DATE: May
3	DATE: 2013
3	DATE: Days later
2	DATE: Oct. 1, 2009
2	DATE: fiscal year 2016
2	DATE: Friday
1	DATE: last Friday
1	DATE: 13 years
1	DATE: this past week

MOST COMMON EVENTS
1	EVENT: Occupation
1	EVENT: the Declaration of Religious Freedom
1	EVENT: the World Economic Forum

MOST COMMON FACILITIES
1	FAC: Vatican II

MOST COMMON COUNTRIES OR CITIES
22	GPE: Iran
13	GPE: U.S.
10	GPE: Syria
8	GPE: the United States
6	GPE: Iraq
6	GPE: UK
5	GPE: US
4	GPE: Khashoggi
3	GPE: Madagascar
3	GPE: Switzerland
2	GPE: America
2	GPE: IRGC
2	GPE: Damascus
2	GPE: Kirkuk
2	GPE: North Korea

MOST COMMON LAWS
1	LAW: Constitution

MOST COMMON LOCATIONS
4	LOC: Africa
4	LOC: Europe
2	LOC: Western Europe
1	LOC: Straits
1	LOC: the Middle East
1	LOC: Central Europe
1	LO

### Exaggeration-Minimisation

This first section will focus on the extraction of the information described above focusing on the Exaggeration-Minimisation class.

In [271]:
total_entities_dict_ex_min = {key: list(g) for key, g in groupby(sorted(total_ex_min_text.ents, key=lambda x: x.label_), lambda x: x.label_)}
print("ALL ENTITIES")
print("All categories extracted: ", total_entities_dict_ex_min.keys())
print("Number of categories extracted: ", len(total_entities_dict_ex_min))

total_entities_value_list_ex_min = list()
for i in total_entities_dict_ex_min.values():
    total_entities_value_list_ex_min.append(i)
    
total_entities_ex_min= len(sum(total_entities_value_list_ex_min, []))
print("Total number of entities: ", total_entities_ex_min)

unique_entities_dict_ex_min = {key: list(set(map(lambda x: str(x), g))) for key, g in groupby(sorted(total_ex_min_text.ents, key=lambda x: x.label_), lambda x: x.label_)}

unique_entities_value_list_ex_min = list()
for i in unique_entities_dict_ex_min.values():
    unique_entities_value_list_ex_min.append(i)
    
unique_entities_ex_min= len(sum(unique_entities_value_list_ex_min, []))
print("Total number of unique entities: ", unique_entities_ex_min)
print("Percentage of unique entities: ", (unique_entities_ex_min/total_entities_ex_min)*100)

ALL ENTITIES
All categories extracted:  dict_keys(['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LAW', 'LOC', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART'])
Number of categories extracted:  16
Total number of entities:  429
Total number of unique entities:  299
Percentage of unique entities:  69.6969696969697


In [272]:
# Counting the total occurrences for each label
total_cardinal_ex_min = len(total_entities_dict_ex_min['CARDINAL'])
total_date_ex_min = len(total_entities_dict_ex_min['DATE'])
total_event_ex_min = len(total_entities_dict_ex_min['EVENT'])
total_fac_ex_min = len(total_entities_dict_ex_min['FAC'])
total_gpe_ex_min = len(total_entities_dict_ex_min['GPE'])
total_law_ex_min = len(total_entities_dict_ex_min['LAW'])
total_loc_ex_min = len(total_entities_dict_ex_min['LOC'])
total_norp_ex_min = len(total_entities_dict_ex_min['NORP'])
total_ordinal_ex_min = len(total_entities_dict_ex_min['ORDINAL'])
total_org_ex_min = len(total_entities_dict_ex_min['ORG'])
total_percent_ex_min = len(total_entities_dict_ex_min['PERCENT'])
total_person_ex_min = len(total_entities_dict_ex_min['PERSON'])
total_product_ex_min = len(total_entities_dict_ex_min['PRODUCT'])
total_quantity_ex_min = len(total_entities_dict_ex_min['QUANTITY'])
total_time_ex_min = len(total_entities_dict_ex_min['TIME'])
total_woa_ex_min = len(total_entities_dict_ex_min['WORK_OF_ART'])

In [273]:
# Counting the unique occurrences for each label
unique_cardinal_ex_min = len(unique_entities_dict_ex_min['CARDINAL'])
unique_date_ex_min = len(unique_entities_dict_ex_min['DATE'])
unique_event_ex_min = len(unique_entities_dict_ex_min['EVENT'])
unique_fac_ex_min = len(unique_entities_dict_ex_min['FAC'])
unique_gpe_ex_min = len(unique_entities_dict_ex_min['GPE'])
unique_law_ex_min = len(unique_entities_dict_ex_min['LAW'])
unique_loc_ex_min = len(unique_entities_dict_ex_min['LOC'])
unique_norp_ex_min = len(unique_entities_dict_ex_min['NORP'])
unique_ordinal_ex_min = len(unique_entities_dict_ex_min['ORDINAL'])
unique_org_ex_min = len(unique_entities_dict_ex_min['ORG'])
unique_percent_ex_min = len(unique_entities_dict_ex_min['PERCENT'])
unique_person_ex_min = len(unique_entities_dict_ex_min['PERSON'])
unique_product_ex_min = len(unique_entities_dict_ex_min['PRODUCT'])
unique_quantity_ex_min = len(unique_entities_dict_ex_min['QUANTITY'])
unique_time_ex_min = len(unique_entities_dict_ex_min['TIME'])
unique_woa_ex_min = len(unique_entities_dict_ex_min['WORK_OF_ART'])

In [274]:
print("CARDINAL")
print("Total of extracted cardinal numbers: ", total_cardinal_ex_min)
print("Percentage of extracted cardinal numbers: ", (total_cardinal_ex_min/total_entities_ex_min)*100)
print("Total of unique cardinal numbers: ", unique_cardinal_ex_min)
print("Percentage of unique cardinal numbers: ", (unique_cardinal_ex_min/total_cardinal_ex_min)*100)
print("\nDATE")
print("Total of extracted dates: ", total_date_ex_min)
print("Percentage of extracted dates: ", (total_date_ex_min/total_entities_ex_min)*100)
print("Total of unique dates: ", unique_date_ex_min)
print("Percentage of unique dates: ", (unique_date_ex_min/total_date_ex_min)*100)
print("\nEVENT")
print("Total of extracted events: ", total_event_ex_min)
print("Percentage of extracted events: ", (total_event_ex_min/total_entities_ex_min)*100)
print("Total of unique events: ", unique_event_ex_min)
print("Percentage of unique events: ", (unique_event_ex_min/total_event_ex_min)*100)
print("\nFAC")
print("Total of extracted facilities: ", total_fac_ex_min)
print("Percentage of extracted facilities: ", (total_fac_ex_min/total_entities_ex_min)*100)
print("Total of unique facilities: ", unique_fac_ex_min)
print("Percentage of unique facilities: ", (unique_fac_ex_min/total_fac_ex_min)*100)
print("\nGPE")
print("Total of extracted countries or cities: ", total_gpe_ex_min)
print("Percentage of extracted countries or cities: ", (total_gpe_ex_min/total_entities_ex_min)*100)
print("Total of unique countries or cities: ", unique_gpe_ex_min)
print("Percentage of unique countries or cities: ", (unique_gpe_ex_min/total_gpe_ex_min)*100)
print("\nLAW")
print("Total of extracted laws: ", total_law_ex_min)
print("Percentage of extracted laws: ", (total_law_ex_min/total_entities_ex_min)*100)
print("Total of unique laws: ", unique_law_ex_min)
print("Percentage of unique laws: ", (unique_law_ex_min/total_law_ex_min)*100)
print("\nLOC")
print("Total of extracted generic locations: ", total_loc_ex_min)
print("Percentage of extracted generic locations: ", (total_loc_ex_min/total_entities_ex_min)*100)
print("Total of unique generic locations: ", unique_loc_ex_min)
print("Percentage of unique generic locations: ", (unique_loc_ex_min/total_loc_ex_min)*100)
print("\nNORP")
print("Total of extracted nationalities, religious or political groups: ", total_norp_ex_min)
print("Percentage of extracted nationalities, religious or political groups: ", (total_norp_ex_min/total_entities_ex_min)*100)
print("Total of unique nationalities, religious or political groups: ", unique_norp_ex_min)
print("Percentage of unique nationalities, religious or political groups: ", (unique_norp_ex_min/total_norp_ex_min)*100)
print("\nORDINAL")
print("Total of extracted ordinal numbers: ", total_ordinal_ex_min)
print("Percentage extracted ordinal numbers: ", (total_ordinal_loaded/total_entities_ex_min)*100)
print("Total of unique ordinal numbers: ", unique_ordinal_ex_min)
print("Percentage of unique ordinal numbers: ", (unique_ordinal_ex_min/total_ordinal_ex_min)*100)
print("\nORG")
print("Total of extracted companies or organizations: ", total_org_ex_min)
print("Percentage extracted companies or organizations: ", (total_org_ex_min/total_entities_ex_min)*100)
print("Total of unique companies or organizations: ", unique_org_ex_min)
print("Percentage of unique companies or organizations: ", (unique_org_ex_min/total_org_ex_min)*100)
print("\nPERCENT")
print("Total of extracted percentages: ", total_percent_ex_min)
print("Percentage extracted percentages: ", (total_percent_ex_min/total_entities_ex_min)*100)
print("Total of unique percentages: ", unique_percent_ex_min)
print("Percentage of unique percentages: ", (unique_percent_ex_min/total_percent_ex_min)*100)
print("\nPERSON")
print("Total of extracted people: ", total_person_ex_min)
print("Percentage extracted people: ", (total_person_ex_min/total_entities_ex_min)*100)
print("Total of unique people: ", unique_person_ex_min)
print("Percentage of unique people: ", (unique_person_ex_min/total_person_ex_min)*100)
print("\nPRODUCT")
print("Total of extracted products: ", total_product_ex_min)
print("Percentage extracted products: ", (total_product_ex_min/total_entities_ex_min)*100)
print("Total of unique products: ", unique_product_ex_min)
print("Percentage of unique products: ", (unique_product_ex_min/total_product_ex_min)*100)
print("\nQUANTITY")
print("Total of extracted measurements: ", total_quantity_ex_min)
print("Percentage extracted measurements: ", (total_quantity_ex_min/total_entities_ex_min)*100)
print("Total of unique measurements: ", unique_quantity_ex_min)
print("Percentage of unique measurements: ", (unique_quantity_ex_min/total_quantity_ex_min)*100)
print("\nTIME")
print("Total of extracted times: ", total_time_ex_min)
print("Percentage of extracted times: ", (total_time_ex_min/total_entities_ex_min)*100)
print("Total of unique times: ", unique_time_ex_min)
print("Percentage of unique times: ", (unique_time_ex_min/total_time_ex_min)*100)
print("\nWORK OF ART")
print("Total of extracted works of art: ", total_woa_ex_min)
print("Percentage of extracted works of art: ", (total_woa_ex_min/total_entities_ex_min)*100)
print("Total of unique works of art: ", unique_woa_ex_min)
print("Percentage of unique works of art: ", (unique_woa_ex_min/total_woa_ex_min)*100)

CARDINAL
Total of extracted cardinal numbers:  22
Percentage of extracted cardinal numbers:  5.128205128205128
Total of unique cardinal numbers:  13
Percentage of unique cardinal numbers:  59.09090909090909

DATE
Total of extracted dates:  50
Percentage of extracted dates:  11.655011655011654
Total of unique dates:  43
Percentage of unique dates:  86.0

EVENT
Total of extracted events:  4
Percentage of extracted events:  0.9324009324009324
Total of unique events:  4
Percentage of unique events:  100.0

FAC
Total of extracted facilities:  3
Percentage of extracted facilities:  0.6993006993006993
Total of unique facilities:  3
Percentage of unique facilities:  100.0

GPE
Total of extracted countries or cities:  83
Percentage of extracted countries or cities:  19.34731934731935
Total of unique countries or cities:  45
Percentage of unique countries or cities:  54.21686746987952

LAW
Total of extracted laws:  2
Percentage of extracted laws:  0.4662004662004662
Total of unique laws:  2
Perc

In [275]:
ex_min_ents_count = Counter()

for ent in total_ex_min_text.ents:
    ex_min_ents_count[f"{ent.label_}: {ent.text}"] += 1

print("MOST COMMON ENTITIES")
for key, val in ex_min_ents_count.most_common(30):
    print(val, key, sep="\t")

MOST COMMON ENTITIES
15	GPE: U.S.
9	CARDINAL: one
7	ORG: Guardian
7	PERSON: Trump
7	GPE: Iran
6	ORG: Trump
6	ORG: Assange
6	PERSON: Patel
5	GPE: the United States
5	NORP: American
4	ORDINAL: first
4	GPE: US
4	ORG: CIA
3	GPE: Vietnam
3	ORG: Congress
3	PERSON: Awan
3	GPE: Spain
3	PERSON: Obama
3	ORG: Julian Assange
3	LOC: the Gulf of Tonkin
3	GPE: Russia
3	PERCENT: 100%
3	ORG: Patel
2	ORG: Manafort
2	ORG: The Daily Caller
2	PERSON: Farrakhan
2	GPE: Obama
2	PERSON: Oswald
2	NORP: Marines
2	CARDINAL: millions


In [276]:
ex_min_cardinal_count = Counter()
ex_min_date_count = Counter()
ex_min_event_count = Counter()
ex_min_fac_count = Counter()
ex_min_gpe_count = Counter()
ex_min_law_count = Counter()
ex_min_loc_count = Counter()
ex_min_norp_count = Counter()
ex_min_ordinal_count = Counter()
ex_min_org_count = Counter()
ex_min_percent_count = Counter()
ex_min_person_count = Counter()
ex_min_product_count = Counter()
ex_min_quantity_count = Counter()
ex_min_time_count = Counter()
ex_min_woa_count = Counter()

for ent in total_ex_min_text.ents:
    if (ent.label_ == "CARDINAL"):
        ex_min_cardinal_count[f"{ent.label_}: {ent.text}"] += 1
    if (ent.label_ == "DATE"):
        ex_min_date_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "EVENT"):
        ex_min_event_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "FAC"):
        ex_min_fac_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "GPE"):
        ex_min_gpe_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "LAW"):
        ex_min_law_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "LOC"):
        ex_min_loc_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "NORP"):
        ex_min_norp_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "ORDINAL"):
        ex_min_ordinal_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "ORG"):
        ex_min_org_count[f"{ent.label_}: {ent.text}"] += 1
    if (ent.label_ == "PERCENT"):
        ex_min_percent_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "PERSON"):
        ex_min_person_count[f"{ent.label_}: {ent.text}"] += 1
    if (ent.label_ == "PRODUCT"):
        ex_min_product_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "QUANTITY"):
        ex_min_quantity_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "TIME"):
        ex_min_time_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "WORK_OF_ART"):
        ex_min_woa_count[f"{ent.label_}: {ent.text}"] += 1 

In [277]:
print("MOST COMMON CARDINALS")        
for key, val in ex_min_cardinal_count.most_common(10):
    print(val, key, sep="\t")
    
print("\nMOST COMMON DATES")        
for key, val in ex_min_date_count.most_common(10):
    print(val, key, sep="\t")

print("\nMOST COMMON EVENTS")        
for key, val in ex_min_event_count.most_common(10):
    print(val, key, sep="\t")  
    
print("\nMOST COMMON FACILITIES")        
for key, val in ex_min_fac_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON COUNTRIES OR CITIES")        
for key, val in ex_min_gpe_count.most_common(15):
    print(val, key, sep="\t")    

print("\nMOST COMMON LAWS")        
for key, val in ex_min_law_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON LOCATIONS")        
for key, val in ex_min_loc_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON NATIONALITIES OR GROUPS")        
for key, val in ex_min_norp_count.most_common(20):
    print(val, key, sep="\t")
    
print("\nMOST COMMON ORDINALS")        
for key, val in ex_min_ordinal_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON ORGANIZATIONS")        
for key, val in ex_min_org_count.most_common(20):
    print(val, key, sep="\t")  

print("\nMOST COMMON PERCENTAGES")        
for key, val in ex_min_percent_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON PEOPLE")        
for key, val in ex_min_person_count.most_common(20):
    print(val, key, sep="\t")
    
print("\nMOST COMMON PRODUCTS")        
for key, val in ex_min_product_count.most_common(10):
    print(val, key, sep="\t")
    
print("\nMOST COMMON QUANTITIES")        
for key, val in ex_min_quantity_count.most_common(10):
    print(val, key, sep="\t")
    
print("\nMOST COMMON TIMES")        
for key, val in ex_min_time_count.most_common(10):
    print(val, key, sep="\t")  
    
print("\nMOST COMMON WORK OF ARTS")        
for key, val in ex_min_woa_count.most_common(10):
    print(val, key, sep="\t")

MOST COMMON CARDINALS
9	CARDINAL: one
2	CARDINAL: millions
1	CARDINAL: 2006.Needless
1	CARDINAL: five
1	CARDINAL: seven
1	CARDINAL: two
1	CARDINAL: two-million-plus
1	CARDINAL: three
1	CARDINAL: 3
1	CARDINAL: tens of thousands

MOST COMMON DATES
2	DATE: Friday
2	DATE: Wednesday
2	DATE: July 2015
2	DATE: 2010
2	DATE: 2015
2	DATE: Saturday
2	DATE: 69
1	DATE: days
1	DATE: weeks
1	DATE: 50-year-old

MOST COMMON EVENTS
1	EVENT: the Korean War
1	EVENT: New Year's
1	EVENT: the Kuiper Belt
1	EVENT: the Cold War

MOST COMMON FACILITIES
1	FAC: Tower Hamlets
1	FAC: Vatican II
1	FAC: Epstein

MOST COMMON COUNTRIES OR CITIES
15	GPE: U.S.
7	GPE: Iran
5	GPE: the United States
4	GPE: US
3	GPE: Vietnam
3	GPE: Spain
3	GPE: Russia
2	GPE: Obama
2	GPE: Arizona
2	GPE: Brazil
2	GPE: Germany
2	GPE: New York City
1	GPE: Florida
1	GPE: Kirkuk
1	GPE: Korea

MOST COMMON LAWS
1	LAW: First Amendment
1	LAW: the First Amendment

MOST COMMON LOCATIONS
3	LOC: the Gulf of Tonkin
2	LOC: Europe
1	LOC: Barros
1	LOC: Earth


### Flag_Waving

This first section will focus on the extraction of the information described above focusing on the Flag_Waving class.

In [278]:
total_entities_dict_waving = {key: list(g) for key, g in groupby(sorted(total_waving_text.ents, key=lambda x: x.label_), lambda x: x.label_)}
print("ALL ENTITIES")
print("All categories extracted: ", total_entities_dict_waving.keys())
print("Number of categories extracted: ", len(total_entities_dict_waving))

total_entities_value_list_waving = list()
for i in total_entities_dict_waving.values():
    total_entities_value_list_waving.append(i)
    
total_entities_waving= len(sum(total_entities_value_list_waving, []))
print("Total number of entities: ", total_entities_waving)

unique_entities_dict_waving = {key: list(set(map(lambda x: str(x), g))) for key, g in groupby(sorted(total_waving_text.ents, key=lambda x: x.label_), lambda x: x.label_)}

unique_entities_value_list_waving = list()
for i in unique_entities_dict_waving.values():
    unique_entities_value_list_waving.append(i)
    
unique_entities_waving= len(sum(unique_entities_value_list_waving, []))
print("Total number of unique entities: ", unique_entities_waving)
print("Percentage of unique entities: ", (unique_entities_waving/total_entities_waving)*100)

ALL ENTITIES
All categories extracted:  dict_keys(['CARDINAL', 'DATE', 'GPE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'TIME', 'WORK_OF_ART'])
Number of categories extracted:  14
Total number of entities:  529
Total number of unique entities:  300
Percentage of unique entities:  56.71077504725898


In [279]:
# Counting the total occurrences for each label
total_cardinal_waving = len(total_entities_dict_waving['CARDINAL'])
total_date_waving = len(total_entities_dict_waving['DATE'])
total_gpe_waving = len(total_entities_dict_waving['GPE'])
total_law_waving = len(total_entities_dict_waving['LAW'])
total_loc_waving = len(total_entities_dict_waving['LOC'])
total_money_waving = len(total_entities_dict_waving['MONEY'])
total_norp_waving = len(total_entities_dict_waving['NORP'])
total_ordinal_waving = len(total_entities_dict_waving['ORDINAL'])
total_org_waving = len(total_entities_dict_waving['ORG'])
total_percent_waving = len(total_entities_dict_waving['PERCENT'])
total_person_waving = len(total_entities_dict_waving['PERSON'])
total_product_waving = len(total_entities_dict_waving['PRODUCT'])
total_time_waving = len(total_entities_dict_waving['TIME'])
total_woa_waving = len(total_entities_dict_waving['WORK_OF_ART'])

In [280]:
# Counting the unique occurrences for each label
unique_cardinal_waving = len(unique_entities_dict_waving['CARDINAL'])
unique_date_waving = len(unique_entities_dict_waving['DATE'])
unique_gpe_waving = len(unique_entities_dict_waving['GPE'])
unique_law_waving = len(unique_entities_dict_waving['LAW'])
unique_loc_waving = len(unique_entities_dict_waving['LOC'])
unique_money_waving = len(unique_entities_dict_waving['MONEY'])
unique_norp_waving = len(unique_entities_dict_waving['NORP'])
unique_ordinal_waving = len(unique_entities_dict_waving['ORDINAL'])
unique_org_waving = len(unique_entities_dict_waving['ORG'])
unique_percent_waving = len(unique_entities_dict_waving['PERCENT'])
unique_person_waving = len(unique_entities_dict_waving['PERSON'])
unique_product_waving = len(unique_entities_dict_waving['PRODUCT'])
unique_time_waving = len(unique_entities_dict_waving['TIME'])
unique_woa_waving = len(unique_entities_dict_waving['WORK_OF_ART'])

In [281]:
print("CARDINAL")
print("Total of extracted cardinal numbers: ", total_cardinal_waving)
print("Percentage of extracted cardinal numbers: ", (total_cardinal_waving/total_entities_waving)*100)
print("Total of unique cardinal numbers: ", unique_cardinal_waving)
print("Percentage of unique cardinal numbers: ", (unique_cardinal_waving/total_cardinal_waving)*100)
print("\nDATE")
print("Total of extracted dates: ", total_date_waving)
print("Percentage of extracted dates: ", (total_date_waving/total_entities_waving)*100)
print("Total of unique dates: ", unique_date_waving)
print("Percentage of unique dates: ", (unique_date_waving/total_date_waving)*100)
print("\nGPE")
print("Total of extracted countries or cities: ", total_gpe_waving)
print("Percentage of extracted countries or cities: ", (total_gpe_waving/total_entities_waving)*100)
print("Total of unique countries or cities: ", unique_gpe_waving)
print("Percentage of unique countries or cities: ", (unique_gpe_waving/total_gpe_waving)*100)
print("\nLAW")
print("Total of extracted laws: ", total_law_waving)
print("Percentage of extracted laws: ", (total_law_waving/total_entities_waving)*100)
print("Total of unique laws: ", unique_law_waving)
print("Percentage of unique laws: ", (unique_law_waving/total_law_waving)*100)
print("\nLOC")
print("Total of extracted generic locations: ", total_loc_waving)
print("Percentage of extracted generic locations: ", (total_loc_waving/total_entities_waving)*100)
print("Total of unique generic locations: ", unique_loc_waving)
print("Percentage of unique generic locations: ", (unique_loc_waving/total_loc_waving)*100)
print("\nMONEY")
print("Total of extracted money values: ", total_money_waving)
print("Percentage of extracted money values: ", (total_money_waving/total_entities_waving)*100)
print("Total of unique money values: ", unique_money_waving)
print("Percentage of unique money values: ", (unique_money_waving/total_money_waving)*100)
print("\nNORP")
print("Total of extracted nationalities, religious or political groups: ", total_norp_waving)
print("Percentage of extracted nationalities, religious or political groups: ", (total_norp_waving/total_entities_waving)*100)
print("Total of unique nationalities, religious or political groups: ", unique_norp_waving)
print("Percentage of unique nationalities, religious or political groups: ", (unique_norp_waving/total_norp_waving)*100)
print("\nORDINAL")
print("Total of extracted ordinal numbers: ", total_ordinal_waving)
print("Percentage extracted ordinal numbers: ", (total_ordinal_loaded/total_entities_waving)*100)
print("Total of unique ordinal numbers: ", unique_ordinal_waving)
print("Percentage of unique ordinal numbers: ", (unique_ordinal_waving/total_ordinal_waving)*100)
print("\nORG")
print("Total of extracted companies or organizations: ", total_org_waving)
print("Percentage extracted companies or organizations: ", (total_org_waving/total_entities_waving)*100)
print("Total of unique companies or organizations: ", unique_org_waving)
print("Percentage of unique companies or organizations: ", (unique_org_waving/total_org_waving)*100)
print("\nPERCENT")
print("Total of extracted percentages: ", total_percent_waving)
print("Percentage extracted percentages: ", (total_percent_waving/total_entities_waving)*100)
print("Total of unique percentages: ", unique_percent_waving)
print("Percentage of unique percentages: ", (unique_percent_waving/total_percent_waving)*100)
print("\nPERSON")
print("Total of extracted people: ", total_person_waving)
print("Percentage extracted people: ", (total_person_waving/total_entities_waving)*100)
print("Total of unique people: ", unique_person_waving)
print("Percentage of unique people: ", (unique_person_waving/total_person_waving)*100)
print("\nPRODUCT")
print("Total of extracted products: ", total_product_waving)
print("Percentage extracted products: ", (total_product_waving/total_entities_waving)*100)
print("Total of unique products: ", unique_product_waving)
print("Percentage of unique products: ", (unique_product_waving/total_product_waving)*100)
print("\nTIME")
print("Total of extracted times: ", total_time_waving)
print("Percentage of extracted times: ", (total_time_waving/total_entities_waving)*100)
print("Total of unique times: ", unique_time_waving)
print("Percentage of unique times: ", (unique_time_waving/total_time_waving)*100)
print("\nWORK OF ART")
print("Total of extracted works of art: ", total_woa_waving)
print("Percentage of extracted works of art: ", (total_woa_waving/total_entities_waving)*100)
print("Total of unique works of art: ", unique_woa_waving)
print("Percentage of unique works of art: ", (unique_woa_waving/total_woa_waving)*100)

CARDINAL
Total of extracted cardinal numbers:  23
Percentage of extracted cardinal numbers:  4.3478260869565215
Total of unique cardinal numbers:  15
Percentage of unique cardinal numbers:  65.21739130434783

DATE
Total of extracted dates:  58
Percentage of extracted dates:  10.964083175803403
Total of unique dates:  47
Percentage of unique dates:  81.03448275862068

GPE
Total of extracted countries or cities:  105
Percentage of extracted countries or cities:  19.848771266540645
Total of unique countries or cities:  51
Percentage of unique countries or cities:  48.57142857142857

LAW
Total of extracted laws:  6
Percentage of extracted laws:  1.1342155009451798
Total of unique laws:  4
Percentage of unique laws:  66.66666666666666

LOC
Total of extracted generic locations:  18
Percentage of extracted generic locations:  3.402646502835539
Total of unique generic locations:  8
Percentage of unique generic locations:  44.44444444444444

MONEY
Total of extracted money values:  4
Percentage 

In [282]:
waving_ents_count = Counter()

for ent in total_waving_text.ents:
    waving_ents_count[f"{ent.label_}: {ent.text}"] += 1

print("MOST COMMON ENTITIES")
for key, val in waving_ents_count.most_common(30):
    print(val, key, sep="\t")

MOST COMMON ENTITIES
42	NORP: American
13	GPE: Iran
10	GPE: America
9	ORG: CIA
9	NORP: Americans
8	LOC: Europe
7	ORG: Congress
7	CARDINAL: two
7	PERSON: Habib Powell
6	GPE: US
5	ORG: Hezbollah
5	GPE: Syria
5	PERSON: Trump
5	GPE: U.S.
5	NORP: Christian
5	NORP: Democrats
4	GPE: the United States
4	ORG: Trump
4	NORP: Islamic
4	NORP: Russian
4	DATE: Sunday
4	ORDINAL: first
4	ORG: Orban
4	DATE: today
4	GPE: Russia
4	PERSON: Barr
4	ORG: CNN
3	GPE: Lebanon
3	DATE: 1953
3	NORP: Muslim


In [283]:
waving_cardinal_count = Counter()
waving_date_count = Counter()
waving_gpe_count = Counter()
waving_law_count = Counter()
waving_loc_count = Counter()
waving_money_count = Counter()
waving_norp_count = Counter()
waving_ordinal_count = Counter()
waving_org_count = Counter()
waving_percent_count = Counter()
waving_person_count = Counter()
waving_product_count = Counter()
waving_time_count = Counter()
waving_woa_count = Counter()

for ent in total_waving_text.ents:
    if (ent.label_ == "CARDINAL"):
        waving_cardinal_count[f"{ent.label_}: {ent.text}"] += 1
    if (ent.label_ == "DATE"):
        waving_date_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "GPE"):
        waving_gpe_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "LAW"):
        waving_law_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "LOC"):
        waving_loc_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "MONEY"):
        waving_money_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "NORP"):
        waving_norp_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "ORDINAL"):
        waving_ordinal_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "ORG"):
        waving_org_count[f"{ent.label_}: {ent.text}"] += 1
    if (ent.label_ == "PERCENT"):
        waving_percent_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "PERSON"):
        waving_person_count[f"{ent.label_}: {ent.text}"] += 1
    if (ent.label_ == "PRODUCT"):
        waving_product_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "TIME"):
        waving_time_count[f"{ent.label_}: {ent.text}"] += 1 
    if (ent.label_ == "WORK_OF_ART"):
        waving_woa_count[f"{ent.label_}: {ent.text}"] += 1 

In [284]:
print("MOST COMMON CARDINALS")        
for key, val in waving_cardinal_count.most_common(10):
    print(val, key, sep="\t")
    
print("\nMOST COMMON DATES")        
for key, val in waving_date_count.most_common(10):
    print(val, key, sep="\t") 

print("\nMOST COMMON COUNTRIES OR CITIES")        
for key, val in waving_gpe_count.most_common(15):
    print(val, key, sep="\t") 

print("\nMOST COMMON LAWS")        
for key, val in waving_law_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON LOCATIONS")        
for key, val in waving_loc_count.most_common(10):
    print(val, key, sep="\t")

print("\nMOST COMMON MONEY VALUES")        
for key, val in waving_money_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON NATIONALITIES OR GROUPS")        
for key, val in waving_norp_count.most_common(20):
    print(val, key, sep="\t")
    
print("\nMOST COMMON ORDINALS")        
for key, val in waving_ordinal_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON ORGANIZATIONS")        
for key, val in waving_org_count.most_common(20):
    print(val, key, sep="\t")  

print("\nMOST COMMON PERCENTAGES")        
for key, val in waving_percent_count.most_common(10):
    print(val, key, sep="\t")  

print("\nMOST COMMON PEOPLE")        
for key, val in waving_person_count.most_common(20):
    print(val, key, sep="\t")
    
print("\nMOST COMMON PRODUCTS")        
for key, val in waving_product_count.most_common(10):
    print(val, key, sep="\t")

print("\nMOST COMMON TIMES")        
for key, val in waving_time_count.most_common(10):
    print(val, key, sep="\t")  
    
print("\nMOST COMMON WORK OF ARTS")        
for key, val in waving_woa_count.most_common(10):
    print(val, key, sep="\t")

MOST COMMON CARDINALS
7	CARDINAL: two
2	CARDINAL: One
2	CARDINAL: three
1	CARDINAL: only one
1	CARDINAL: 5x
1	CARDINAL: 2017For
1	CARDINAL: well as thousands
1	CARDINAL: Two-thirds
1	CARDINAL: 0.7
1	CARDINAL: 0.1

MOST COMMON DATES
4	DATE: Sunday
4	DATE: today
3	DATE: 1953
2	DATE: February 11, 2018
2	DATE: 2015
2	DATE: the coming days
1	DATE: so many years
1	DATE: more than 50 years
1	DATE: November 1963
1	DATE: 1954

MOST COMMON COUNTRIES OR CITIES
13	GPE: Iran
10	GPE: America
6	GPE: US
5	GPE: Syria
5	GPE: U.S.
4	GPE: the United States
4	GPE: Russia
3	GPE: Lebanon
3	GPE: Israel
3	GPE: Florida
3	GPE: Broward County
2	GPE: UK
2	GPE: Brussels
2	GPE: Hungary
2	GPE: West Virginia

MOST COMMON LAWS
3	LAW: Constitution
1	LAW: Second Amendment
1	LAW: the US Constitution
1	LAW: First Amendment rights

MOST COMMON LOCATIONS
8	LOC: Europe
2	LOC: the Middle East
2	LOC: Africa
2	LOC: West
1	LOC: East
1	LOC: the Middle East Media Research Institute
1	LOC: Sunna
1	LOC: Central America

MOST COMMON M

-----------------------------------------------------------------------------------------------------------

# Sentiment Analysis

For each paragraph belonging to a particular persuasion technique, it was decided to calculate the mean of the positive and negative sentiment for each paragraph in order to provide a general overview of the sentiment associated with that particular persuasion technique. 

This first process will involve all the persuasion techiniques combined and label-specific analysis will follow.

In [285]:
doc_total_lista = list()
for i in X_total_lista:
    doc_total_lista.append(nlp(i))

In [286]:
#Load spacy sentiment analysis
nlp_sa = eng_spacysentiment.load()

# Perform spacy sentiment analysis for each paragraph.
doc_total_sentiment = list()
for i in doc_total_lista:
    doc_total_sentiment.append(nlp_sa(i))
    
# Create an empty list to which I append, for each paragraph, a dictionary containing the positive and negative percentage relative to the paragraph
doc_total_sentiment_cats = list()    
for i in doc_total_sentiment:
    doc_total_sentiment_cats.append(i.cats)

# Create and print the average of the positive and negative evaluations for all the sentences of the dataset  
total_positive = (sum(d['positive'] for d in doc_total_sentiment_cats)/len(doc_total_sentiment_cats))
total_negative = (sum(d['negative'] for d in doc_total_sentiment_cats)/len(doc_total_sentiment_cats))

print(len(doc_total_sentiment_cats))
print("Positive sentiment: " + str(total_positive))
print("Negative sentiment: " + str(total_negative))

1878
Positive sentiment: 0.6621691549101224
Negative sentiment: 0.3378308451552449


### Loaded_Language

This section will focus on calculate the mean of the positive and negative sentiment for each paragraph labled as Loaded_Language.

In [287]:
doc_loaded_language = list()
for i in X_loaded_language:
    doc_loaded_language.append(nlp(i))

In [288]:
# perform spacy sentiment analysis for each paragraph
doc_loaded_language_sentiment = list()
for i in doc_loaded_language:
    doc_loaded_language_sentiment.append(nlp_sa(i))
    
# create an empty list to which I append, for each paragraph, a dictionary containing the positive and negative percentage relative to the paragraph
doc_loaded_language_cats = list()    
for i in doc_loaded_language_sentiment:
    doc_loaded_language_cats.append(i.cats)

# create and print the average of the positive and negative evaluations for all the sentences of the dataset 
loaded_language_positive = (sum(d['positive'] for d in doc_loaded_language_cats)/len(doc_loaded_language_cats))
loaded_language_negative = (sum(d['negative'] for d in doc_loaded_language_cats)/len(doc_loaded_language_cats))

print(len(doc_loaded_language_cats))
print("Positive sentiment: " + str(loaded_language_positive))
print("Negative sentiment: " + str(loaded_language_negative))

806
Positive sentiment: 0.6547903497961741
Negative sentiment: 0.3452096506258558


### Name_Calling-Labeling

This section will focus on calculate the mean of the positive and negative sentiment for each paragraph labled as Name_Calling-Labeling.

In [289]:
doc_name_calling = list()
for i in X_name_calling:
    doc_name_calling.append(nlp(i))

In [290]:
# Perform spacy sentiment analysis for each paragraph.
doc_name_calling_sentiment = list()
for i in doc_name_calling:
    doc_name_calling_sentiment.append(nlp_sa(i))
    
# Create an empty list to which I append, for each paragraph, a dictionary containing the positive and negative percentage relative to the paragraph
doc_name_calling_cats = list()    
for i in doc_name_calling_sentiment:
    doc_name_calling_cats.append(i.cats)

# Create and print the average of the positive and negative evaluations for all the sentences of the dataset 
name_calling_positive = (sum(d['positive'] for d in doc_name_calling_cats)/len(doc_name_calling_cats))
name_calling_negative = (sum(d['negative'] for d in doc_name_calling_cats)/len(doc_name_calling_cats))

print(len(doc_name_calling_cats))
print("Positive sentiment: " + str(name_calling_positive))
print("Negative sentiment: " + str(name_calling_negative))

318
Positive sentiment: 0.7093578578388825
Negative sentiment: 0.29064214305991704


### Repetition

This section will focus on calculate the mean of the positive and negative sentiment for each paragraph labled as Repetition.

In [291]:
doc_repetition = list()
for i in X_repetition:
    doc_repetition.append(nlp(i))

In [292]:
# Perform spacy sentiment analysis for each paragraph.
doc_repetition_sentiment = list()
for i in doc_repetition:
    doc_repetition_sentiment.append(nlp_sa(i))
    
# Create an empty list to which I append, for each paragraph, a dictionary containing the positive and negative percentage relative to the paragraph
doc_repetition_cats = list()    
for i in doc_repetition_sentiment:
    doc_repetition_cats.append(i.cats)

# Create and print the average of the positive and negative evaluations for all the sentences of the dataset  
repetition_positive = (sum(d['positive'] for d in doc_repetition_cats)/len(doc_repetition_cats))
repetition_negative = (sum(d['negative'] for d in doc_repetition_cats)/len(doc_repetition_cats))

print(len(doc_repetition_cats))
print("Positive sentiment: " + str(repetition_positive))
print("Negative sentiment: " + str(repetition_negative))

218
Positive sentiment: 0.6673896217112268
Negative sentiment: 0.33261037515658576


### Doubt

This section will focus on calculate the mean of the positive and negative sentiment for each paragraph labled as Doubt.

In [293]:
doc_doubt = list()
for i in X_doubt:
    doc_doubt.append(nlp(i))

In [294]:
# Perform spacy sentiment analysis for each paragraph.
doc_doubt_sentiment = list()
for i in doc_doubt:
    doc_doubt_sentiment.append(nlp_sa(i))
    
# Create an empty list to which I append, for each paragraph, a dictionary containing the positive and negative percentage relative to the paragraph
doc_doubt_cats = list()    
for i in doc_doubt_sentiment:
    doc_doubt_cats.append(i.cats)

# Create and print the average of the positive and negative evaluations for all the sentences of the dataset  
doubt_positive = (sum(d['positive'] for d in doc_doubt_cats)/len(doc_doubt_cats))
doubt_negative = (sum(d['negative'] for d in doc_doubt_cats)/len(doc_doubt_cats))

print(len(doc_doubt_cats))
print("Positive sentiment: " + str(doubt_positive))
print("Negative sentiment: " + str(doubt_negative))

210
Positive sentiment: 0.6209130356875026
Negative sentiment: 0.37908696565167926


### Appeal_to_Fear-Prejudice

This section will focus on calculate the mean of the positive and negative sentiment for each paragraph labled as Appeal_to_Fear-Prejudice.

In [295]:
doc_prejudice = list()
for i in X_prejudice:
    doc_prejudice.append(nlp(i))

In [296]:
# Perform spacy sentiment analysis for each paragraph.
doc_prejudice_sentiment = list()
for i in doc_prejudice:
    doc_prejudice_sentiment.append(nlp_sa(i))
    
# Create an empty list to which I append, for each paragraph, a dictionary containing the positive and negative percentage relative to the paragraph
doc_prejudice_cats = list()    
for i in doc_prejudice_sentiment:
    doc_prejudice_cats.append(i.cats)

# Create and print the average of the positive and negative evaluations for all the sentences of the dataset
prejudice_positive = (sum(d['positive'] for d in doc_prejudice_cats)/len(doc_prejudice_cats))
prejudice_negative = (sum(d['negative'] for d in doc_prejudice_cats)/len(doc_prejudice_cats))

print(len(doc_prejudice_cats))
print("Positive sentiment: " + str(prejudice_positive))
print("Negative sentiment: " + str(prejudice_negative))

122
Positive sentiment: 0.6366185784187831
Negative sentiment: 0.3633814182179653


### Exaggeration-Minimisation

This section will focus on calculate the mean of the positive and negative sentiment for each paragraph labled as Exaggeration-Minimisation.

In [297]:
doc_ex_min = list()
for i in X_ex_min:
    doc_ex_min.append(nlp(i))

In [298]:
# Perform spacy sentiment analysis for each paragraph.
doc_ex_min_sentiment = list()
for i in doc_ex_min:
    doc_ex_min_sentiment.append(nlp_sa(i))
    
# Create an empty list to which I append, for each paragraph, a dictionary containing the positive and negative percentage relative to the paragraph
doc_ex_min_cats = list()    
for i in doc_ex_min_sentiment:
    doc_ex_min_cats.append(i.cats)

# Create and print the average of the positive and negative evaluations for all the sentences of the dataset 
ex_min_positive = (sum(d['positive'] for d in doc_ex_min_cats)/len(doc_ex_min_cats))
ex_min_negative = (sum(d['negative'] for d in doc_ex_min_cats)/len(doc_ex_min_cats))

print(len(doc_ex_min_cats))
print("Positive sentiment: " + str(ex_min_positive))
print("Negative sentiment: " + str(ex_min_negative))

102
Positive sentiment: 0.6661935986263292
Negative sentiment: 0.3338064034628161


### Flag_Waving

This section will focus on calculate the mean of the positive and negative sentiment for each paragraph labled as Flag_Waving.

In [299]:
doc_flag_waving = list()
for i in X_flag_waving:
    doc_flag_waving.append(nlp(i))

In [300]:
# Perform spacy sentiment analysis for each paragraph
doc_flag_waving_sentiment = list()
for i in doc_flag_waving:
    doc_flag_waving_sentiment.append(nlp_sa(i))
    
# Create an empty list to which I append, for each paragraph, a dictionary containing the positive and negative percentage relative to the paragraph
doc_flag_waving_cats = list()    
for i in doc_flag_waving_sentiment:
    doc_flag_waving_cats.append(i.cats)

# Create and print the average of the positive and negative evaluations for all the sentences of the dataset   
flag_waving_positive = (sum(d['positive'] for d in doc_flag_waving_cats)/len(doc_flag_waving_cats))
flag_waving_negative = (sum(d['negative'] for d in doc_flag_waving_cats)/len(doc_flag_waving_cats))

print(len(doc_flag_waving_cats))
print("Positive sentiment: " + str(flag_waving_positive))
print("Negative sentiment: " + str(flag_waving_negative))

102
Positive sentiment: 0.6736761131208646
Negative sentiment: 0.326323887816365


## Results summary

In [301]:
print("TOTAL")
print("Positive sentiment: " + str(total_positive))
print("Negative sentiment: " + str(total_negative))
print("\nLOADED LANGUAGE")
print("Positive sentiment: " + str(loaded_language_positive))
print("Negative sentiment: " + str(loaded_language_negative))
print("\nNAME CALLING - LABELING")
print("Positive sentiment: " + str(name_calling_positive))
print("Negative sentiment: " + str(name_calling_negative))
print("\nREPETITION")
print("Positive sentiment: " + str(repetition_positive))
print("Negative sentiment: " + str(repetition_negative))
print("\nDOUBT")
print("Positive sentiment: " + str(doubt_positive))
print("Negative sentiment: " + str(doubt_negative))
print("\nPREJUDICE")
print("Positive sentiment: " + str(prejudice_positive))
print("Negative sentiment: " + str(prejudice_negative))
print("\nEXAGGERATION - MINIMISATION")
print("Positive sentiment: " + str(ex_min_positive))
print("Negative sentiment: " + str(ex_min_negative))
print("\nFLAG WAVING")
print("Positive sentiment: " + str(flag_waving_positive))
print("Negative sentiment: " + str(flag_waving_negative))

TOTAL
Positive sentiment: 0.6621691549101224
Negative sentiment: 0.3378308451552449

LOADED LANGUAGE
Positive sentiment: 0.6547903497961741
Negative sentiment: 0.3452096506258558

NAME CALLING - LABELING
Positive sentiment: 0.7093578578388825
Negative sentiment: 0.29064214305991704

REPETITION
Positive sentiment: 0.6673896217112268
Negative sentiment: 0.33261037515658576

DOUBT
Positive sentiment: 0.6209130356875026
Negative sentiment: 0.37908696565167926

PREJUDICE
Positive sentiment: 0.6366185784187831
Negative sentiment: 0.3633814182179653

EXAGGERATION - MINIMISATION
Positive sentiment: 0.6661935986263292
Negative sentiment: 0.3338064034628161

FLAG WAVING
Positive sentiment: 0.6736761131208646
Negative sentiment: 0.326323887816365
