# Final Tutorial Notebook
#### Emily Gong, Robert Morrison

In this notebook we will explore the transcripts from UN General Debates. Our goal is to understand what differences exist between speakers of different language backgrounds

In [1]:
import numpy as np
import pandas as pd

import re

import requests
import bs4

import nltk
import spacy
from spacy import displacy #display word entities
import en_core_web_sm #language model
from collections import Counter
nlp = en_core_web_sm.load() 

# Below are libraries for LDA using gensim which is provides less control 
from nltk.corpus import stopwords #stop words to be filited out
from gensim import models, corpora #Used for LDA topic modeling
from sklearn.metrics.pairwise import euclidean_distances 
import pyLDAvis.gensim #python library for interactive topic model visualization
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

#Below are libraries for LDA using sklearn


from sklearn.feature_extraction.text import TfidfVectorizer

# Introduction

The United Nations is an international organization of independent states to promote peace and international cooperation and security. Currently, there are 193 soverign states and they meet every year in regular session on the Tuesday of the third week in September. 

# Loading Data
## Raw Data

In [2]:
data = pd.read_csv("un-general-debates.csv")
data.head()

Unnamed: 0,session,year,country,text
0,44,1989,MDV,﻿It is indeed a pleasure for me and the member...
1,44,1989,FIN,"﻿\nMay I begin by congratulating you. Sir, on ..."
2,44,1989,NER,"﻿\nMr. President, it is a particular pleasure ..."
3,44,1989,URY,﻿\nDuring the debate at the fortieth session o...
4,44,1989,ZWE,﻿I should like at the outset to express my del...


In [3]:
print(len(data))

7507


In [4]:
countries = data["country"].unique()
print(countries)
print(len(countries))

['MDV' 'FIN' 'NER' 'URY' 'ZWE' 'PHL' 'SDN' 'RUS' 'CHN' 'ESP' 'SUR' 'ARG'
 'SLV' 'MYS' 'NPL' 'PRT' 'COL' 'BLR' 'MAR' 'LCA' 'EGY' 'MEX' 'BEL' 'BRN'
 'RWA' 'CAN' 'ALB' 'GRC' 'KNA' 'GUY' 'LBR' 'ATG' 'MOZ' 'JPN' 'YDYE' 'GAB'
 'BGD' 'SWE' 'TUR' 'TCD' 'SYR' 'CMR' 'JAM' 'LUX' 'ITA' 'AGO' 'CRI' 'CSK'
 'BFA' 'MNG' 'BHR' 'HTI' 'OMN' 'CIV' 'TGO' 'CYP' 'MUS' 'MMR' 'ARE' 'GTM'
 'GRD' 'LBY' 'LKA' 'TZA' 'SGP' 'NOR' 'LAO' 'ISL' 'AFG' 'CHL' 'DMA' 'UKR'
 'KEN' 'BLZ' 'FRA' 'MLI' 'VCT' 'VEN' 'MLT' 'GHA' 'GIN' 'GBR' 'ISR' 'YUG'
 'BRB' 'IRQ' 'HUN' 'AUT' 'POL' 'GNB' 'BWA' 'MRT' 'SWZ' 'DNK' 'DOM' 'MDG'
 'NIC' 'BDI' 'CUB' 'IRN' 'PAK' 'SEN' 'BGR' 'YEM' 'STP' 'NLD' 'VUT' 'BOL'
 'PNG' 'SLB' 'DEU' 'ROU' 'KHM' 'TUN' 'BRA' 'IND' 'IDN' 'AUS' 'COD' 'HND'
 'GNQ' 'FJI' 'IRL' 'DZA' 'USA' 'LSO' 'GMB' 'PER' 'DDR' 'THA' 'JOR' 'COG'
 'NGA' 'ECU' 'SAU' 'QAT' 'SYC' 'ETH' 'TTO' 'PRY' 'VNM' 'NZL' 'PAN' 'MWI'
 'DJI' 'BEN' 'SOM' 'ZMB' 'CPV' 'BHS' 'KWT' 'UGA' 'COM' 'ZAF' 'LBN' 'SLE'
 'KOR' 'BIH' 'TON' 'EU' 'HRV' 'NRU' 'TUV' 'NAM' 'S

In [5]:
years = data["year"].unique()
print(years)
print(len(years))
# 46 Examples of each country

[1989 1970 2013 1985 2008 1991 1986 2002 1975 1996 2012 1997 1978 1988
 2010 1984 1995 2009 1971 1976 1983 1979 1999 2005 1987 1982 1998 2003
 2004 1980 2014 2011 1974 2015 1993 1977 1981 2000 1992 1990 1973 1994
 1972 2006 2007 2001]
46


## Scraping

In [6]:
# Getting a decoding table and access to each countries Wikipedia page
url = "https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3"
r = requests.get(url)
soup = bs4.BeautifulSoup(r.text)

In [7]:
found = soup.find("div", class_="plainlist").find("ul")

In [8]:
lookup = {} # dict {country code : [Country Name, wikipedia link]}

for child in found.children:
    if isinstance(child, bs4.Tag):
        c_code = child.find("span").text
        link = child.find("a").get("href")
        country = child.find("a").text
        
        lookup[c_code] = [country, link]

lookup["CSK"] = ["Czechoslovakia", "/wiki/Czechoslovakia"]
lookup["YDYE"] = ["South Yemen", "/wiki/South_Yemen"]
lookup["YUG"] = ["Yugoslavia", "/wiki/Yugoslavia"]
lookup["DDR"] = ["East Germany", "/wiki/East_Germany"]
lookup["EU"] = ["European Union", "/wiki/European_Union"]

In [9]:
data["name"] = [lookup[code][0] for code in data["country"]]

# EDA

In [10]:
sample_text = data.loc[0]["text"]
sample_text = sample_text.replace(u'\ufeff', '')
print(sample_text[0:100])

It is indeed a pleasure for me and the members of my delegation to extend to Ambassador Garba our si


In [40]:
# nltk.word_tokenize(sample_text)
tokens = nltk.word_tokenize(sample_text.lower())
print(tokens[0:10])

['it', 'is', 'indeed', 'a', 'pleasure', 'for', 'me', 'and', 'the', 'members']


In [41]:
pos = nltk.pos_tag(tokens)
pos[:10]

[('it', 'PRP'),
 ('is', 'VBZ'),
 ('indeed', 'RB'),
 ('a', 'DT'),
 ('pleasure', 'NN'),
 ('for', 'IN'),
 ('me', 'PRP'),
 ('and', 'CC'),
 ('the', 'DT'),
 ('members', 'NNS')]

In [42]:
# Don't use this bruh
chunk = nltk.ne_chunk(pos)
chunk[:20]

[('it', 'PRP'),
 ('is', 'VBZ'),
 ('indeed', 'RB'),
 ('a', 'DT'),
 ('pleasure', 'NN'),
 ('for', 'IN'),
 ('me', 'PRP'),
 ('and', 'CC'),
 ('the', 'DT'),
 ('members', 'NNS'),
 ('of', 'IN'),
 ('my', 'PRP$'),
 ('delegation', 'NN'),
 ('to', 'TO'),
 ('extend', 'VB'),
 ('to', 'TO'),
 ('ambassador', 'VB'),
 ('garba', 'VB'),
 ('our', 'PRP$'),
 ('sincere', 'JJ')]

In [43]:
nlp = en_core_web_sm.load()
nlp = spacy.load('en_core_web_sm')
doc = nlp(sample_text)
print(len(doc.ents))

print([(X.text, X.label_) for X in doc.ents[:20]])

labels = [x.label_ for x in doc.ents]
Counter(labels)

items= [x.text for x in doc.ents]
Counter(items).most_common(3)

sentences = [x for x in doc.sents]
print(sentences[21])

displacy.render(nlp(str(sentences[21])), jupyter=True, style='ent')

142
[('the General Assembly', 'ORG'), ('Assembly', 'ORG'), ('the past year', 'DATE'), ('\n', 'GPE'), ('Dante Caputo', 'PERSON'), ('forty-third', 'CARDINAL'), ('the General Assembly', 'ORG'), ('\n', 'GPE'), ('As in previous years', 'TIME'), ('the United Nations', 'ORG'), ('the Charter of the United Nations', 'LAW'), ('Organization', 'ORG'), ('recent years', 'DATE'), ('Organization', 'ORG'), ('Today', 'DATE'), ('Recent years', 'DATE'), ('-Power', 'ORG'), ('\n', 'GPE'), ('Africa', 'LOC'), ('Namibia', 'GPE')]
The 1980s have witnessed one of the longest spells of growth for the industrialized countries, while the situation in the South, particularly in the least developed countries, continues to deteriorate.


In [44]:
vect = TfidfVectorizer()

tfidf = vect.fit_transform(data["text"])

In [45]:
tfidf

<7507x54892 sparse matrix of type '<class 'numpy.float64'>'
	with 6900203 stored elements in Compressed Sparse Row format>

In [46]:
# LDA Topic Modeling

In [47]:
num_topic = 2
stop_words = stopwords.words('english')
gensim_tokenized_data = []
# gensim needs the data to have removed stop words 
# Later: can create a method to produce clean data
gensim_tokenized_data.append([t.lower() for t in tokens if t not in stop_words and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)])
gensim_dictionary = corpora.Dictionary(gensim_tokenized_data)
# Convert document into the bag-of-words (BoW) format = list of (token_id, token_count) tuples.
gensim_corpus = [gensim_dictionary.doc2bow(text) for text in gensim_tokenized_data]
lda_model = models.LdaModel(corpus=gensim_corpus, num_topics=num_topic, id2word=gensim_dictionary)

In [48]:
# print("LDA Model:")
 
# for idx in range(num_topic):
#     # Print the first 10 most representative topics
#     print("Topic #%s:" % idx, lda_model.print_topic(idx, 10))
 
# print("=" * 20)

In [49]:
def most_similar(x, Z, top_n=5):
    dists = euclidean_distances(x.reshape(1, -1), Z)
    pairs = enumerate(dists[0])
    most_similar = sorted(pairs, key=lambda item: item[1])[:top_n]
    return most_similar


In [50]:
pyLDAvis.gensim.prepare(lda_model, gensim_corpus, gensim_dictionary)

The left panel, labeld Intertopic Distance Map, circles represent different topics and the distance between them. Similar topics appear closer and the dissimilar topics farther. The relative size of a topic's circle in the plot corresponds to the relative frequency of the topic in the corpus. An individual topic may be selected for closer scrutiny by clicking on its circle, or entering its number in the "selected topic" box in the upper-left.


The right panel, include the bar chart of the top 30 terms. When no topic is selected in the plot on the left, the bar chart shows the top-30 most "salient" terms in the corpus. A term's saliency is a measure of both how frequent the term is in the corpus and how "distinctive" it is in distinguishing between different topics. Selecting each topic on the right, modifies the bar chart to show the "relevant" terms for the selected topic. Relevence is defined as in footer 2 and can be tuned by parameter  λ
 , smaller  λ
  gives higher weight to the term's distinctiveness while larger  λ
 s corresponds to probablity of the term occurance per topics.

Therefore, to get a better sense of terms per topic we'll use  λ
 =0.


In [35]:
vect.stop_words_

set()