# Final Tutorial Notebook
#### Emily Gong, Robert Morrison

In this notebook we will explore the transcripts from UN General Debates. Our goal is to understand what differences exist between speakers of different language backgrounds

In [1]:
import numpy as np
import pandas as pd

import requests
import bs4
import nltk
import spacy
from spacy import displacy
import en_core_web_sm
from collections import Counter
nlp = en_core_web_sm.load()

from sklearn.feature_extraction.text import TfidfVectorizer

# Loading Data
## Raw Data

In [2]:
data = pd.read_csv("un-general-debates.csv")
data.head()

Unnamed: 0,session,year,country,text
0,44,1989,MDV,﻿It is indeed a pleasure for me and the member...
1,44,1989,FIN,"﻿\nMay I begin by congratulating you. Sir, on ..."
2,44,1989,NER,"﻿\nMr. President, it is a particular pleasure ..."
3,44,1989,URY,﻿\nDuring the debate at the fortieth session o...
4,44,1989,ZWE,﻿I should like at the outset to express my del...


In [3]:
print(len(data))

7507


In [4]:
countries = data["country"].unique()
print(countries)
print(len(countries))

['MDV' 'FIN' 'NER' 'URY' 'ZWE' 'PHL' 'SDN' 'RUS' 'CHN' 'ESP' 'SUR' 'ARG'
 'SLV' 'MYS' 'NPL' 'PRT' 'COL' 'BLR' 'MAR' 'LCA' 'EGY' 'MEX' 'BEL' 'BRN'
 'RWA' 'CAN' 'ALB' 'GRC' 'KNA' 'GUY' 'LBR' 'ATG' 'MOZ' 'JPN' 'YDYE' 'GAB'
 'BGD' 'SWE' 'TUR' 'TCD' 'SYR' 'CMR' 'JAM' 'LUX' 'ITA' 'AGO' 'CRI' 'CSK'
 'BFA' 'MNG' 'BHR' 'HTI' 'OMN' 'CIV' 'TGO' 'CYP' 'MUS' 'MMR' 'ARE' 'GTM'
 'GRD' 'LBY' 'LKA' 'TZA' 'SGP' 'NOR' 'LAO' 'ISL' 'AFG' 'CHL' 'DMA' 'UKR'
 'KEN' 'BLZ' 'FRA' 'MLI' 'VCT' 'VEN' 'MLT' 'GHA' 'GIN' 'GBR' 'ISR' 'YUG'
 'BRB' 'IRQ' 'HUN' 'AUT' 'POL' 'GNB' 'BWA' 'MRT' 'SWZ' 'DNK' 'DOM' 'MDG'
 'NIC' 'BDI' 'CUB' 'IRN' 'PAK' 'SEN' 'BGR' 'YEM' 'STP' 'NLD' 'VUT' 'BOL'
 'PNG' 'SLB' 'DEU' 'ROU' 'KHM' 'TUN' 'BRA' 'IND' 'IDN' 'AUS' 'COD' 'HND'
 'GNQ' 'FJI' 'IRL' 'DZA' 'USA' 'LSO' 'GMB' 'PER' 'DDR' 'THA' 'JOR' 'COG'
 'NGA' 'ECU' 'SAU' 'QAT' 'SYC' 'ETH' 'TTO' 'PRY' 'VNM' 'NZL' 'PAN' 'MWI'
 'DJI' 'BEN' 'SOM' 'ZMB' 'CPV' 'BHS' 'KWT' 'UGA' 'COM' 'ZAF' 'LBN' 'SLE'
 'KOR' 'BIH' 'TON' 'EU' 'HRV' 'NRU' 'TUV' 'NAM' 'S

In [5]:
years = data["year"].unique()
print(years)
print(len(years))
# 46 Examples of each country

[1989 1970 2013 1985 2008 1991 1986 2002 1975 1996 2012 1997 1978 1988
 2010 1984 1995 2009 1971 1976 1983 1979 1999 2005 1987 1982 1998 2003
 2004 1980 2014 2011 1974 2015 1993 1977 1981 2000 1992 1990 1973 1994
 1972 2006 2007 2001]
46


## Scraping

In [6]:
# Getting a decoding table and access to each countries Wikipedia page
url = "https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3"
r = requests.get(url)
soup = bs4.BeautifulSoup(r.text)

In [7]:
found = soup.find("div", class_="plainlist").find("ul")

In [8]:
lookup = {} # dict {country code : [Country Name, wikipedia link]}

for child in found.children:
    if isinstance(child, bs4.Tag):
        c_code = child.find("span").text
        link = child.find("a").get("href")
        country = child.find("a").text
        
        lookup[c_code] = [country, link]

lookup["CSK"] = ["Czechoslovakia", "/wiki/Czechoslovakia"]
lookup["YDYE"] = ["South Yemen", "/wiki/South_Yemen"]
lookup["YUG"] = ["Yugoslavia", "/wiki/Yugoslavia"]
lookup["DDR"] = ["East Germany", "/wiki/East_Germany"]
lookup["EU"] = ["European Union", "/wiki/European_Union"]

In [9]:
data["name"] = [lookup[code][0] for code in data["country"]]

# EDA

In [10]:
sample_text = data.loc[0]["text"]
sample_text = sample_text.replace(u'\ufeff', '')
print(sample_text[0:100])

It is indeed a pleasure for me and the members of my delegation to extend to Ambassador Garba our si


In [11]:
# nltk.word_tokenize(sample_text)
tokens = nltk.word_tokenize(sample_text)
print(tokens[0:10])

['It', 'is', 'indeed', 'a', 'pleasure', 'for', 'me', 'and', 'the', 'members']


In [12]:
pos = nltk.pos_tag(tokens)
pos[:10]

[('It', 'PRP'),
 ('is', 'VBZ'),
 ('indeed', 'RB'),
 ('a', 'DT'),
 ('pleasure', 'NN'),
 ('for', 'IN'),
 ('me', 'PRP'),
 ('and', 'CC'),
 ('the', 'DT'),
 ('members', 'NNS')]

In [13]:
# Don't use this bruh
chunk = nltk.ne_chunk(pos)
chunk[:20]

[('It', 'PRP'),
 ('is', 'VBZ'),
 ('indeed', 'RB'),
 ('a', 'DT'),
 ('pleasure', 'NN'),
 ('for', 'IN'),
 ('me', 'PRP'),
 ('and', 'CC'),
 ('the', 'DT'),
 ('members', 'NNS'),
 ('of', 'IN'),
 ('my', 'PRP$'),
 ('delegation', 'NN'),
 ('to', 'TO'),
 ('extend', 'VB'),
 ('to', 'TO'),
 Tree('ORGANIZATION', [('Ambassador', 'NNP'), ('Garba', 'NNP')]),
 ('our', 'PRP$'),
 ('sincere', 'JJ'),
 ('congratulations', 'NNS')]

In [22]:
nlp = en_core_web_sm.load()
nlp = spacy.load('en_core_web_sm')
doc = nlp(sample_text)
print(len(doc.ents))

print([(X.text, X.label_) for X in doc.ents[:20]])

labels = [x.label_ for x in doc.ents]
Counter(labels)

items= [x.text for x in doc.ents]
Counter(items).most_common(3)

sentences = [x for x in doc.sents]
print(sentences[21])

displacy.render(nlp(str(sentences[21])), jupyter=True, style='ent')

142
[('the General Assembly', 'ORG'), ('Assembly', 'ORG'), ('the past year', 'DATE'), ('\n', 'GPE'), ('Dante Caputo', 'PERSON'), ('forty-third', 'CARDINAL'), ('the General Assembly', 'ORG'), ('\n', 'GPE'), ('As in previous years', 'TIME'), ('the United Nations', 'ORG'), ('the Charter of the United Nations', 'LAW'), ('Organization', 'ORG'), ('recent years', 'DATE'), ('Organization', 'ORG'), ('Today', 'DATE'), ('Recent years', 'DATE'), ('-Power', 'ORG'), ('\n', 'GPE'), ('Africa', 'LOC'), ('Namibia', 'GPE')]
The 1980s have witnessed one of the longest spells of growth for the industrialized countries, while the situation in the South, particularly in the least developed countries, continues to deteriorate.


In [15]:
vect = TfidfVectorizer()

tfidf = vect.fit_transform(data["text"])

In [16]:
tfidf

<7507x54892 sparse matrix of type '<class 'numpy.float64'>'
	with 6900203 stored elements in Compressed Sparse Row format>

In [17]:
vect.stop_words_

set()