# Reading data

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('articles_bbc_2018_01_30.csv')

In [3]:
data.shape

(309, 2)

In [4]:
data = data.dropna().reset_index(drop=True)

In [5]:
data.shape

(308, 2)

In [6]:
data.head

<bound method NDFrame.head of                                               articles lang
0    Image copyright PA/EPA Image caption Oligarch ...   en
1    Husband admits killing French jogger\r\n\r\nTh...   en
2    Media playback is unsupported on your device M...   en
3    Manchester City's Leroy Sane is ruled out for ...   en
4    Image copyright AFP Image caption Sebastien Br...   en
..                                                 ...  ...
303  فيديو\r\n\r\nكيف تعبر الحدود...مثل الفيل؟!\r\n...   ar
304  بالصور\r\n\r\nمعالم لندن تحت الأضواء\r\n\r\nمع...   ar
305  يقدم لكم تلفزيون بي بي سي العربي الأخبار والأخ...   ar
306  موجات FM\r\n\r\nنبث إرسالنا على موجات إف إم في...   ar
307  Hi I am the head of product for BBC News Onlin...   en

[308 rows x 2 columns]>

# Cleaning

#### Keeping English articles

In [7]:
from langdetect import detect

In [8]:
data['lang'] = data.articles.apply(detect)

In [9]:
data.lang.value_counts()

en    257
fa      9
fr      7
id      5
uk      4
ar      4
vi      4
hi      4
ru      4
sw      3
tr      2
pt      2
es      2
de      1
Name: lang, dtype: int64

In [10]:
data = data.loc[data.lang=='en']

#### Tokenization

In [11]:
from nltk.tokenize import sent_tokenize
import warnings
warnings.filterwarnings("ignore")

In [12]:
data['sentences'] = data.articles.apply(sent_tokenize)
data['sentences'].head(1).tolist()[0][:3] # Print the first 3 sentences of the 1st article

['Image copyright PA/EPA Image caption Oligarch Roman Abramovich (l) and PM Dmitry Medvedev are on the list\r\n\r\nRussian President Vladimir Putin says a list of officials and businessmen close to the Kremlin published by the US has in effect targeted all Russian people.',
 'The list names 210 top Russians as part of a sanctions law aimed at punishing Moscow for meddling in the US election.',
 'However, the US stressed those named were not subject to new sanctions.']

In [13]:
from nltk.tokenize import word_tokenize

In [14]:
data['tokens_sentences'] = data['sentences'].apply(lambda sentences: [word_tokenize(sentence) for sentence in sentences])
print(data['tokens_sentences'].head(1).tolist()[0][:3])

[['Image', 'copyright', 'PA/EPA', 'Image', 'caption', 'Oligarch', 'Roman', 'Abramovich', '(', 'l', ')', 'and', 'PM', 'Dmitry', 'Medvedev', 'are', 'on', 'the', 'list', 'Russian', 'President', 'Vladimir', 'Putin', 'says', 'a', 'list', 'of', 'officials', 'and', 'businessmen', 'close', 'to', 'the', 'Kremlin', 'published', 'by', 'the', 'US', 'has', 'in', 'effect', 'targeted', 'all', 'Russian', 'people', '.'], ['The', 'list', 'names', '210', 'top', 'Russians', 'as', 'part', 'of', 'a', 'sanctions', 'law', 'aimed', 'at', 'punishing', 'Moscow', 'for', 'meddling', 'in', 'the', 'US', 'election', '.'], ['However', ',', 'the', 'US', 'stressed', 'those', 'named', 'were', 'not', 'subject', 'to', 'new', 'sanctions', '.']]


#### Lemmatizing with POS tagging

In [15]:
from nltk import pos_tag

In [16]:
data['POS_tokens'] = data['tokens_sentences'].apply(lambda tokens_sentences: [pos_tag(tokens) for tokens in tokens_sentences])
print(data['POS_tokens'].head(1).tolist()[0][:3])

[[('Image', 'NN'), ('copyright', 'NN'), ('PA/EPA', 'NNP'), ('Image', 'NNP'), ('caption', 'NN'), ('Oligarch', 'NNP'), ('Roman', 'NNP'), ('Abramovich', 'NNP'), ('(', '('), ('l', 'NN'), (')', ')'), ('and', 'CC'), ('PM', 'NNP'), ('Dmitry', 'NNP'), ('Medvedev', 'NNP'), ('are', 'VBP'), ('on', 'IN'), ('the', 'DT'), ('list', 'NN'), ('Russian', 'NNP'), ('President', 'NNP'), ('Vladimir', 'NNP'), ('Putin', 'NNP'), ('says', 'VBZ'), ('a', 'DT'), ('list', 'NN'), ('of', 'IN'), ('officials', 'NNS'), ('and', 'CC'), ('businessmen', 'NNS'), ('close', 'RB'), ('to', 'TO'), ('the', 'DT'), ('Kremlin', 'NNP'), ('published', 'VBN'), ('by', 'IN'), ('the', 'DT'), ('US', 'NNP'), ('has', 'VBZ'), ('in', 'IN'), ('effect', 'NN'), ('targeted', 'VBN'), ('all', 'DT'), ('Russian', 'JJ'), ('people', 'NNS'), ('.', '.')], [('The', 'DT'), ('list', 'NN'), ('names', 'RB'), ('210', 'CD'), ('top', 'JJ'), ('Russians', 'NNPS'), ('as', 'IN'), ('part', 'NN'), ('of', 'IN'), ('a', 'DT'), ('sanctions', 'NNS'), ('law', 'NN'), ('aimed', 

In [17]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [18]:
# Lemmatizing each word with its POS tag, in each sentence
data['tokens_sentences_lemmatized'] = data['POS_tokens'].apply(
    lambda list_tokens_POS: [
        [
            lemmatizer.lemmatize(el[0], get_wordnet_pos(el[1])) 
            if get_wordnet_pos(el[1]) != '' else el[0] for el in tokens_POS
        ] 
        for tokens_POS in list_tokens_POS
    ]
)

In [19]:
data['tokens_sentences_lemmatized'].head(1).tolist()[0][:3]

[['Image',
  'copyright',
  'PA/EPA',
  'Image',
  'caption',
  'Oligarch',
  'Roman',
  'Abramovich',
  '(',
  'l',
  ')',
  'and',
  'PM',
  'Dmitry',
  'Medvedev',
  'be',
  'on',
  'the',
  'list',
  'Russian',
  'President',
  'Vladimir',
  'Putin',
  'say',
  'a',
  'list',
  'of',
  'official',
  'and',
  'businessmen',
  'close',
  'to',
  'the',
  'Kremlin',
  'publish',
  'by',
  'the',
  'US',
  'have',
  'in',
  'effect',
  'target',
  'all',
  'Russian',
  'people',
  '.'],
 ['The',
  'list',
  'names',
  '210',
  'top',
  'Russians',
  'as',
  'part',
  'of',
  'a',
  'sanction',
  'law',
  'aim',
  'at',
  'punish',
  'Moscow',
  'for',
  'meddle',
  'in',
  'the',
  'US',
  'election',
  '.'],
 ['However',
  ',',
  'the',
  'US',
  'stress',
  'those',
  'name',
  'be',
  'not',
  'subject',
  'to',
  'new',
  'sanction',
  '.']]

#### Regrouping tokens and removing stop words

In [20]:
from nltk.corpus import stopwords
stopwords_verbs = ['say', 'get', 'go', 'know', 'may', 'need', 'like', 'make', 'see', 'want', 'come', 'take', 'use', 'would', 'can']
stopwords_other = ['one', 'mr', 'bbc', 'image', 'getty', 'de', 'en', 'caption', 'also', 'copyright', 'something']
my_stopwords = stopwords.words('English') + stopwords_verbs + stopwords_other

In [21]:
from itertools import chain # to flatten list of sentences of tokens into list of tokens

In [22]:
data['tokens'] = data['tokens_sentences_lemmatized'].map(lambda sentences: list(chain.from_iterable(sentences)))
data['tokens'] = data['tokens'].map(lambda tokens: [token.lower() for token in tokens if token.isalpha() 
                                                    and token.lower() not in my_stopwords and len(token)>1])

In [23]:
data['tokens'].head(1).tolist()[0][:30]

['oligarch',
 'roman',
 'abramovich',
 'pm',
 'dmitry',
 'medvedev',
 'list',
 'russian',
 'president',
 'vladimir',
 'putin',
 'list',
 'official',
 'businessmen',
 'close',
 'kremlin',
 'publish',
 'us',
 'effect',
 'target',
 'russian',
 'people',
 'list',
 'names',
 'top',
 'russians',
 'part',
 'sanction',
 'law',
 'aim']

# LDA

## Data preparation

#### Prepare bi-grams and tri-grams

In [24]:
from gensim.models import Phrases

In [25]:
tokens = data['tokens'].tolist()
bigram_model = Phrases(tokens)
trigram_model = Phrases(bigram_model[tokens], min_count=1)
tokens = list(trigram_model[bigram_model[tokens]])

#### Prepare objects for LDA gensim implementation

In [26]:
from gensim import corpora

In [27]:
dictionary_LDA = corpora.Dictionary(tokens)
dictionary_LDA.filter_extremes(no_below=3)
corpus = [dictionary_LDA.doc2bow(tok) for tok in tokens]

## Running LDA

In [28]:
from gensim import models
import numpy as np

In [29]:
np.random.seed(123456)
num_topics = 20
%time lda_model = models.LdaModel(corpus, num_topics=num_topics, \
                                  id2word=dictionary_LDA, \
                                  passes=4, alpha=[0.01]*num_topics, \
                                  eta=[0.01]*len(dictionary_LDA.keys()))

Wall time: 1.06 s


## Quick exploration of LDA results

#### Looking at topics

In [30]:
for i,topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=20):
    print(str(i)+": "+ topic)
    print()

0: 0.012*"specie" + 0.010*"prey" + 0.007*"act" + 0.006*"animal" + 0.006*"help" + 0.005*"back" + 0.005*"find" + 0.005*"become" + 0.005*"behaviour" + 0.005*"number" + 0.005*"area" + 0.005*"include" + 0.004*"order" + 0.004*"kill" + 0.004*"approach" + 0.004*"however" + 0.004*"move" + 0.004*"host" + 0.004*"evolve" + 0.004*"well"

1: 0.010*"show" + 0.008*"game" + 0.007*"another" + 0.007*"light" + 0.007*"night" + 0.006*"find" + 0.006*"could" + 0.006*"predator" + 0.006*"animal" + 0.006*"give" + 0.006*"images" + 0.006*"question" + 0.006*"however" + 0.006*"group" + 0.006*"live" + 0.006*"time" + 0.005*"transparent" + 0.005*"drug" + 0.005*"eye" + 0.005*"call"

2: 0.006*"first" + 0.006*"people" + 0.006*"work" + 0.005*"city" + 0.005*"time" + 0.005*"year" + 0.004*"day" + 0.004*"part" + 0.004*"find" + 0.004*"could" + 0.004*"us" + 0.003*"run" + 0.003*"restaurant" + 0.003*"plan" + 0.003*"home" + 0.003*"start" + 0.003*"tunnel" + 0.003*"include" + 0.003*"bring" + 0.003*"dish"

3: 0.014*"separatist" + 0.01

#### Allocating topics to documents

In [31]:
print(data.articles.loc[0][:500])

Image copyright PA/EPA Image caption Oligarch Roman Abramovich (l) and PM Dmitry Medvedev are on the list

Russian President Vladimir Putin says a list of officials and businessmen close to the Kremlin published by the US has in effect targeted all Russian people.

The list names 210 top Russians as part of a sanctions law aimed at punishing Moscow for meddling in the US election.

However, the US stressed those named were not subject to new sanctions.

Mr Putin said the list was an unfr


In [32]:
lda_model[corpus[0]]

[(15, 0.99830645)]

# Predicting topics on unseen document

In [37]:
document = 'open("MyFile.txt","a")'
tokens = word_tokenize(document)
topics = lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=20)
print([('Topic # : %s, Probability : %s, Topics : %s')%(el[0], round(el[1],2), topics[el[0]][1]) for el in lda_model[dictionary_LDA.doc2bow(tokens)]])

['Topic # : 7, Probability : 0.84, Topics : 0.011*"news" + 0.010*"ant" + 0.008*"twitter" + 0.007*"single" + 0.007*"live" + 0.006*"name" + 0.006*"specie" + 0.006*"small" + 0.006*"people" + 0.006*"story" + 0.005*"welcome" + 0.005*"mate" + 0.005*"effort" + 0.005*"alert" + 0.005*"receive" + 0.005*"power" + 0.005*"millipede" + 0.005*"consider" + 0.005*"pollution" + 0.005*"tiny"']
