# Reading data

In [27]:
import pandas as pd
from langdetect import detect
from nltk.tokenize import sent_tokenize
import warnings
warnings.filterwarnings("ignore")
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from scipy import spatial
from itertools import chain
from nltk.corpus import stopwords
import gensim

In [11]:
data = pd.read_csv('articles_bbc_2018_01_30.csv')

In [12]:
data.shape

(309, 2)

In [13]:
data = data.dropna().reset_index(drop=True)
data.shape

(308, 2)

In [14]:
data.head

<bound method NDFrame.head of                                               articles lang
0    Image copyright PA/EPA Image caption Oligarch ...   en
1    Husband admits killing French jogger\r\n\r\nTh...   en
2    Media playback is unsupported on your device M...   en
3    Manchester City's Leroy Sane is ruled out for ...   en
4    Image copyright AFP Image caption Sebastien Br...   en
..                                                 ...  ...
303  فيديو\r\n\r\nكيف تعبر الحدود...مثل الفيل؟!\r\n...   ar
304  بالصور\r\n\r\nمعالم لندن تحت الأضواء\r\n\r\nمع...   ar
305  يقدم لكم تلفزيون بي بي سي العربي الأخبار والأخ...   ar
306  موجات FM\r\n\r\nنبث إرسالنا على موجات إف إم في...   ar
307  Hi I am the head of product for BBC News Onlin...   en

[308 rows x 2 columns]>

# Data Cleaning
#### Keeping English articles

In [15]:
data['lang'] = data.articles.apply(detect)

In [16]:
data.lang.value_counts()

en    257
fa      9
fr      7
id      5
hi      4
ru      4
ar      4
vi      4
uk      4
sw      3
es      2
pt      2
tr      2
de      1
Name: lang, dtype: int64

In [17]:
data = data.loc[data.lang=='en']

#### Tokenization

In [18]:
data['sentences'] = data.articles.apply(sent_tokenize)
data['sentences'].head(1).tolist()[0][:3] 

['Image copyright PA/EPA Image caption Oligarch Roman Abramovich (l) and PM Dmitry Medvedev are on the list\r\n\r\nRussian President Vladimir Putin says a list of officials and businessmen close to the Kremlin published by the US has in effect targeted all Russian people.',
 'The list names 210 top Russians as part of a sanctions law aimed at punishing Moscow for meddling in the US election.',
 'However, the US stressed those named were not subject to new sanctions.']

In [19]:
data['tokens_sentences'] = data['sentences'].apply(lambda sentences: [word_tokenize(sentence) for sentence in sentences])
print(data['tokens_sentences'].head(1).tolist()[0][:3])

[['Image', 'copyright', 'PA/EPA', 'Image', 'caption', 'Oligarch', 'Roman', 'Abramovich', '(', 'l', ')', 'and', 'PM', 'Dmitry', 'Medvedev', 'are', 'on', 'the', 'list', 'Russian', 'President', 'Vladimir', 'Putin', 'says', 'a', 'list', 'of', 'officials', 'and', 'businessmen', 'close', 'to', 'the', 'Kremlin', 'published', 'by', 'the', 'US', 'has', 'in', 'effect', 'targeted', 'all', 'Russian', 'people', '.'], ['The', 'list', 'names', '210', 'top', 'Russians', 'as', 'part', 'of', 'a', 'sanctions', 'law', 'aimed', 'at', 'punishing', 'Moscow', 'for', 'meddling', 'in', 'the', 'US', 'election', '.'], ['However', ',', 'the', 'US', 'stressed', 'those', 'named', 'were', 'not', 'subject', 'to', 'new', 'sanctions', '.']]


#### Lemmatizing with POS tagging

In [20]:
data['POS_tokens'] = data['tokens_sentences'].apply(lambda tokens_sentences: [pos_tag(tokens) for tokens in tokens_sentences])
print(data['POS_tokens'].head(1).tolist()[0][:3])

[[('Image', 'NN'), ('copyright', 'NN'), ('PA/EPA', 'NNP'), ('Image', 'NNP'), ('caption', 'NN'), ('Oligarch', 'NNP'), ('Roman', 'NNP'), ('Abramovich', 'NNP'), ('(', '('), ('l', 'NN'), (')', ')'), ('and', 'CC'), ('PM', 'NNP'), ('Dmitry', 'NNP'), ('Medvedev', 'NNP'), ('are', 'VBP'), ('on', 'IN'), ('the', 'DT'), ('list', 'NN'), ('Russian', 'NNP'), ('President', 'NNP'), ('Vladimir', 'NNP'), ('Putin', 'NNP'), ('says', 'VBZ'), ('a', 'DT'), ('list', 'NN'), ('of', 'IN'), ('officials', 'NNS'), ('and', 'CC'), ('businessmen', 'NNS'), ('close', 'RB'), ('to', 'TO'), ('the', 'DT'), ('Kremlin', 'NNP'), ('published', 'VBN'), ('by', 'IN'), ('the', 'DT'), ('US', 'NNP'), ('has', 'VBZ'), ('in', 'IN'), ('effect', 'NN'), ('targeted', 'VBN'), ('all', 'DT'), ('Russian', 'JJ'), ('people', 'NNS'), ('.', '.')], [('The', 'DT'), ('list', 'NN'), ('names', 'RB'), ('210', 'CD'), ('top', 'JJ'), ('Russians', 'NNPS'), ('as', 'IN'), ('part', 'NN'), ('of', 'IN'), ('a', 'DT'), ('sanctions', 'NNS'), ('law', 'NN'), ('aimed', 

In [21]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [22]:
# Lemmatizing each word with its POS tag, in each sentence
data['tokens_sentences_lemmatized'] = data['POS_tokens'].apply(
    lambda list_tokens_POS: [
        [
            lemmatizer.lemmatize(el[0], get_wordnet_pos(el[1])) 
            if get_wordnet_pos(el[1]) != '' else el[0] for el in tokens_POS
        ] 
        for tokens_POS in list_tokens_POS
    ]
)

In [23]:
data['tokens_sentences_lemmatized'].head(1).tolist()[0][:3]

[['Image',
  'copyright',
  'PA/EPA',
  'Image',
  'caption',
  'Oligarch',
  'Roman',
  'Abramovich',
  '(',
  'l',
  ')',
  'and',
  'PM',
  'Dmitry',
  'Medvedev',
  'be',
  'on',
  'the',
  'list',
  'Russian',
  'President',
  'Vladimir',
  'Putin',
  'say',
  'a',
  'list',
  'of',
  'official',
  'and',
  'businessmen',
  'close',
  'to',
  'the',
  'Kremlin',
  'publish',
  'by',
  'the',
  'US',
  'have',
  'in',
  'effect',
  'target',
  'all',
  'Russian',
  'people',
  '.'],
 ['The',
  'list',
  'names',
  '210',
  'top',
  'Russians',
  'as',
  'part',
  'of',
  'a',
  'sanction',
  'law',
  'aim',
  'at',
  'punish',
  'Moscow',
  'for',
  'meddle',
  'in',
  'the',
  'US',
  'election',
  '.'],
 ['However',
  ',',
  'the',
  'US',
  'stress',
  'those',
  'name',
  'be',
  'not',
  'subject',
  'to',
  'new',
  'sanction',
  '.']]

#### Regrouping tokens and removing stop words

In [25]:
stopwords_verbs = ['say', 'get', 'go', 'know', 'may', 'need', 'like', 'make', 'see', 'want', 'come', 'take', 'use', 'would', 'can']
stopwords_other = ['one', 'mr', 'bbc', 'image', 'getty', 'de', 'en', 'caption', 'also', 'copyright', 'something']
my_stopwords = stopwords.words('English') + stopwords_verbs + stopwords_other

In [28]:
data['tokens'] = data['tokens_sentences_lemmatized'].map(lambda sentences: list(chain.from_iterable(sentences)))
data['tokens'] = data['tokens'].map(lambda tokens: [token.lower() for token in tokens if token.isalpha() 
                                                    and token.lower() not in my_stopwords and len(token)>1])
tokens = data['tokens'].tolist()

#### Preparing training data

In [29]:
def read_corpus(fname, tokens_only=False):
    for i, line in enumerate(fname):
            #print(line,i)
            yield gensim.models.doc2vec.TaggedDocument(line, [i])
                


In [30]:
train_corpus = list(read_corpus(tokens))

#### Training Model

In [31]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=20,alpha=0.025, min_alpha=0.00025,min_count=2, epochs=20)
model.build_vocab(train_corpus)

In [32]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

#### Testing Model

In [33]:
article1= open('ar3.txt')
first_text =article1.read()
article2 = open('ar4.txt')
second_text=article2.read()
#print(first_text)
f_tokens = gensim.utils.simple_preprocess(first_text)
s_tokens = gensim.utils.simple_preprocess(second_text)

#generate tokens
vec1 = model.infer_vector(f_tokens)#(test_corpus[44])
vec2 = model.infer_vector(s_tokens)#(train_corpus[209].words)

#find cosine similarity
similairty = spatial.distance.cosine(vec1, vec2)
print(1-similairty)

0.9957830905914307
