In [1]:
import os
import copy
import nltk
import random
import pickle, json
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from stop_words import get_stop_words
from nltk.tokenize import TweetTokenizer
import re

from ukrainian_stemmer import UkrainianStemmer

In [2]:
stemmer = UkrainianStemmer()
all_stopwords = []
all_stopwords.extend(get_stop_words("ukrainian"))
all_stopwords.extend(get_stop_words("english"))
all_stopwords.extend(get_stop_words("russian"))

In [3]:
categories = dict()
with open("untitled.txt", 'r', encoding='utf-8') as f:
    for i in f.readlines():
        if " = " in i:
            tup = i.strip().split(" = ")
            categories[tup[0]] = tup[1]
print(categories)            

{'Олег Ляшко': 'dataOVLiashkoEXT.json', 'Михайло Ткач': 'dataMychailoTkachEXT.json', 'Петро Порошенко': 'dataPetroPoroshenko.json', 'Роман Скрипін': 'dataSkrypinEXT.json', 'Павло Шеремета': 'dataSheremetaEXT.json', 'Михайло Саакашвілі': 'dataSaakashviliEXT.json', 'Майкл Щур': 'dataMichaelSchur.json'}


In [4]:
def preprocess(file):
    cur_tweets = file.split('"]["')
    for i in range(len(cur_tweets)):
#         print(cur_tweets[i])
        # З Прем'єр-міністром Баварії Хорстом Зеєхофером говорили про збереження санкційного тиску на Москву https://t.co/r0RlPemAUH
        cur_tweets[i] = cur_tweets[i].replace("\\n", "").replace("'","").lower()
        links = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cur_tweets[i])
        for link in links:
            cur_tweets[i] = cur_tweets[i].replace(link, "")
        cur_tweets[i] = ''.join([k for k in cur_tweets[i] if not k.isdigit()])
        tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True) 
        tokens = tokenizer.tokenize(cur_tweets[i])
#         print(tokens)
        for t in range(len(tokens)):
            tokens[t] = stemmer.stem_word(tokens[t])
        cur_tweets[i] = " ".join([w for w in tokens if w not in all_stopwords])
#         print("--", cur_tweets[i])
        # премєр-міністром баварії хорстом зеєхофером говорили збереження санкційного тиску москву

    return cur_tweets


In [5]:
def read_all_files(src):  # src - dict()
    docs, labels = [], []
    for k, v in src.items():
        with open("JSONs/" + v, 'r', encoding='utf-8') as tweets:
            docs.append(preprocess(tweets.read()[2:]))
            labels.append(k)
    return docs, labels

In [6]:
all_tweets, all_labels = read_all_files(categories)

print(len(all_tweets))

print(all_tweets[:3])

In [7]:
ind = 0
all_tr_tuples, all_te_tuples = [], []
for tweets in all_tweets:      # e.g. 1000 tweets in each label
    test_size = int(0.4 * len(tweets)) #  then 400 tweets for the test set
    train_size = len(tweets) - test_size   # 600 tweets for the train set
    tr_d = [tweets[i] for i in range(train_size)]  # 0-399
    te_d = [tweets[i] for i in range(train_size, train_size + test_size)]  # 400-999   
    
    for i in tr_d:
        all_tr_tuples.append((all_labels[ind], i)) 
    for i in te_d:
        all_te_tuples.append((all_labels[ind], i)) 
        
    ind += 1

In [8]:
shuffled_tr = all_tr_tuples
shuffled_te = all_te_tuples
random.shuffle(shuffled_tr)
random.shuffle(shuffled_te)

In [9]:
train_data, test_data = [], []
train_labels, test_labels = [], []
for i in shuffled_tr:
    train_data.append(i[1])
    train_labels.append(i[0])

for i in shuffled_te:
    test_data.append(i[1])
    test_labels.append(i[0])


In [10]:
vect = TfidfVectorizer(max_features=200, max_df=0.7, analyzer="word").fit(train_data)

In [11]:
train_set = [vect.transform([" ".join(text.split())]).toarray()[0] for text in train_data]

In [12]:
test_set = [vect.transform([" ".join(text.split())]).toarray()[0] for text in test_data]

### Multinomial Naive Bayes

In [13]:
clf_MNB = MultinomialNB()

In [14]:
clf_MNB.fit(train_set, train_labels)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [15]:
clf_MNB.score(train_set, train_labels)

0.49512323020450971

In [16]:
pred_MNB_test = clf_MNB.predict(test_set)
score_MNB_test = accuracy_score(test_labels, pred_MNB_test)
print(pred_MNB_test, " --- ", score_MNB_test)

['Майкл Щур' 'Олег Ляшко' 'Майкл Щур' ..., 'Петро Порошенко'
 'Петро Порошенко' 'Петро Порошенко']  ---  0.430730478589


### SVM 

In [17]:
clf_svm = svm.SVC(C=4.0, kernel='linear')

In [18]:
clf_svm.fit(train_set, train_labels)

SVC(C=4.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [19]:
clf_svm.score(train_set, train_labels)

0.53256423702149969

In [20]:
pred_svm_test = clf_svm.predict(test_set)
score_svm_test = accuracy_score(test_labels, pred_svm_test)
print(pred_svm_test, " --- ", score_svm_test)

['Майкл Щур' 'Олег Ляшко' 'Майкл Щур' ..., 'Роман Скрипін' 'Роман Скрипін'
 'Роман Скрипін']  ---  0.454817380353


In [21]:
new_tweet = 'Видео \"Дохід УЄФА від проведення Евро 2012\" (http://t.co/dm9o03AT) на @YouTube загружено.'

In [22]:
new_tweet = preprocess(new_tweet)
print(new_tweet)

['виде " дохід уєф проведенн евр " ( загружен .']


In [23]:
vectorized = [vect.transform([" ".join(new_tweet)]).toarray()[0]]

In [24]:
clf_svm.predict(vectorized)

array(['Роман Скрипін'], 
      dtype='<U18')