In [1]:
import re
import math
from collections import defaultdict
from operator import itemgetter

In [2]:
regex_string = re.compile(r"(?:(?:[^а-яА-Я]+')|(?:'[^а-яА-Я]+))|(?:[^а-яА-Я']+)")
words_number_leave = 50000
stop_words = []

In [3]:
with open('stop_words.txt', 'r', encoding='UTF-8') as f:
    stop_words = f.read().splitlines()
stop_words.append("")

In [4]:
words_per_article = [defaultdict(int) for i in range(60000)]
all_words = set()
themes = []
with open("news_train.txt", "r", encoding="utf-8") as train_text:
    for index, article in enumerate(train_text):
        split = article.split('\t')
        theme = split[0]
        themes.append(theme)
        text = (split[1] + " " + split[2]).lower()
        words = regex_string.split(text)
        for word in words:
            if word not in stop_words:
                words_per_article[index][word] += 1
                all_words.add(word)

In [5]:
words_per_article[10]

defaultdict(int,
            {'предложит': 1,
             'оон': 4,
             'способ': 1,
             'защиты': 2,
             'интернет': 3,
             'революций': 2,
             'совет': 1,
             'безопасности': 3,
             'рф': 2,
             'совместно': 1,
             'мидом': 1,
             'разработал': 1,
             'проект': 3,
             'конвенции': 4,
             'призванной': 1,
             'оградить': 1,
             'государства': 5,
             'вмешательства': 2,
             'киберпространство': 1,
             'извне': 1,
             'документа': 3,
             'оказался': 1,
             'распоряжении': 1,
             'газеты': 1,
             'коммерсантъ': 1,
             'документ': 1,
             'обеспечении': 1,
             'международной': 1,
             'информационной': 1,
             'представлен': 1,
             'закрытой': 1,
             'встрече': 1,
             'руководителей': 1,
             'спецслужб': 1,


In [6]:
from sys import getsizeof
getsizeof(all_words)

8388832

In [7]:
def computeTF(wordDict, lenWords):
    tfDict = defaultdict(float)
    for word, count in wordDict.items():
        tfDict[word] = count/lenWords
    return tfDict

In [8]:
wordsLen = len(all_words)
tf_per_article = [defaultdict(float) for i in range(60000)]
for index, wordsDict in enumerate(words_per_article):
    tf_per_article[index] = computeTF(wordsDict, float(wordsLen))

In [9]:
def computeIDF(docList, all_words, N):
    idfDict = dict.fromkeys(all_words, 0)
    for doc in docList:
        for word, val in doc.items():
            idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log10(N / float(val))
        
    return idfDict

In [10]:
idfs = computeIDF(words_per_article, all_words, float(60000))

In [11]:
def computeTFIDF(articleTF, idfs):
    tfidf = defaultdict(float)
    for word, val in articleTF.items():
        tfidf[word] = val*idfs[word]
    return tfidf

In [12]:
tf_idf_per_article = [defaultdict(float) for i in range(60000)]
for index, articleTF in enumerate(tf_per_article):
    tf_idf_per_article[index] = computeTFIDF(articleTF, idfs)

In [13]:
%load_ext autoreload
%autoreload 2

In [38]:
from knn import KNeighborsClassifier


In [15]:
%connect_info


{
  "shell_port": 57561,
  "iopub_port": 57562,
  "stdin_port": 57563,
  "control_port": 57564,
  "hb_port": 57565,
  "ip": "127.0.0.1",
  "key": "6cea8e45-6d02e1e5facd22da295ff9dd",
  "transport": "tcp",
  "signature_scheme": "hmac-sha256",
  "kernel_name": ""
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-80818862-235d-4ef5-8d8c-73b24c809089.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.


In [16]:
import sys
sys.executable

'C:\\Users\\i503708\\AppData\\Local\\Continuum\\anaconda3\\envs\\dobrynin\\python.exe'

In [49]:
def train_test_split(X, y):
    return X[:59900], X[59900:], y[:59900], y[59900:]

In [50]:
X_train, X_test, y_train, y_test = train_test_split(tf_idf_per_article, themes)

In [51]:

# unit tests
knn = KNeighborsClassifier(all_words)


In [52]:
knn.fit(X_train, y_train)

In [53]:
prediction = knn.test(X_test, y_test)

1.6644935607910156
1.6735689640045166
1.6904408931732178
1.6247024536132812
1.3964598178863525
1.507927417755127
1.8460643291473389
1.9777147769927979
2.006636619567871
2.710754871368408
1.7991905212402344
1.5050444602966309
1.4551122188568115
1.6047091484069824
2.3158226013183594
2.6558897495269775
1.7253875732421875
1.2007899284362793
1.669539451599121
1.6834993362426758
1.5861809253692627
1.8300530910491943
1.5967755317687988
1.6884422302246094
1.9178738594055176
1.729377031326294
2.0674726963043213
1.4233348369598389
1.6595635414123535
1.7084791660308838
1.2615830898284912
1.8271148204803467
1.8939380645751953
1.8271598815917969
2.113402843475342
1.439129114151001
2.889277696609497
1.889948844909668
1.7034873962402344
1.6555759906768799
1.7722232341766357
1.8281128406524658
2.7307002544403076
1.5339021682739258
1.7503678798675537
1.8889408111572266
1.9228613376617432
1.9129064083099365
1.7962408065795898
1.7253944873809814
1.451160192489624
1.95988130569458
1.5927410125732422
1.664

0.17

In [24]:
y_test[:10]

['sport',
 'science',
 'economics',
 'media',
 'media',
 'culture',
 'economics',
 'economics',
 'life',
 'life']

In [None]:
sort_by_freq = dict(list(sorted(all_words.items(), key=lambda x: x[1], reverse=True))[:words_number_leave])

In [None]:
word_index = dict((k,i) for i,k in enumerate(sort_by_freq))

In [None]:
count_matrix = [[[0]*len(sort_by_freq)] for _ in range(number_of_rows)]

In [None]:
with open("news_train.txt", "r", encoding="utf-8") as train_text:
    for ith, article in enumerate(train_text):
        split = article.split('\t')
        text = (split[1] + " " + split[2]).lower()
        words = regex_string.split(text)
        for word in words:
            if word in sort_by_freq:
                count_matrix[ith][word_index[word]] += 1

In [None]:
count_matrix[:][0]

In [None]:
{k: all_words[k] for v, k in list())[:100]}

In [None]:
themes_enc = dict((k,i) for i,k in enumerate(set(themes)))

In [None]:
themes_transformed = [themes_enc[theme] for theme in themes]

In [None]:
themes_transformed[:5]

In [None]:
columns = dict((k,i) for i,k in enumerate(all_words))

In [None]:
def get_most_common_item(array):
    count_dict = defaultdict(int)
    for key in array:
        count_dict[key] += 1
    key, count = max(count_dict.items(), key=itemgetter(1))
    return key

In [None]:
def dot(A,B): 
    return (sum(a*b for a,b in zip(A,B)))

In [None]:
def cosine_similarity(a,b):
    return dot(a,b) / ( (dot(a,a) **.5) * (dot(b,b) ** .5) )

In [None]:
def euclidean_dist(words_train, words_test): 
    #return len(words_train) + len(words_test) - 2 * len(words_train & words_test)
    train_row = [0] * len(columns)
    print("RAZ")
    for word_train in words_train:
        train_row[columns[word_train]] = 1
    print("DVA")
    test_row = [0] * len(columns)
    for word_test in words_test:
        test_row[columns[word_test]] = 1
    print("TRI!")
    return cosine_similarity(train_row, test_row)#math.sqrt(sum([(train_row[i]-test_row[i])**2 for i, _ in enumerate(train_row)]) )

In [None]:
def knn(k=1):
    y_test = []
    with open("news_test.txt", "r", encoding="utf-8") as test_text:
        for test_article in test_text:
            split = article.split('\t')
            text = split[1] + " " + split[2]
            test_words = set(re.split("(\w[\w']*\w|\w)", text))
            test_words &= all_words
            eucl_dist = [euclidean_dist(train_words, test_words) for train_words in words_per_article]
            sorted_eucl_dist = sorted(eucl_dist)
            closest_knn = [eucl_dist.index(sorted_eucl_dist[i]) for i in range(0, k)] if k > 1 else [eucl_dist.index(min(eucl_dist))]
            closest_labels_knn = [themes_transformed[x] for x in closest_knn]
            y_test.append(get_most_common_item(closest_labels_knn))
            print(get_most_common_item(closest_labels_knn))
        
    return y_test

In [None]:
knn(k=100)