In [1]:
from __future__ import unicode_literals

import json
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.optimize as opt
from functools import reduce
from operator import add
from hazm import *
from utils.preprocessing import *
from utils.models import *
from copy import deepcopy


In [2]:
# Data root path
data_root = 'data'
fars_file = 'farsnews.jsonl'
asriran_file = 'asriran.jsonl'
# Dataset dataframe column names
keys = None

# News headline tags
not_valid_labels = None

# News agencies
news_agencies = None

# Importind Data

In [3]:
with open(os.path.join(data_root, asriran_file), encoding='utf-8') as jd:
    asriran = [json.loads(line) for line in jd]
    asriran = pd.DataFrame(asriran)
print('Number of Datapoints: {}'.format(len(asriran)))

Number of Datapoints: 15000


In [4]:
with open(os.path.join(data_root, fars_file), encoding='utf-8') as jd:
    fars = [json.loads(line) for line in jd]
    fars = pd.DataFrame(fars)
print('Number of Datapoints: {}'.format(len(fars)))

Number of Datapoints: 15000


# Preprocessing

Finding Valid Labels:

In [5]:
asr_labels = list(set(reduce(np.append, asriran.newsPathLinks.apply(lambda x: tuple(x.keys())))))
fars_labels = list(set(reduce(np.append, fars.newsPathLinks.apply(lambda x: list(x.keys())))))

In [6]:
set((list(asr_labels) + list(fars_labels)))

Some labels are not valid so:

In [14]:
not_valid_labels = [
     'دانلود',
     'ساير حوزه ها',
     'سایر حوزه ها',
     'دیگر رسانه ها',
     'نامشخص',
     'پیامک',
     'صفحه نخست',
     'عصرايران دو',
]
valid_labels = list(filter(lambda x: x not in not_valid_labels, list(set((list(asr_labels) + list(fars_labels))))))

Creating Documents & Labels:

In [15]:
asriran_tags = asriran['tags'].apply(lambda x: ' '.join(list(x.keys())))
fars_tags = fars['tags'].apply(lambda x: ' '.join(list(x.keys())))

In [16]:
title_count = 2
tag_count = 10
documents = np.append(asriran['body'] + ' ' + asriran['title'] * title_count + asriran_tags*tag_count,
                        fars['body'] + ' ' + fars['title'] * title_count + fars_tags*tag_count)
raw_labels = np.append(asriran.newsPathLinks.apply(lambda x: tuple(x.keys())),
                        fars.newsPathLinks.apply(lambda x: tuple(x.keys())))
org_labels = np.append( ['AsrIran'] * len(asriran), ['Fars'] * len(fars)) # For the third task

Removing Documents which are emtpy:

In [17]:
none_zero_docs = list(map(lambda x: len(x) > 1, documents))
documents = documents[none_zero_docs]
raw_labels = cleans_labels(raw_labels[none_zero_docs], valid_labels)
org_labels = org_labels[none_zero_docs]

Duplicating documents for each of their labels:

In [19]:
proc_documents, proc_labels = extend_labels(documents, raw_labels, valid_labels)

Normalizing & Tokenizing & Removing Stopwords Documents:

In [20]:
normalizer = Normalizer()
word_filter = WordFilter()
documents = list(pd.Series(documents).apply(normalizer.normalize).apply(tokenize).apply(word_filter.filter_words))
proc_documents = list(proc_documents.apply(normalizer.normalize).apply(tokenize).apply(word_filter.filter_words))

Replacing words with less than 2 occurances with unknown word

In [21]:
documents = make_unknown(documents)
proc_documents = make_unknown(proc_documents)

created
created


Making documents one hot encoded

In [22]:
label_set, proc_labels = one_hot_encoder(proc_labels)
label_set_th, org_labels = one_hot_encoder(org_labels)

Deviding document to train and test datasets:

In [23]:
x_train, y_train, x_test, y_test = train_test_split(proc_documents , proc_labels, train_size = 0.80, random_state=85)
x_train_th, y_train_th, x_test_th, y_test_th = train_test_split(documents , org_labels, train_size = 0.80, random_state=85)

# Creating Model for the first Task

Training:

In [24]:
nb = NaiveBayes()
nb.fit(x_train, y_train)

Vocab created
P(c) calculated
93
%0.0 continue...
%0.010752688172043012 continue...
%0.021505376344086023 continue...
%0.03225806451612903 continue...
%0.043010752688172046 continue...
%0.053763440860215055 continue...
%0.06451612903225806 continue...
%0.07526881720430108 continue...
%0.08602150537634409 continue...
%0.0967741935483871 continue...
%0.10752688172043011 continue...
%0.11827956989247312 continue...
%0.12903225806451613 continue...
%0.13978494623655913 continue...
%0.15053763440860216 continue...
%0.16129032258064516 continue...
%0.17204301075268819 continue...
%0.1827956989247312 continue...
%0.1935483870967742 continue...
%0.20430107526881722 continue...
%0.21505376344086022 continue...
%0.22580645161290322 continue...
%0.23655913978494625 continue...
%0.24731182795698925 continue...
%0.25806451612903225 continue...
%0.26881720430107525 continue...
%0.27956989247311825 continue...
%0.2903225806451613 continue...
%0.3010752688172043 continue...
%0.3118279569892473 continu

Train Evaluation:

In [25]:
nb.evaluate(x_train, y_train, label_set=label_set)

%0 continue...
%1000 continue...
%2000 continue...
%3000 continue...
%4000 continue...
%5000 continue...
%6000 continue...
%7000 continue...
%8000 continue...
%9000 continue...
%10000 continue...
%11000 continue...
%12000 continue...
%13000 continue...
%14000 continue...
%15000 continue...
%16000 continue...
%17000 continue...
%18000 continue...
%19000 continue...
%20000 continue...
%21000 continue...
%22000 continue...
%23000 continue...
%24000 continue...
%25000 continue...
%26000 continue...
%27000 continue...
%28000 continue...
%29000 continue...
%30000 continue...
%31000 continue...
Label مسئولیت های اجتماعی: 
     Precision: 0.4745762711864407
     Recall: 1.0
     F1-Measure: 0.6436781609195402
Label صنعت ، تجارت ، بازرگانی: 
     Precision: 0.5139664804469274
     Recall: 0.9787234042553191
     F1-Measure: 0.673992673992674
Label ایران در جهان: 
     Precision: 0.5211267605633803
     Recall: 0.9024390243902439
     F1-Measure: 0.6607142857142856
Label شهری: 
     Precision: 0

  percision = confusion_matrix.diagonal()/np.sum(confusion_matrix, axis=1)


Test Evaluation:

In [26]:
nb.evaluate(x_test, y_test, label_set=label_set)

%0 continue...
%1000 continue...
%2000 continue...
%3000 continue...
%4000 continue...
%5000 continue...
%6000 continue...
%7000 continue...
Label مسئولیت های اجتماعی: 
     Precision: 0.5384615384615384
     Recall: 1.0
     F1-Measure: 0.7000000000000001
Label صنعت ، تجارت ، بازرگانی: 
     Precision: 0.358974358974359
     Recall: 0.7777777777777778
     F1-Measure: 0.49122807017543857
Label ایران در جهان: 
     Precision: 0.38461538461538464
     Recall: 0.5555555555555556
     F1-Measure: 0.4545454545454546
Label شهری: 
     Precision: 0.46551724137931033
     Recall: 0.8852459016393442
     F1-Measure: 0.6101694915254237
Label غرب از نگاه غرب: 
     Precision: 1.0
     Recall: 1.0
     F1-Measure: 1.0
Label خانواده: 
     Precision: 1.0
     Recall: 0.3333333333333333
     F1-Measure: 0.5
Label تور و توپ: 
     Precision: 0.4186046511627907
     Recall: 0.6
     F1-Measure: 0.49315068493150693
Label فوتبال ایران: 
     Precision: 0.4357429718875502
     Recall: 0.7331081081081081

  percision = confusion_matrix.diagonal()/np.sum(confusion_matrix, axis=1)
  recall = confusion_matrix.diagonal()/np.sum(confusion_matrix, axis=0)
  f1_measure = 2*percision*recall/(percision+recall)


# Creating Model for the twoTask

In [50]:
t = []
for i, raw_label in enumerate(raw_labels):
    l = []
    for j, label in enumerate(raw_label):
        l.append(np.argmax(label == label_set))
    t.append(l)

In [28]:
nb.evaluate(documents, t, label_set, eval_type='multiple')

%0 continue...
%1000 continue...
%2000 continue...
%3000 continue...
%4000 continue...
%5000 continue...
%6000 continue...
%7000 continue...
%8000 continue...
%9000 continue...
%10000 continue...
%11000 continue...
%12000 continue...
%13000 continue...
%14000 continue...
%15000 continue...
%16000 continue...
%17000 continue...
%18000 continue...
%19000 continue...
%20000 continue...
%21000 continue...
%22000 continue...
%23000 continue...
%24000 continue...
%25000 continue...
%26000 continue...
%27000 continue...
%28000 continue...
%29000 continue...
Total Score: -88182


# Creating Model for the third Task

Training:

In [29]:
nb_th = NaiveBayes()
nb_th.fit(x_train_th, y_train_th)

Vocab created
P(c) calculated
2
%0.0 continue...
%0.5 continue...
P(w|c) calculated


Train Evaluation:

In [30]:
nb_th.evaluate(x_train_th, y_train_th, label_set_th)

%0 continue...
%1000 continue...
%2000 continue...
%3000 continue...
%4000 continue...
%5000 continue...
%6000 continue...
%7000 continue...
%8000 continue...
%9000 continue...
%10000 continue...
%11000 continue...
%12000 continue...
%13000 continue...
%14000 continue...
%15000 continue...
%16000 continue...
%17000 continue...
%18000 continue...
%19000 continue...
%20000 continue...
%21000 continue...
%22000 continue...
%23000 continue...
Label AsrIran: 
     Precision: 0.965974765974766
     Recall: 0.9862865691489362
     F1-Measure: 0.9760250030842621
Label Fars: 
     Precision: 0.9853072128227961
     Recall: 0.9635983627971785
     F1-Measure: 0.9743318804209042
Total Accuracy: 0.9752073144801191


Test Evaluation:

In [31]:
nb_th.evaluate(x_test_th, y_test_th, label_set_th)

%0 continue...
%1000 continue...
%2000 continue...
%3000 continue...
%4000 continue...
%5000 continue...
Label AsrIran: 
     Precision: 0.9382040553588671
     Recall: 0.9821428571428571
     F1-Measure: 0.959670781893004
Label Fars: 
     Precision: 0.9808802308802309
     Recall: 0.9340432840948127
     F1-Measure: 0.9568889670948443
Total Accuracy: 0.9583262459601973
