# Cerinta 1 - Reprezentarea modelului de clasificare

In [1]:
from functools import reduce
from json import dumps
from math import inf, log
from operator import add
from os import listdir
from random import shuffle

from nltk import download
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from spacy import load

download('wordnet')
download('stopwords')

[nltk_data] Downloading package wordnet to /home/teo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/teo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
class NewsClassificationModel:
	def __init__(self, stop_words_file):
		self._ALPHA = 1
		self._chars_to_remove = '012345679.,-~`|\\/:;\'"?![]()\n\"'
		self.class_words = {}
		self.class_app = {}
		self.total_files = 0
		self.stop_pos = {'PART', 'DET', 'PUNCT', 'AUX'}

		self.wnl = WordNetLemmatizer()
		self.nlp = load('en')

		self.stop_words = set(stopwords.words('english')) | set(' ')
		with open(stop_words_file) as f:
			self.stop_words.union(set(token.replace('\n', '') for token in f))


	def _remove_chars(self, s):
		for ch in self._chars_to_remove:
			s = s.replace(ch, '')

		return s


	def _parse_file(self, file):
		words = []
		# TODO: fa mai functional
		with open(file, encoding='utf8') as f:
			for tok in self.nlp(f.read().replace('\n', ' ')):
				if (tok.lemma_ not in self.stop_words
					and tok.pos_ not in self.stop_pos
					and not tok.is_stop
				):
					stripped_lem = self._remove_chars(tok.lemma_).lower()
					if stripped_lem:
						words.append(self.wnl.lemmatize(stripped_lem))

		return filter(lambda l: l not in self.stop_words, words)


	def add_file(self, file, clss):
		self.total_files += 1

		if clss in self.class_app:
			self.class_app[clss] += 1
		else:
			self.class_app[clss] = 1
			self.class_words[clss] = {}

		for lem in self._parse_file(file):
			if lem in self.class_words[clss]:
				self.class_words[clss][lem] += 1
			else:
				self.class_words[clss][lem] = 1


	def _compute_log_prob(self, word, cl):
		return log(
			(self.class_words[cl].get(word, 0) + self._ALPHA)
			/ (len(self.class_words[cl]) + len(self.class_words) * self._ALPHA)
		)


	def run_inference(self, file):
		max_prob = -inf
		lems = list(self._parse_file(file))  # TODO: incearca fara list()

		for cl in self.class_words:
			cl_prob = reduce(
				add,
				map(lambda l: self._compute_log_prob(l, cl), lems),
				0
			) + log(self.class_app[cl] / self.total_files)

			if max_prob < cl_prob:
				max_prob = cl_prob
				pred = cl
		
		return pred

	def print(self, file):
		with open(file, 'w') as f:
			f.write("class_app:\n")
			f.write(dumps(self.class_app, indent=4) + '\n')
			f.write("class_words:\n")
			f.write(dumps(self.class_words, indent=4) + '\n')

In [3]:
def create_classification_model(path, stop_words_file):
    model = NewsClassificationModel(stop_words_file)
    test_files = {}

    for clss in listdir(path):
        files_in_class = listdir(f'{path}/{clss}')
        shuffle(files_in_class)
        
        num_files = len(files_in_class)

        test_files[clss] = files_in_class[:int(num_files / 4)]
        train_files = files_in_class[int(num_files / 4):]

        [model.add_file(f'{path}/{clss}/{f}', clss) for f in train_files]
    
    return model, test_files

In [4]:
def test_classification(path, stop_words_file):
    model, test_files = create_classification_model(path, stop_words_file)
    total_preds = {clss: 0 for clss in test_files}
    correct_preds = {}
    recalls = {}

    model.print('model_spacey')

    for cl, files in test_files.items():
        preds = list(map(lambda f: model.run_inference(f'{path}/{cl}/{f}'), files))
        correct_preds[cl] = len(list(filter(lambda c: c == cl, preds)))
        recalls[cl] = correct_preds[cl] / len(files)

        # TODO: cauta cum sa faci functional
        for pred in preds:
            total_preds[pred] += 1
    
    for clss in test_files:
        print(f'====== {clss} ======')
        print(f'Recall = {recalls[clss]}')
        print(f'Precision = {correct_preds[clss] / total_preds[clss]}\n')

In [5]:
test_classification('BBC News Summary/News Articles', 'stop_words')

Recall = 0.9291338582677166
Precision = 0.9915966386554622

Recall = 0.8125
Precision = 1.0

Recall = 0.9903846153846154
Precision = 0.8956521739130435

Recall = 1.0
Precision = 1.0

Recall = 1.0
Precision = 0.8695652173913043



# Cele mai bune rezultate - Cerinta 2
```
====== business ======
Recall = 0.952755905511811
Precision = 0.9918032786885246

====== entertainment ======
Recall = 0.8541666666666666
Precision = 1.0

====== politics ======
Recall = 0.9807692307692307
Precision = 0.9272727272727272

====== sport ======
Recall = 0.9921259842519685
Precision = 1.0

====== tech ======
Recall = 1.0
Precision = 0.8771929824561403
```