# Imports
We import the packages that include corpora, tokenizer, lemmatizer and stemmer.

In [1]:
import random
import math
import collections
import nltk
import string
from nltk.metrics import ConfusionMatrix
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import EuroparlCorpusReader

# Import stopwords, punctuations and the first 10000 number in order to remove them from the BOW

We remove them because they don't have any informative meaning for our purposes.

In [2]:
languages = ['english', 'italian', 'spanish', 'dutch', 'german', 'portuguese', 'finnish', 'swedish', 'greek']
numbers = []
for number in range(10000):
    numbers.append(str(number))
stop_words = list(nltk.corpus.stopwords.words(lang) for lang in languages)
punctuation = string.punctuation


# Import the corpora we're using to train and to test our classifier

In particular, I import corpora from nltk.corpus.EuroParl, a collection that contains extracts from public speeches at the european parliament, in different languages.

Each corpus is divided into paragraphs, split by the character '\n', and each paragraph is labeled as 'eng' or 'noteng'.

Then I shuffle all the documents I stored.



In [3]:
# Be sure you set the right directory

root = 'data/europarl_raw/'

english = EuroparlCorpusReader(root, '.*\.en')
documents = list()
for fileid in english.fileids():
    for sentence in english.raw(fileid).split('\n'):
        documents.append((sentence, 'eng'))

prefToLang = {'da':'danish', 'nl':'dutch', 'fi':'finnish', 'de':'german', 'fr':'french', 'it':'italian', 'pt':'portuguese', 'el':'greek', 'es':'spanish', 'sv':'swedish'}

for lang in prefToLang.keys():
    non_english = EuroparlCorpusReader(root, ".*\.{}".format(lang))
    for sentence in non_english.raw(f'{prefToLang[lang]}/ep-00-01-17.{lang}').split('\n'):
        documents.append((sentence, 'noteng'))

random.shuffle(documents)

19990
29109


# Import the stemmer and the lemmatizer

In [4]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Tokenization
I tokenize the paragraphs stored into 'documents'.

In [5]:
words = list()
for document in documents:
    for word in word_tokenize(document[0]):
        words.append(word)


# Stemming, lemmatizating and removing stopwords, punctuation and numbers from the words we obtained by tokenization

In [6]:
words_without_sw = []

for word in words:
    if word not in stop_words and word not in punctuation and word not in numbers:
        # print(f"{word} : {lemmatizer.lemmatize(word)} : {stemmer.stem(lemmatizer.lemmatize(word))}")
        words_without_sw.append(lemmatizer.lemmatize(stemmer.stem(word)))


# Bag of Word
We compute the probability of the words in our corpora, then store the first most frequent 5000 in a list.

In [21]:
freq = nltk.probability.FreqDist(words_without_sw)# BOW
word_feature = list(freq)[:5000]

# We define the function that will extract out features
For each document, the function returns a dictionary that verifies which words, from the document we pass in input, are contained into the BOW.

In [8]:
def document_features(document):
    document_words = word_tokenize(document)
    features = {}
    for word in word_feature:
        features['contains({})'.format(word)] = (word in document_words)
    return features

# Define the featureset
A list of tuples, that associates the label 'eng' or 'noteng' to each paragraph of the corpora we imported previously.

In [9]:
featuresets = [(document_features(d), c) for (d,c) in documents]

# I initiate the Naive Bayes Classifier
- Train set: first half of the featureset
- Test set: second half of the featureset

In [10]:

train_set, test_set = featuresets[math.floor(len(featuresets)/2):], featuresets[:math.floor(len(featuresets)/2)]
classifier = nltk.NaiveBayesClassifier.train(train_set)

29109


# Testing the classifier on new sentences

In [11]:
print(classifier.classify(document_features(open('prova.txt').read())))

noteng


In [18]:
refMat = list()
testMat = list()
for i, (feats, label) in enumerate(test_set):
    refMat.append('eng') if label == 'eng' else refMat.append('noteng')
    observed = classifier.classify(feats)
    testMat.append('eng') if observed == 'eng' else testMat.append('noteng')

# Display stats

- **Precision**: The number of true positives / true positives + false positives
- **Recall**: The number of trie positives / true positives + false negatives
- **F-Measure**: 2 * Precision * Recall / Precision + Recall


In [20]:
cm = ConfusionMatrix(refMat, testMat)
print(cm.evaluate())

   Tag | Prec.  | Recall | F-measure
-------+--------+--------+-----------
   eng | 1.0000 | 0.9919 | 0.9959
noteng | 0.9824 | 1.0000 | 0.9911

Precision: 1.0
Recall: 0.9919266420811322
F-Measure: 0.995946960220165
