# Imports
We import the packages that include corpora, tokenizer, lemmatizer and stemmer

In [30]:
import random
import math
import collections
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import EuroparlCorpusReader

# Import stopwords, punctuations and the first 10000 number in order to remove them from the BOW

They don't have informative meaning for our purpose

In [3]:
languages = ['english', 'italian', 'spanish', 'dutch', 'german', 'portuguese', 'finnish', 'swedish', 'greek']
numbers = []
for number in range(10000):
    numbers.append(str(number))
stop_words = list(nltk.corpus.stopwords.words(lang) for lang in languages)
punctuation = string.punctuation


Initialize stemmer and lemmatizer

In [4]:

root = '/Users/tommasodelprete/nltk_data/corpora/europarl_raw/'

english = EuroparlCorpusReader(root, '.*\.en')
documents = list()
for fileid in english.fileids():
    for sentence in english.raw(fileid).split('\n'):
        documents.append((sentence, 'eng'))

print(len(documents))
prefToLang = {'da':'danish', 'nl':'dutch', 'fi':'finnish', 'de':'german', 'fr':'french', 'it':'italian', 'pt':'portuguese', 'el':'greek', 'es':'spanish', 'sv':'swedish'}

for lang in prefToLang.keys():
    non_english = EuroparlCorpusReader(root, ".*\.{}".format(lang))
    for sentence in non_english.raw(f'{prefToLang[lang]}/ep-00-01-17.{lang}').split('\n'):
        documents.append((sentence, 'noteng'))

print(len(documents))
random.shuffle(documents)

19990
29109


In [5]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

Fetch the corpus I'll use as training set

In [6]:
words = list()
for document in documents:
    for word in word_tokenize(document[0]):
        words.append(word)

words_without_sw = []

# Stemming, lemmatization and removing stopwords, punctuation and numbers from the words we imported

In [7]:
for word in words:
    if word not in stop_words and word not in punctuation and word not in numbers:
        # print(f"{word} : {lemmatizer.lemmatize(word)} : {stemmer.stem(lemmatizer.lemmatize(word))}")
        words_without_sw.append(lemmatizer.lemmatize(stemmer.stem(word)))


# We compute the probability of the words in our corpora, then store the first most frequent 5000 in a list

In [8]:
freq = nltk.probability.FreqDist(words_without_sw)# BOW
word_feature = list(freq)[:5000]

print(word_feature)

['the', 'of', 'to', 'a', 'and', 'in', 'is', 'that', 'de', 'i', 'thi', 'for', 'we', 'it', 'on', 'be', 'which', 'are', 'not', 'have', 'with', 'la', 'will', 'european', 'by', 'ha', 'at', 'commiss', 'mr', 'en', 'que', 'all', 'an', 'would', 's', 'do', 'e', 'also', 'but', 'union', 'presid', 'should', 'le', 'state', 'can', 'no', 'must', 'our', 'there', 'you', 'parliament', 'from', 'been', 'member', 'these', 'polici', 'die', 'der', 'or', 'more', 'wa', 'like', 'region', 'what', 'council', 'one', 'van', 'propos', 'countri', 'report', 'need', 'develop', 'import', 'o', 'europ', 'if', 'so', 'right', 'da', 'veri', 'social', 'they', 'den', 'their', 'other', 'det', 'het', 'commun', 'my', 'make', 'di', 'issu', 'l', 'take', 'se', 'programm', 'u', 'area', 'concern', 'att', 'new', 'et', 'work', 'και', 'el', 'about', 'onli', 'question', 'commission', 'un', 'time', 'point', 'να', 'lo', 'om', 'peopl', 'som', 'therefor', 'who', 'y', 'amend', 'year', 'am', 'og', 'group', 'och', 'debat', 'direct', 'ja', 'becaus

# We define the function that will extract out features

In [11]:
def document_features(document):
    document_words = word_tokenize(document)
    features = {}
    for word in word_feature:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [12]:
featuresets = [(document_features(d), c) for (d,c) in documents]

In [13]:
print(len(featuresets))
train_set, test_set = featuresets[math.floor(len(featuresets)/2):], featuresets[:math.floor(len(featuresets)/2)]
classifier = nltk.NaiveBayesClassifier.train(train_set)

29109


In [49]:
print(classifier.classify(document_features("mauro")))

noteng


In [None]:
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (feats, label) in enumerate(test_set):
    print(i)
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)


# Display stats

- Precision: The number of labeled positive / real positive
- Recall: The number of labeled negative / real negative
- F-Measure:

In [44]:

print( 'Precision:', nltk.precision(refsets['eng'], testsets['eng']) )
print( 'Recall:', nltk.recall(refsets['eng'], testsets['eng']) )
print( 'F-Measure:', nltk.f_measure(refsets['eng'], testsets['eng']) )
# `'pos'` is for the "positive" (as opposed to "negative") label

Precision: 1.0
Recall: 0.9923855325117724
F-Measure: 0.9961782158302324
