# NLP FIRST ASSIGNMENT
Developing a Naïve Bayas Classifier able to distinguish between english and not-english

## Preliminary imports

In [26]:
import nltk
import random
import time
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

ROOT = '/Users/sebastianodarconso/Desktop/magistrale_lab/natural_language_processing/first_assingnment/europarl_raw'

## Extracting english and not-english files from the europarl_raw dataset

In [27]:
eng = open(ROOT + "/english/ep-00-01-17.en", 'r').read()
eng += open(ROOT + "/english/ep-00-01-18.en", 'r').read()
eng += open(ROOT + "/english/ep-00-01-19.en", 'r').read()
eng += open(ROOT + "/english/ep-00-01-21.en", 'r').read()
eng += open(ROOT + "/english/ep-00-02-02.en", 'r').read()
eng += open(ROOT + "/english/ep-00-02-03.en", 'r').read()

not_eng = open(ROOT + "/german/ep-00-01-17.de", 'r').read()
not_eng += open(ROOT + "/french/ep-00-01-17.fr", 'r').read()
not_eng += open(ROOT + "/finnish/ep-00-01-17.fi", 'r').read()
not_eng += open(ROOT + "/greek/ep-00-01-17.el", 'r').read()
not_eng += open(ROOT + "/italian/ep-00-01-17.it", 'r').read()
not_eng += open(ROOT + "/swedish/ep-00-01-17.sv", 'r').read()
not_eng += open(ROOT + "/dutch/ep-00-01-18.nl", 'r').read()

In [28]:
print("Number of words in the english documents: {}".format(len(eng)))
print("Number of words in the not english documents: {}".format(len(not_eng)))

Number of words in the english documents: 1400752
Number of words in the not english documents: 1503692


# Creating the sets of stopwords for the languages used 
For this example the languages are:
- english
- german 
- dutch
- finnish
- italian
- swedish 
- french 
- greek 

They will still be divided as "english" and "not english"

In [29]:
languages = ['english', 'german', 'dutch', 'finnish', 'italian', 'swedish', 'french', 'greek']

In [30]:
stopwords_eng = set()
stopwords_not_eng = set()
for l in languages:
    if l == 'english':
        stopwords_eng.update(stopwords.words(l))
    else:
        stopwords_not_eng.update(stopwords.words(l))

## Creating the stemmer (PorterStemmer)

In [31]:
ps = PorterStemmer()

## Tokenizing, stemming and removing stopwords

In this section all the words in both english and not english bows will be tokenized and stemmed. From the resulting bows will be removed also the stopwords and then they will be merged together in order to create an heterogeneous (with relation to the language) bag of words.

In [32]:
all_words = []
documents = []

start = time.time()
for en in tqdm(eng.split('\n')):
    documents.append((en, 'eng'))
    words = word_tokenize(en)
    for w in words:
        if not w in stopwords_eng:
            all_words.append(ps.stem(w.lower()))

for ne in tqdm(not_eng.split('\n')):
    documents.append((ne, 'not eng'))
    words = word_tokenize(ne)
    for w in words:
        if w not in stopwords_not_eng:
            all_words.append(ps.stem(w.lower()))
end = time.time()

print(end- start)

100%|██████████| 9135/9135 [00:01<00:00, 4906.19it/s]
100%|██████████| 9637/9637 [00:01<00:00, 5022.50it/s]

3.7865872383117676





## Calculating the frequency distribution for the BOW and listing the first 8k features

In [33]:
all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:8000]

## Defining a function that extract the features from each document (all the words, tokenized)

In [34]:
def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features 

## Creating the dataset and shuffling it

In [35]:
featureset = [(find_features(doc), lang) for (doc, lang) in tqdm(documents)]
random.shuffle(featureset)
print(len(featureset))

100%|██████████| 18772/18772 [00:32<00:00, 577.50it/s]

18772





## Splitting (evenly) the dataset into training and testing data

In [36]:
dataset_len = len(featureset)

testing_set = featureset[(dataset_len//2):]
training_set = featureset[:(dataset_len//2)]

## Training the default Naïve Bayas Classifier on the training dataset

In [37]:
start = time.time()
classifier = nltk.NaiveBayesClassifier.train(training_set)
end = time.time()

print(end - start)

31.555356979370117


## Defining the function that will be used to test the classifier on text and files different from the testing ones

In [38]:
def test(text):
    feats = find_features(text)
    return classifier.classify(feats)

## Some tests on different languages text

In [39]:
text = "oggi è proprio una bella giornata"
text_eng = "today is such a beautiful day"
text_de = "Mein Name ist Anna. Ich komme aus Österreich und lebe seit drei Jahren in Deutschland. Ich bin 15Jahre alt und habe zwei Geschwister: Meine Schwester heißt Klara und ist 13 Jahre alt, mein BruderMichael ist 18 Jahre alt."
text_el = "Το όνομά μου είναι Άννα. Κατάγομαι από την Αυστρία και ζω στη Γερμανία εδώ και τρία χρόνια. Είμαι 15 χρονών και έχω δύο αδέρφια: Η αδερφή μου ονομάζεται Κλάρα και είναι 13 ετών, ο αδερφός μου ο Μιχαήλ είναι 18 ετών. Ζούμε με τους γονείς μας σε ένα σπίτι κοντά στο Μόναχο, η μητέρα μου είναι μαγείρισσα και ο πατέρας μου σε τράπεζα"
print(text + ': ' + test(text) + '\n')
print(text_eng + ': ' + test(text_eng) + '\n')
print(text_de + ': '+ test(text_de) + '\n')
print(text_el + ': '+ test(text_el) + '\n')

oggi è proprio una bella giornata: not eng

today is such a beautiful day: eng

Mein Name ist Anna. Ich komme aus Österreich und lebe seit drei Jahren in Deutschland. Ich bin 15Jahre alt und habe zwei Geschwister: Meine Schwester heißt Klara und ist 13 Jahre alt, mein BruderMichael ist 18 Jahre alt.: not eng

Το όνομά μου είναι Άννα. Κατάγομαι από την Αυστρία και ζω στη Γερμανία εδώ και τρία χρόνια. Είμαι 15 χρονών και έχω δύο αδέρφια: Η αδερφή μου ονομάζεται Κλάρα και είναι 13 ετών, ο αδερφός μου ο Μιχαήλ είναι 18 ετών. Ζούμε με τους γονείς μας σε ένα σπίτι κοντά στο Μόναχο, η μητέρα μου είναι μαγείρισσα και ο πατέρας μου σε τράπεζα: not eng



## Some tests on documents

In [40]:
file_eng = open("/Users/sebastianodarconso/Desktop/magistrale_lab/natural_language_processing/first_assingnment/english_doc.txt").read()
file_ita = open("/Users/sebastianodarconso/Desktop/magistrale_lab/natural_language_processing/first_assingnment/italian_doc.txt").read()

print("file in english: {}".format(test(file_eng)))
print("file in italian: {}".format(test(file_ita)))

file in english: eng
file in italian: not eng


## Displaying the confusion matrix with the testing set
The way to read the confusion matrix is that from all the 9386 samples in the test set, there were 4391 true positive and 4820 true negative cases. However it also predicted 175 false negative and zero false positive.

In [41]:
from nltk.metrics import ConfusionMatrix
from collections import defaultdict

ref = defaultdict(set)
testset = defaultdict(set)

labels = []
tests = []

for i, (feats, label) in enumerate(tqdm(testing_set)):
    ref[label].add(i)
    observed = classifier.classify(feats)
    testset[observed].add(i)
    labels.append(label)
    tests.append(observed)


cm = ConfusionMatrix(labels, tests)
print(cm)

100%|██████████| 9386/9386 [01:18<00:00, 119.52it/s]

        |         n |
        |         o |
        |         t |
        |           |
        |    e    e |
        |    n    n |
        |    g    g |
--------+-----------+
    eng |<4388> 172 |
not eng |    .<4826>|
--------+-----------+
(row = reference; col = test)






## Displaying the precision, recall and F-measure on the testing data
In this case I used the built-in functions of nltk but we can see the next outcomes as:
- Precision = $\frac{truePositive}{truePositive + falsePositive}$

- Recall = $\frac{truePositive}{truePositive + falseNegative}$

- F = $2 * \frac{precison * recall}{precision + recall}$

In [42]:
print(cm.evaluate())

    Tag | Prec.  | Recall | F-measure
--------+--------+--------+-----------
    eng | 1.0000 | 0.9623 | 0.9808
not eng | 0.9656 | 1.0000 | 0.9825



## Classifier accuracy and most informative features
The accuracy is obtained in the next way:
- Accuracy = $\frac{truePositive + trueNegative}{truePositive + trueNegative + falsePositive + falseNegative}$

In [43]:
classifier.show_most_informative_features(15)

Most Informative Features
                     the = True              eng : not en =   2386.0 : 1.0
                     and = True              eng : not en =    847.4 : 1.0
                      en = True           not en : eng    =    674.2 : 1.0
                      be = True              eng : not en =    419.2 : 1.0
                     die = True           not en : eng    =    376.9 : 1.0
                    will = True              eng : not en =    248.8 : 1.0
                      de = True           not en : eng    =    233.2 : 1.0
                       l = True           not en : eng    =    177.8 : 1.0
                      or = True              eng : not en =    158.1 : 1.0
                     den = True           not en : eng    =    144.9 : 1.0
                     but = True              eng : not en =    135.2 : 1.0
                      le = True           not en : eng    =    109.7 : 1.0
                      et = True           not en : eng    =     79.7 : 1.0

In [44]:
accuracy = nltk.classify.accuracy(classifier, training_set)
print("accuracy: {}".format(accuracy))

accuracy: 0.9831664180694651


# Employability as a probabilistic language model
Naïve Bayes relies on the Bayes algorithm and essentially assigns a class label to the sample based on the conditional probability class given by the features. 
This kind of classifier assumes that each feature makes an independent and equal contribution to the target class.
This works quite well in this example since we basically want the classifier to classify sentences/words/documents in just two classes: English and not english.
The English class can be seen as a Unigram model, therefore a probabilistic language model.
It may be not suitable in the case of multi-class classification but in this simple case it works fine.