## Assignment 2

In [373]:
import numpy as np
import pandas as pd
from collections import Counter
import functools
from sklearn.metrics import f1_score


In [119]:
# Reading the document
def read_documents(doc_file) -> tuple:
    docs = []
    labels = []
    with open(doc_file, encoding='utf-8') as f:
        for line in f:
            words = line.strip().split()
            docs.append(words[3:])
            labels.append(words[1])
    return (docs, labels)

In [172]:
# Dividing into training data and validation data
(all_docs, all_labels) = read_documents('reviews.txt')

# 80% of the data is used for training and 20% for validation.
split_point = int(0.80*len(all_docs))
train_docs = all_docs[:split_point]
train_labels = all_labels[:split_point]  
val_docs = all_docs[split_point:]
val_labels = all_labels[split_point:]
#set(train_labels)
#train_docs[0]
print(type(train_labels))

<class 'list'>


#### Estimating parameters for the Naive Bayes classifier


In [121]:
'''
 TODO for each word in each document, calculate the probability of that word 
 occuring in all different types of labeled documents. Then multiply that with
 the prior probability of some doc being under some label, and lastly divide that
 by the total amount of documents of a the particular label.
 start with alpha = 1, as smoothing.
'''

# Counts the frequency of each word per label
def count_words(documents, labels) -> dict:
    label_counts = {}
    for i in range(len(documents)):
        if labels[i] in label_counts.keys():
            label_counts[labels[i]].update(documents[i])
        else:
            label_counts.update({labels[i]:Counter(documents[i])})
    return label_counts

# Counts the total frequency of all words per label
def count_total_words(word_count) -> dict:
    words_per_label = {}
    for label in word_count.keys():
        words_per_label.update({label:0})
        for _ in word_count[label]:
            words_per_label[label] += 1
    return words_per_label


# Trains a Naive Bayes model with given data
def train_nb(documents, labels) -> dict:
    # The count of each word for each label
    label_counts = count_words(documents, labels)
    label_probs = {}
    # The total amount for each label
    label_word_freq = count_total_words(label_counts)
    for label in label_counts.keys():
        label_probs.update({label:{}})
        for word in label_counts[label]:
            word_count = label_counts[label][word]
            # Adding the probability (converted to logarithmic scale)
            label_probs[label].update(
                {word:word_count/label_word_freq[label]}
                )
    return label_probs

res = count_words(train_docs, train_labels);
print(res['pos']['you'])
#print([i for i in res['pos']])
tes = Counter(train_labels)
#print(tes)
#res['neg']['i']
train = train_nb(train_docs, train_labels)
train
#print(count_total_words(res))

5808


{'neg': {'i': 0.4915269196822595,
  'bought': 0.02018240659017358,
  'this': 0.28267137393350983,
  'album': 0.0215651662253604,
  'because': 0.03480435422182995,
  'loved': 0.0036775522212415417,
  'the': 0.9389232127096204,
  'title': 0.003236245954692557,
  'song': 0.010767872903795234,
  '.': 0.9026772580170639,
  'it': 0.36298911444542514,
  "'s": 0.1403059723448073,
  'such': 0.013327449249779346,
  'a': 0.4370991468078847,
  'great': 0.024242424242424242,
  ',': 0.7747278611356281,
  'how': 0.027684613121506326,
  'bad': 0.020859076198882023,
  'can': 0.04054133568696675,
  'rest': 0.005619299794057076,
  'of': 0.3793762871432774,
  'be': 0.09067372756693146,
  'right': 0.010797293321565166,
  '?': 0.04554280670785525,
  'well': 0.023889379229185055,
  'songs': 0.012297734627831715,
  'are': 0.10205942924389526,
  'just': 0.059458664313033247,
  'filler': 0.00088261253309797,
  'and': 0.4698440717858194,
  "n't": 0.12280082377169756,
  'worth': 0.0088261253309797,
  'money': 0.0

#### Classifying new documents

In [234]:
# Calculates the score of a document given a label and a trained naive bayes model
def score_doc_label(document, label, word_count, word_probs) -> float:
    # Adding smoothing -> 1/(total words for label)
    word_scores = []
    total_words = word_count[label]
    for word in document:
        if word not in word_probs[label].keys():
            word_scores.append(1/total_words)
        else:
            word_scores.append(word_probs[label][word])
    # Previous score
    #doc_score = functools.reduce(lambda x, y: x*y, word_scores)
    likelihoods = functools.reduce(lambda x, y: x*y, word_scores)
    prior = total_words/sum(word_count.values())
    doc_score = likelihoods*prior

    return doc_score

word_count = count_total_words(count_words(train_docs, train_labels))
res = score_doc_label(['a', 'top-quality', 'performance'], 'neg', word_count, train)


# Classifies the document given a trained naive bayes model
# Returns a tuple containing the classification and the score
def classify_nb(document, word_count, word_probs) -> tuple:
    result_label = ''
    # Set to smallest possible value, such that log probs works
    maximum = float('-inf')
    for label in word_probs.keys():
        score = score_doc_label(document, label, word_count, word_probs)
        if score >= maximum:
            maximum = score
            result_label = label
    return (result_label, maximum)

(result, score) = classify_nb(['a', 'top-quality', 'performance'], word_count, train)
result

'pos'

#### Evaluating the classifier

In [266]:
# Classifies a set of documents (validation data) from a trained naive bayes model
def classify_documents(docs, word_count, word_probs) -> list:
    results = []
    for doc in docs:
        # Appends the label of the classification
        results.append(classify_nb(doc, word_count, word_probs)[0])
    return results

# Computes the accuracy of a given classification result
def accuracy(true_labels, predicted_labels) -> float:
    correct = 0
    total = len(true_labels)
    for i in range(total):
        if true_labels[i] == predicted_labels[i]:
            correct += 1
    return correct / total

# Can be used to calculate true positives, true negatives,
# false positives and false negatives
def calc_metric(true_labels, predicted_labels, label, bin_op1, bin_op2) -> int:
    res = 0
    for i in range(len(true_labels)):
        comparison1 = bin_op1(true_labels[i], label)
        comparison2 = bin_op2(predicted_labels[i], label)
        if comparison1 and comparison2:
            res += 1
    return res

# Calculates the f1 score given true outcomes, predicted outcomes and true label
def calc_f1_score(t_labels, p_labels, t_label) -> float:
    true = lambda x, y:x==y
    false = lambda x, y:x!=y
    tp = calc_metric(t_labels, p_labels, t_label, true, true)
    fn = calc_metric(t_labels, p_labels, t_label, true, false)
    fp = calc_metric(t_labels, p_labels, t_label, false, true)

    # To avoid division by zero, in this case just return 0
    if ((fp+tp) == 0 or tp+fn == 0):
        return 0
    precision = tp / (fp+tp)
    recall = tp / (tp+fn)
    score = 2 * ((precision*recall) / (precision+recall))
    return score

predictions = classify_documents(val_docs, word_count, train)
print('Accuracy:', accuracy(val_labels, predictions))
print('Computed f1_score:', calc_f1_score(val_labels, predictions, 'pos'))

Accuracy: 0.7058329836340747
Computed f1_score: 0.6880284824210057
Scikit-learn f1_score: 0.6880284824210057


**Discussion about the differences between accuracy score and f1 score**

*Accuracy* is easy to interpert, the statistic is obvious in what it stands for; x% accuracy means that x% was correctly classified. But it lacks in taking the distribution into consideration. 

This is where the *f1-score* compliments the accuracy score. The f1-score will provide a better assesment of inbalanced data, it also considers the impact of having a false negative, which in many cases is highly relevant, for instance in medical measurments. 
The f1-score could be viewed as more strict and a more broad measurment, however, it is harder to interpret than the accuracy.

#### Error analysis

In [169]:
# Finds the miss-classified documents and stores relevant information about them in a dict
def find_miss_classifications(true_labels, val_docs, nb) -> list:
    result = []
    word_count = count_total_words(count_words(val_docs, true_labels))
    for i in range(len(true_labels)):
        (label, score) = classify_nb(val_docs[i], word_count, nb)
        if true_labels[i] != label:
            result.append({
                    'Expected Label': true_labels[i], 
                    'Predicted label': label,
                    'Score': np.log(score),
                    'Document': val_docs[i]
                    })
    return result

# Finds the worst classifications, the ones who are wrong and have a relative high probability
# given the amount to output and the classifications 
def find_worst_predictions(amount, predictions) -> list:
    worst_predictions = []
    for prediction in predictions:
        if len(worst_predictions) >= amount:
            # Sort the list of dictionaries by score, lowest last (since it is the least bad of the bunch)
            worst_predictions.sort(key=lambda x: x['Score'], reverse=True)
            if prediction['Score'] > worst_predictions[-1]['Score']:
                # Remove the best and replace it with a worse one
                worst_predictions.pop()  
                worst_predictions.append(prediction)
        else:
            worst_predictions.append(prediction)
    return worst_predictions

miss_classifications = find_miss_classifications(val_labels, val_docs, train)
worst_predictions = find_worst_predictions(5, miss_classifications)
for i in range(len(worst_predictions)):
    print(f'{i + 1}.\n{worst_predictions[i]} \n')



1.
{'Expected Label': 'pos', 'Predicted label': 'neg', 'Score': -9.335209354022052, 'Document': ['goo']} 

2.
{'Expected Label': 'neg', 'Predicted label': 'pos', 'Score': -20.054215050028432, 'Document': ['it', "'s", 'not', 'great', 'music', 'to', 'dance', 'to']} 

3.
{'Expected Label': 'pos', 'Predicted label': 'neg', 'Score': -45.096324008938005, 'Document': ['this', 'was', 'not', 'a', 'waste', 'of', 'brad', "'s", 'time', 'or', 'mine', '.', 'good', 'job', '!']} 

4.
{'Expected Label': 'neg', 'Predicted label': 'pos', 'Score': -47.060936230416246, 'Document': ['this', 'lifts', 'your', 'heart', 'to', 'god', 'as', 'you', 'listen', '.', 'beautifully', 'done']} 

5.
{'Expected Label': 'pos', 'Predicted label': 'neg', 'Score': -48.91165086129675, 'Document': ['you', 'will', 'never', 'look', 'at', 'these', 'birds', 'the', 'same', 'again', '-', 'fansinating']} 



  'Score': np.log(score),


**Notes on the error analysis**

From the functions above and its output, we can identify the five worst classifications, i.e. the miss-classifications with highest score, from top to bottom.
The scores are displayed on a logarithmic scale.

The worst classification is in some sense not relevant, since it shouldn't be viewed as a meaningful review, given its only one word and not a real word at that. The relative high score is due to the few words.
The other ones show that the model isn't ideal, however, one could argue that they're not obvious if you view them as seperate words. 
This points at the flaw of the Naive Bayes model; no context is taken into consideration, every word is viewed as independent. This leads to some seemingly very obvious classifications of the reviews resulting in miss-classifications.
I think that the fifth review above is a good example of a review that's pretty obviously positive, but no words in isolation indicates that this is more of a positive review than a negative one, thus the classification could either be negative or positive, in our case negative. 
If some type of context was considered, in a more involved model, the fifth review would proabably not be miss-classified.

#### Cross-validation

**10-fold cross validation**

In [264]:
'''
TODO for N iterations, in our case 10, split the in two distinct chunks 10 times over,
check the score for each result and combine them.
It's only going to validate a chunk at a time, so when the iteration is done, one can 
combine all the results and validate it for all the true labels, not only the validation true labels
'''

def n_fold_cross_validation(n: int, docs: list, labels: list):
    # To store all the predictions
    results = []

    for fold_nbr in range(n):
        split_point_1 = int(float(fold_nbr)/n*len(docs)) 
        split_point_2 = int(float(fold_nbr+1)/n*len(docs)) 

        train_docs_fold = docs[:split_point_1] + docs[split_point_2:]
        train_labels_fold = labels[:split_point_2] + labels[split_point_2:]
        val_docs_fold = docs[split_point_1:split_point_2]

        # Train the classifier
        word_probs = train_nb(train_docs_fold, train_labels_fold)
        word_count = count_total_words(count_words(train_docs_fold, train_labels_fold))
        # Apply the classifier to val_docs_fold
        predictions = classify_documents(val_docs_fold, word_count, word_probs)
        results.append(predictions)

    # Flatten the array
    results = sum(results, []) 

    return results

# 10 fold cross validation
predictions = n_fold_cross_validation(10, all_docs, all_labels)
print('Metrics for 10 fold cross validation')
print('Accuracy:', accuracy(all_labels, predictions))
print('Computed f1_score:', calc_f1_score(all_labels, predictions, 'pos'))
print('Scikit-learn f1_score:', f1_score(all_labels, predictions, pos_label='pos'))




Metrics for 10 fold cross validation
Accuracy: 0.6432768171898606
Computed f1_score: 0.6791484221651819
Scikit-learn f1_score: 0.6791484221651819


**Leave-one-out cross validation**

In [188]:
# Leave-one-out cross validation
# This one takes approximately 52 minutes to compute on my hardware
predictions = n_fold_cross_validation(len(all_docs), all_docs, all_labels)
print('Metrics for Leave-one-out cross validation')
print('Accuracy:', accuracy(all_labels, predictions))
print('Computed f1_score:', calc_f1_score(all_labels, predictions, 'pos'))
print('Scikit-learn f1_score:', f1_score(all_labels, predictions, pos_label='pos'))


11914
11914
Metrics for Leave-one-out cross validation
Accuracy: 0.6569582004364614
Computed f1_score: 0.651547446500128
Scikit-learn f1_score: 0.651547446500128


**Difference between 10-fold cross validation and Leave-one-out cross validation**

The difference between the two validation methods lies mainly in how big the data set is. In a world with endlessly fast computational methods Leave-one-out cross validation would almost always be prefered, since it trains the data on every single data point, but more importantly tests each data point one at a time. However, for very large data sets, using Leave-one-out cross validation isn't realistic and perhaps not even neccessary given the time it takes and the accuarcy one can archieve using only k-fold cross validation. In our case, 10-fold cross validation is a sufficient validation metric, since it gives a very close score to Leave-one-out cross validation, in a fraction of the time. In the model for the Iris data set however, Leave-one-out cross validation could be prefered, since it wouldn't take much time to run it, due to the relatively small data set (150 data points) and the fact that it validates each and every data point. 

#### Naive Bayes for numerical data

In [392]:
flower_data = pd.read_csv("iris.csv")
flower_X = flower_data.iloc[:,:-1].to_numpy() #flower_data.drop('species', axis=1)
flower_Y = flower_data['species'].to_numpy()

'''
Then when you predict you assume that they follow a gaussian distribution, with the given parameters.
Features: sepal length (x1), sepal_width (x2), petal_length (x3), petal_width (x4)
Likelihood: P(X = [x1,x2,x3,x4] | Y = y) (uses PDF)
Prior: P(Y = y)
'''
def train_gnb(X, labels) -> dict:
    label_freqs = Counter(labels) 
    label_values = {}
    vector_len = len(X[0])
    i = 0

    # Compute statistic for each X for each label
    for label in label_freqs.keys():
        frequency = label_freqs[label]
        # Split X into the chunk of the specific label
        label_X = X[i:i+frequency]        
        label_values.update({label:{}})
        # Compute statistic for each column for each label 
        for j in range(vector_len):
            x_mean = np.mean(label_X[j])
            x_std = np.std(label_X[j])
            label_values[label].update({f'x{j}':{
                'mean':x_mean,
                'std':x_std
            }})
        i += frequency

    return label_values

def score_label(values, label, frequencies, model):
    likelihoods = []
    for i in range(len(values)):
        mean = model[label][f'x{i}']['mean']
        std = model[label][f'x{i}']['std']
        #This could be incorrect
        likelihood = (1 / (np.sqrt(2 * np.pi) * std)) *  np.exp(-((values[i]-mean)**2 / (2 * std**2 )))
        likelihoods.append(likelihood)
    
    prior = frequencies[label] / sum(frequencies.values())
    score = np.log(functools.reduce(lambda x, y: x*y, likelihoods) * prior)

    return score
        

def classify_gnb(x, frequencies, model):
    result_label = ''
    maximum = float('-inf')
    for label in model.keys():
        score = score_label(x, label, frequencies, model)
        if score >= maximum:
            maximum = score
            result_label = label
    return (result_label, score)

# Classifies iris data (validation data) from a trained Gaussian naive bayes model
def classify_all(X, frequencies, model) -> list:
    results = []
    for x in X:
        # Appends the label of the classification
        results.append(classify_gnb(x, frequencies, model)[0])
    return results
        
r = train_gnb(flower_X, flower_Y)
print(r)
classify_gnb([5.1,3.5,1.4,0.2], Counter(flower_Y), r)


{'setosa': {'x0': {'mean': 2.55, 'std': 1.8874586088176872}, 'x1': {'mean': 2.375, 'std': 1.7640507362318127}, 'x2': {'mean': 2.35, 'std': 1.7298843892006195}, 'x3': {'mean': 2.3499999999999996, 'std': 1.6560495161679194}}, 'versicolor': {'x0': {'mean': 4.074999999999999, 'std': 2.0535031044534606}, 'x1': {'mean': 3.9000000000000004, 'std': 1.7930421077041108}, 'x2': {'mean': 4.1, 'std': 2.0149441679609885}, 'x3': {'mean': 3.2750000000000004, 'std': 1.6068213964221414}}, 'virginica': {'x0': {'mean': 4.525, 'std': 1.6528384676065595}, 'x1': {'mean': 3.875, 'std': 1.6192204914711275}, 'x2': {'mean': 4.525, 'std': 2.044963324854507}, 'x3': {'mean': 4.1499999999999995, 'std': 1.8580904176062045}}}


('setosa', -10.608272345678214)

In [394]:
# Validating the gaussian nb model
def n_fold_cross_validation_nb(N: int, data):
    # To store all the predictions
    results = []
    #print(random.shuffle(data.to_numpy()))
    
    docs = list(data.iloc[:,:-1].to_numpy())
    labels = list(data['species'].to_numpy())

    #print(pd.DataFrame(random.shuffle(data.to_numpy())))

    for fold_nbr in range(N):
        split_point_1 = int(float(fold_nbr)/N*len(docs))
        split_point_2 = int(float(fold_nbr+1)/N*len(docs))

        train_docs_fold = docs[:split_point_1] + docs[split_point_2:]
        train_labels_fold = labels[:split_point_1] + labels[split_point_2:]
        val_docs_fold = docs[split_point_1:split_point_2]
       

        # Train the classifier
        model = train_gnb(train_docs_fold, train_labels_fold)
        frequencies = Counter(labels) 
        # Apply the classifier to val_docs_fold
        predictions = classify_all(val_docs_fold, frequencies, model)
        results.append(predictions)

    # Flatten the array
    results = sum(results, []) 

    return results

#flower_data = flower_data.sample(frac=1)


predictions = n_fold_cross_validation_nb(10, flower_data)
print('Metrics for 10 fold cross validation')
print('Accuracy:', accuracy(flower_Y, predictions))
print('F1_score for setosa:', calc_f1_score(flower_Y, predictions, 'setosa'))
print('F1_score for versicolor:', calc_f1_score(flower_Y, predictions, 'versicolor'))
print('F1_score for virginica:', calc_f1_score(flower_Y, predictions, 'virginica'))
    

Metrics for 10 fold cross validation
Accuracy: 0.6
F1_score for setosa: 0.9523809523809523
F1_score for versicolor: 0.5384615384615384
F1_score for virginica: 0.15384615384615383


**Discussion about the results**
As shown by the f1-scores the model performs very well on 'setosa', worse on 'versicolor' and very bad on 'virginica'. This is probably due to some error leading the model to start with great predictions, but increasingly bad when the category jumps from the first to the second, and the second to the third. I suspect that it has something to do with the data being ordered, and the model being biased to the order in which it classifies, but I tried to randomise the order and it didn't produce a better result, so I suspect that I've missed something in my implementation of the model. Unfortunately, due to the time, I wasn't able to find this error. That being said, the model works well initially, but has a drastic decrease in performance when trying to classify the other two species