# Import Libraries

In [2]:
import torch
import os
import pandas as pd
import numpy as np
from d2l import torch as d2l
from torch import nn
from sklearn.model_selection import train_test_split
import csv

# Load Dataset

Here, we load the training set into a pandas dataframe and display its first 5 rows.

In [3]:
# Read the Kaggle training data from train.json into a pandas dataframe
kaggle_data = pd.read_json('train.json')
# Display the first 5 rows
kaggle_data.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


# Split the dataset into train and test

In this section, the kaggle dataset is split into train and test variables.
The train set contains the last 90% of the data (6126 examples)
The test set contains the first 10% of the data (681 examples)

In [8]:
# Hold out the first 10% of the rows as the test set; the remaining 90% form the train set
test_size = round(len(kaggle_data)*0.1) 


# Positional (unshuffled) split: rows [0, test_size) are test, rows [test_size, end) are train
test_set = kaggle_data.iloc[:test_size]
train_set = kaggle_data.iloc[test_size:]


# Print the number of examples in each split (test first, then train)
print(len(test_set))
print(len(train_set))

681
6126


### Visually inspect the train and test set

In [9]:
# Display the first 5 rows of the train set
train_set.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
681,8989,Reflection - Mind Mapping\n\nChallenge\n\nI am...,"[Reflection, -, Mind, Mapping, \n\n, Challenge...","[True, True, True, False, False, False, False,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
682,8993,Visualization\n\nChallenge\n\nSince I am an ev...,"[Visualization, \n\n, Challenge, \n\n, Since, ...","[False, False, False, False, True, True, True,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
683,8995,Visualization\n\nChallenge\n\nWhen I worked as...,"[Visualization, \n\n, Challenge, \n\n, When, I...","[False, False, False, False, True, True, True,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
684,8996,Visualizations are one of the important parts ...,"[Visualizations, are, one, of, the, important,...","[True, True, True, True, True, True, True, Tru...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
685,8997,Reflection – MIND MAPPING\n\nChallenge & Selec...,"[Reflection, –, MIND, MAPPING, \n\n, Challenge...","[True, True, True, False, False, True, True, F...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [10]:
# Display the first 5 rows of the test set
test_set.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


# Extract labels and count them

The Most Frequent Class (MFC) baseline assigns the most common class in a dataset as the prediction for every example.
As such, we need to count the frequency of every class/label in the dataset. 

We used a dictionary to store the counts of each class.

In [11]:
# Count labels on the training split only, so that the baseline's notion of
# "most frequent class" is not informed by the held-out test examples
labels = train_set['labels']

# Dictionary mapping each label to the number of tokens that carry it
count = dict()

# Each entry holds the per-token labels of one document
for entry in labels:
    for label in entry:
        count[label] = count.get(label, 0) + 1

# Sort labels by frequency and print the 2 most frequent classes

The dictionary stores its entries in the order that they are added. 
As such, we need to sort the entries 
of the dictionary to get the first and second most frequent classes

In [12]:
# Sort the labels by count in ascending order, so the most frequent labels come last
sorted_counts = sorted(count, key=count.get)

# Get the most frequent class (last element of the ascending list)
most_frequent = sorted_counts[-1]

# Get the second most frequent class (second-to-last element)
second_frequent = sorted_counts[-2]

print(sorted_counts)
print(most_frequent)
print(second_frequent)

['I-URL_PERSONAL', 'I-ID_NUM', 'B-STREET_ADDRESS', 'B-USERNAME', 'B-PHONE_NUM', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'B-EMAIL', 'B-ID_NUM', 'B-URL_PERSONAL', 'I-NAME_STUDENT', 'B-NAME_STUDENT', 'O']
O
B-NAME_STUDENT


# Define model to predict most frequent class

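Here we define our MFC model. It takes 2 arguments: a dataframe containing the test dataset, and a string ('mfc' or 'nmfc') that selects whether the most frequent or the second most frequent class is predicted. The model writes 2 CSVs: submissions-{frequency}.csv and evaluation-{frequency}.csv (e.g. submissions-mfc.csv and evaluation-mfc.csv).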

In [13]:
def MFC(test_set, frequency):
    """ A baseline model that predicts one constant class for every token.

    Every token of every row in test_set receives the same predicted label:
    the module-level `most_frequent` when frequency is 'mfc', or the
    module-level `second_frequent` when frequency is 'nmfc'.

    Two CSV files are written, both with the header
    row_id, document, token, label:
      - submissions-{frequency}.csv holds the predicted label of each token
      - evaluation-{frequency}.csv holds the ground-truth label of each token

    @params test_set A pandas dataframe containing the test dataset. It is
        read through its 'document', 'tokens' and 'labels' columns;
        'tokens' and 'labels' are indexed per token.
    @params frequency Either 'mfc' (most frequent class) or 'nmfc'
        (second most frequent class)
    @raises ValueError if frequency is neither 'mfc' nor 'nmfc'
    @returns None
    """

    # Fail fast on an unknown mode instead of silently writing -1 as the label
    if frequency not in ('mfc', 'nmfc'):
        raise ValueError(
            f"frequency must be 'mfc' or 'nmfc', got {frequency!r}"
        )
    predicted_label = most_frequent if frequency == 'mfc' else second_frequent

    # Rows that will be written to the submission and evaluation files
    submission_predictions = []
    evaluations = []

    # row_id is a running token counter across all documents
    row_id = 0
    for _, row in test_set.iterrows():
        document = row['document']
        true_labels = row['labels']
        for token_idx in range(len(row['tokens'])):
            submission_predictions.append(
                [row_id, document, token_idx, predicted_label]
            )
            evaluations.append(
                [row_id, document, token_idx, true_labels[token_idx]]
            )
            row_id += 1
    print('Predictions complete')

    # Write both files with the same header so they can be compared row by row
    headings = ['row_id', 'document', 'token', 'label']
    with open(f"submissions-{frequency}.csv", 'w', newline='') as f, \
            open(f"evaluation-{frequency}.csv", 'w', newline='') as e:
        submission_writer = csv.writer(f)
        eval_writer = csv.writer(e)

        submission_writer.writerow(headings)
        eval_writer.writerow(headings)

        submission_writer.writerows(submission_predictions)
        eval_writer.writerows(evaluations)

    print(f"Files written to submissions-{frequency}.csv & evaluation-{frequency}.csv")
    return

# Perform predictions using most frequent class

In [14]:
# Write submissions-mfc.csv and evaluation-mfc.csv using the most frequent class
MFC(test_set,'mfc')      

Predictions complete
Files written to submissions-mfc.csv & evaluation-mfc.csv


# Perform predictions using the second most frequent class

In [15]:
# Write submissions-nmfc.csv and evaluation-nmfc.csv using the second most frequent class
MFC(test_set,'nmfc')      

Predictions complete
Files written to submissions-nmfc.csv & evaluation-nmfc.csv


# Confusion Matrix

A confusion matrix is a helpful tool when calculating evaluation statistics such as precision and recall. The confusion matrix shows the true negatives, true positives, false negatives and false positives for each class in the predictions.

Below, we define a function that generates the confusion matrix for the predicted data

In [37]:
def confusion_matrix(predictions_csv, evaluations_csv):
    """ Build the confusion matrix of predicted vs. ground-truth labels.

    Both CSV files are read through their 'label' column and compared
    row by row (row i of the predictions against row i of the evaluations).

    The result has one row and one column per known label. Rows are the
    predicted labels and columns are the ground-truth labels, so
    matrix.loc[p, g] is the number of tokens predicted as p whose true
    label is g. The matrix is also written to confusion_matrix.csv.

    @params predictions_csv Path of the CSV holding the predicted labels
    @params evaluations_csv Path of the CSV holding the ground-truth labels
    @raises ValueError if the two files have a different number of rows
    @raises KeyError if a label is not one of the known headings
    @returns A pandas dataframe of integer counts
    """
    # Read predictions and evaluations as dataframes
    predictions = pd.read_csv(predictions_csv)
    evaluations = pd.read_csv(evaluations_csv)

    # Specify heading names for columns and rows
    headings =['I-URL_PERSONAL', 'I-ID_NUM', 'B-STREET_ADDRESS', 
             'B-USERNAME', 'B-PHONE_NUM', 'I-PHONE_NUM', 'I-STREET_ADDRESS',
             'B-EMAIL', 'B-ID_NUM', 'B-URL_PERSONAL', 'I-NAME_STUDENT', 'B-NAME_STUDENT', 'O']

    # Use plain arrays so the comparison is positional, not index-aligned
    predicted = predictions['label'].to_numpy()
    actual = evaluations['label'].to_numpy()

    if len(predicted) != len(actual):
        raise ValueError(
            f"predictions has {len(predicted)} rows but evaluations has {len(actual)}"
        )

    # An unknown label would otherwise be dropped silently by the reindex below
    unknown = (set(predicted) | set(actual)) - set(headings)
    if unknown:
        raise KeyError(
            f"labels not in the confusion matrix headings: {sorted(map(str, unknown))}"
        )

    # Count every (predicted, actual) pair in one vectorised pass, then expand
    # to the full label grid so labels that never occur still get a zero row/column
    matrix = pd.crosstab(predicted, actual, rownames=['predicted'], colnames=['actual'])
    matrix = matrix.reindex(index=headings, columns=headings, fill_value=0).astype(int)

    # Clear the axis names so the CSV keeps an unnamed index column
    matrix.index.name = None
    matrix.columns.name = None

    # Write matrix to csv
    matrix.to_csv('confusion_matrix.csv', index=True)

    return matrix

# Build (and save) the confusion matrix for the most-frequent-class predictions
confusion_matrix('submissions-mfc.csv','evaluation-mfc.csv')
    

KeyboardInterrupt: 

### Precision

From our research, we determined that there are 2 methods for calculating Precision in multi-class scenarios: Macro Averaging and Micro Averaging.
We decided to go with a Macro averaging approach because it assigns equal weight to each class. Micro averaging, on the other hand, assigns equal weight to each prediction. Assigning equal weight to each prediction means that, on an imbalanced dataset, the overall precision is dominated by the most frequent classes. In other words, micro averaging does not really illustrate the model's ability to predict less prevalent classes.

As such, we went with macro averaging since it gives each class equal importance and thus illustrates the model's ability to predict classes that are not so frequent in the data.

In [None]:
def Precision(confusion_matrix):
    """ Compute the macro-averaged precision from a saved confusion matrix.

    The CSV is expected in the layout written by the confusion_matrix
    function of this notebook: the first column holds the row labels, rows
    are predicted labels and columns are ground-truth labels.

    For each class, precision = true positives / all tokens predicted as
    that class. A class that is never predicted but does occur in the
    ground truth gets precision 0. A class that occurs in neither the
    predictions nor the ground truth is left out of the average, so labels
    absent from the data do not pull the score down.

    Note: the parameter name shadows the confusion_matrix function inside
    this body; it is kept so existing callers still work.

    @params confusion_matrix Path of the confusion matrix CSV
    @returns The macro-averaged precision as a float (0.0 when the matrix
        contains no counts at all)
    """
    # Read the confusion matrix, using the first CSV column as the row labels
    matrix = pd.read_csv(confusion_matrix, index_col=0)

    per_class_precision = []
    for label in matrix.index:
        has_column = label in matrix.columns

        # Row sum: every token predicted as this class (TP + FP)
        predicted_total = matrix.loc[label].sum()
        # Column sum: every token whose true class is this one (TP + FN)
        actual_total = matrix[label].sum() if has_column else 0

        # Skip classes that do not appear in the data at all
        if predicted_total == 0 and actual_total == 0:
            continue

        true_positives = matrix.loc[label, label] if has_column else 0
        if predicted_total == 0:
            per_class_precision.append(0.0)
        else:
            per_class_precision.append(true_positives / predicted_total)

    if not per_class_precision:
        return 0.0

    # Macro average: every class contributes equally
    return float(sum(per_class_precision) / len(per_class_precision))
    
        
# Run Precision on the confusion matrix CSV written by confusion_matrix()
Precision('confusion_matrix.csv')                            

# b