# Import Libraries

In [1]:
import torch
import os
import pandas as pd
import numpy as np
from d2l import torch as d2l
from torch import nn
from sklearn.model_selection import train_test_split
import csv

# Load Dataset

Here, we load the training set into a pandas dataframe and print the first 5 values

In [2]:
kaggle_data = pd.read_json('train.json')
kaggle_data.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


# Split the dataset into train and test

In this section, the kaggle dataset is split into train and test variables.
The train set contains the first 90% of the data (6126 examples)
The test set contains the last 10% of the data (681 examples)

In [16]:
# Split the first 90% of the data as the train set
test_size = round(len(kaggle_data)*0.1) 
train_size = len(kaggle_data) - test_size

# train_set, test_set = train_test_split(kaggle_data, test_size=len(kaggle_data)-train_size, random_state = False)
train_set = kaggle_data.iloc[:train_size]
test_set = kaggle_data.iloc[train_size:]

# train_set
test_size

681

### Visually inspect the train and test set

In [4]:
# Print the train set
train_set.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


In [5]:
# Print the test set
test_set.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
6126,21208,Learning Launch\n\nChallenge I am a UX/UI Lea...,"[Learning, Launch, \n\n, Challenge, , I, am, ...","[True, False, False, True, False, True, True, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6127,21209,Reflection – Mind Mapping\n\nChallenge\n\nI am...,"[Reflection, –, Mind, Mapping, \n\n, Challenge...","[True, True, True, False, False, False, False,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6128,21211,Example Reflection - Mind Mapping\n\nChallenge...,"[Example, Reflection, -, Mind, Mapping, \n\n, ...","[True, True, True, True, False, False, True, T...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6129,21213,Company General Use\n\nElements:\n\n1. Challen...,"[Company, General, Use, \n\n, Elements, :, \n\...","[True, True, False, False, False, False, False...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6130,21215,1 | P a g e\n\nExample Reflection – Visualizat...,"[1, |, P, a, g, e, \n\n, Example, Reflection, ...","[True, True, True, True, True, False, False, T...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


# Extract labels and count them

The Most Frequent Count (MFC) assigns the most common class in a dataset as the prediction for every example.
As such, we need to count the frequency of every class/label in the dataset. 

We used a dictionary to store the counts of each class.

In [6]:
# Assign labels to a variable
labels = kaggle_data['labels']

# Initialize a dictionary to count the labels
count = dict()

# Loop through labels and count them
for entry in labels:
    for label in entry:
        if label not in count:
            count[label]=1
        else:
            count[label]+=1

# Sort labels in descending order and print the 2 most frequent classes

The dictionary stores its entries in the order that they are added. 
As such, we need to sort the entries 
of the dictionary to get the first and second most frequent classes

In [7]:
# Sort the labels in descending order
sorted_counts = sorted(count, key=count.get)
sorted_counts

# Get the most frequent class
most_frequent = sorted_counts[-1]

# Get the second most frequent class
second_frequent = sorted_counts[-2]

print(most_frequent)
print(second_frequent)

O
B-NAME_STUDENT


# Define model to predict most frequent class

Here we define our MFC model. It takes 1 arguments: a dataframe containing the train dataset. The model returns 2 CSVs submissions-mfc.csv and evaluations-mfc.csv

In [35]:
def MFC(test_set, frequency):
    """ A model that predicts the second most frequent
    classification in the dataset for each example
    @params test_set A pandas dataframe containing the test dataset
    """

    # Initialize arrays to store evaluations and predictions
    submission_predictions = []
    evaluations = []
    
    # Make predictions on the test values
    count = 0
    for index, row in test_set.iterrows():
        for token in row['tokens']:
            for i in range(len(row['tokens'])):
                if count < 10:
                    # Initialize arrays that will be wrriten to the files
                    prediction = [-1 , -1, -1, -1]
                    evaluation = [-1 , -1, -1, -1]
                    
                    # Assign values to prediction and evaluation arrays 
                    prediction[0] = evaluation[0] = count # assign row id
                    prediction[1] = evaluation[1] = row['document']
                    prediction[2] = evaluation[2] = i # assign token number
                    if frequency == 'mfc':
                        prediction[3] = most_frequent # assign prediction
                    elif frequency == 'nmfc': 
                        prediction[3] = second_frequent # assign prediction
                    evaluation[3] = row['labels'][i]
                    print(prediction)

                    submission_predictions.append(prediction)
                    evaluations.append(evaluation)
                    count+=1
                else:
                    break       


  # Initialize file writers for submissions.csv and evaluations.csv
    with open(f"submissions-{frequency}.csv",'w',newline = '') as f:
        with open(f"evaluation-{frequency}.csv",'w',newline = '') as e:
        # Write column headings to files
            headings = ['row_id', 'document', 'token', 'label']
            submission_writer = csv.writer(f)
            eval_writer = csv.writer(e)
            
            submission_writer.writerow(headings)
            eval_writer.writerow(headings)  
            
             # Write the predictions to the files
            for i in range(len(submission_predictions)):
                submission_writer.writerow(submission_predictions[i])
                eval_writer.writerow(evaluations[i])
            
    print('File writes complete')
    return                      

# Perform predictions using most frequent class

In [36]:
MFC(test_set,'mfc')      

[0, 21208, 0, 'O']
[1, 21208, 1, 'O']
[2, 21208, 2, 'O']
[3, 21208, 3, 'O']
[4, 21208, 4, 'O']
[5, 21208, 5, 'O']
[6, 21208, 6, 'O']
[7, 21208, 7, 'O']
[8, 21208, 8, 'O']
[9, 21208, 9, 'O']
File writes complete


# Perform predictions using the second most frequent class

In [37]:
MFC(test_set,'nmfc')      

[0, 21208, 0, 'B-NAME_STUDENT']
[1, 21208, 1, 'B-NAME_STUDENT']
[2, 21208, 2, 'B-NAME_STUDENT']
[3, 21208, 3, 'B-NAME_STUDENT']
[4, 21208, 4, 'B-NAME_STUDENT']
[5, 21208, 5, 'B-NAME_STUDENT']
[6, 21208, 6, 'B-NAME_STUDENT']
[7, 21208, 7, 'B-NAME_STUDENT']
[8, 21208, 8, 'B-NAME_STUDENT']
[9, 21208, 9, 'B-NAME_STUDENT']
File writes complete


# Define functions for evaluation models

### Precision

From our research, we determined that there are 2 methods for calculating Precision in multi-class scenarios: Macro Averaging and Micro Averaging.
We decided to go with a Macro averaging approach because it assigns equal weights to each class. Micro averaging on the other hand, assigns equal weight to each prediction. Assigning equal weight to each prediction means that the overall precision is influenced greatly by datasets with dominant classes. In other words, micro averaging does not really illustrate the models ability to predict less prevalent classes. 

As such, we went with macro averaging since it gives each class equal importance and thus illustrates the models ability to predict classes that are not so frequent in the data.

In [46]:
def Precision(predictions_csv, evaluations_csv):
    predictions = pd.read_csv(predictions_csv)
    evaluations = pd.read_csv(evaluations_csv)

    # Initiailize counters for true and false positives
    true_positives = 0
    false_positives = 0
    
    
    # Calculate true positives and false positives
    for i in range(len(predictions)):
        if(predictions.iloc[i]['label'] == evaluations.iloc[i]['label']):
            true_positives+=1
        elif(predictions.iloc[i]['label'] 
    
        
Precision('submissions-mfc.csv','evaluation-mfc.csv')                            

O
O
O
O
O
O
O
O
O
O
