In [1]:
from collections import defaultdict
from random import random, seed

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from constants import LABEL_COLS
from functions import load_bad_words, build_data_path, print_report

In [2]:
# Location of the training CSV, resolved by the project helper
# `build_data_path` (imported from `functions`; how it builds the path is
# not visible in this notebook).
training_data_path = build_data_path('train.csv')

In [3]:
# Read the training CSV, then separate the raw comment text (model input)
# from the label columns listed in LABEL_COLS (targets).
df = pd.read_csv(training_data_path)
X, y = df['comment_text'], df[LABEL_COLS]

In [4]:
# Hold out a third of the data for validation. The split is seeded so that
# Restart & Run All reproduces the same partition instead of a new one on
# every execution.
SPLIT_SEED = 42
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.33, random_state=SPLIT_SEED
)

# Column-wise sum of the training labels: one value per label column
# (the number of positive examples, assuming 0/1 indicators - TODO confirm).
label_counts = y_train.sum()

# Grand total over all label columns.
total = label_counts.sum()

# Each label's share of that grand total; labels not present default to 0.
# NOTE(review): this is normalised by the total of all label counts, not by
# the number of training rows, so it is the label's share among positive
# labels rather than its per-example positive rate - confirm that this is
# the intended sampling probability for the random baseline.
label_freqs = defaultdict(int, (label_counts / total).items())

In [5]:
def predict(X_values, freqs=None, labels=None):
    """Build two naive baseline prediction sets for a batch of examples.

    The content of each example is ignored; only the number of examples
    determines the output.

    Parameters
    ----------
    X_values : iterable
        Examples to predict for. It is iterated exactly once.
    freqs : mapping, optional
        Maps each label to the probability of predicting 1 for it.
        Defaults to the notebook-level ``label_freqs``.
    labels : sequence, optional
        Labels to predict, in output column order. Defaults to
        ``LABEL_COLS``.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(2, n_examples, n_labels)``. Index 0 holds
        the random predictions (1 where a fresh ``random()`` draw falls below
        the label's frequency), index 1 holds the all-ones predictions. The
        result can be unpacked as ``random_preds, always_true = predict(X)``.
    """
    # Fall back to the notebook globals only when the caller passes nothing,
    # so existing `predict(X_valid)` calls keep working unchanged.
    if freqs is None:
        freqs = label_freqs
    if labels is None:
        labels = LABEL_COLS
    labels = list(labels)

    # The per-label threshold does not depend on the example: look it up once.
    thresholds = [freqs[key] for key in labels]

    # One independent draw per (example, label), in the same order as a
    # nested example-then-label loop.
    random_rows = [
        [1 if random() < threshold else 0 for threshold in thresholds]
        for _ in X_values
    ]
    # Sized from random_rows so a one-shot iterator is not consumed twice.
    always_true_rows = [[1] * len(labels) for _ in random_rows]

    # Explicit dtype and reshape keep the (2, n, n_labels) layout even when
    # there are no examples (a bare np.array would give a float (2, 0) array).
    return np.array([random_rows, always_true_rows], dtype=int).reshape(
        2, len(random_rows), len(labels)
    )

In [6]:
# predict() draws from the stdlib `random` generator; seed it here so that
# re-running this cell (or the whole notebook) yields the same random
# baseline instead of a different report every time.
BASELINE_SEED = 42
seed(BASELINE_SEED)

random_predictions, always_true = predict(X_valid)

# Print one report per baseline, each followed by a blank line.
for title, predictions in (
    ('RANDOM ASSIGNMENT', random_predictions),
    ('ALWAYS TRUE ASSIGNMENT', always_true),
):
    print(title)
    print_report(y_valid, predictions)
    print()

RANDOM ASSIGNMENT
VALIDATION RESULTS:

               precision    recall  f1-score   support

        toxic       0.10      0.44      0.16      5159
 severe_toxic       0.01      0.03      0.01       584
      obscene       0.05      0.24      0.09      2855
       threat       0.00      0.01      0.00       147
       insult       0.05      0.22      0.08      2725
identity_hate       0.01      0.03      0.01       471

  avg / total       0.07      0.30      0.11     11941


ALWAYS TRUE ASSIGNMENT
VALIDATION RESULTS:

               precision    recall  f1-score   support

        toxic       0.10      1.00      0.18      5159
 severe_toxic       0.01      1.00      0.02       584
      obscene       0.05      1.00      0.10      2855
       threat       0.00      1.00      0.01       147
       insult       0.05      1.00      0.10      2725
identity_hate       0.01      1.00      0.02       471

  avg / total       0.07      1.00      0.13     11941


