# Set up

## Package Loading

In [1]:
import numpy as np 
import pandas as pd 
!pip install fasttext
import fasttext
import bz2
import csv
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
import time
import seaborn as sns


[notice] A new release of pip available: 22.1.2 -> 22.2.2
[notice] To update, run: python.exe -m pip install --upgrade pip


## Set up paths

In [2]:
path_data = '../../../../data/external/Harmful/train_tweet.csv'
path_train = 'train_text.txt'
path_test = 'test_text.txt'
path_valid = 'valid_text.txt'

## Set up functions

In [3]:
def convert_labels(label):
  label = int(label)
  if label == 1:
    return '__label__1'
  else:
    return '__label__0'

#  Train Test split and Data Preparation

In [4]:
df = pd.read_csv(path_data)
df['text'] = df['tweet']
df.drop(columns=['tweet'], inplace = True)

df['label'] = df['label'].apply(convert_labels)
print(df.head())

train, test = train_test_split(df, test_size=0.2, random_state = 453, stratify=df['label'])
test, valid = train_test_split(test, test_size=0.5 , random_state = 453, stratify= test['label'])

   id       label                                               text
0   1  __label__0   @user when a father is dysfunctional and is s...
1   2  __label__0  @user @user thanks for #lyft credit i can't us...
2   3  __label__0                                bihday your majesty
3   4  __label__0  #model   i love u take with u all the time in ...
4   5  __label__0             factsguide: society now    #motivation


# Transformation to txt

In [5]:
df.drop(columns=['id'], inplace = True)
train.to_csv(path_train, index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")
valid.to_csv(path_valid, index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")
test_labels = []
test_list = []
for index, row in test.iterrows():
    test_list.append(row['text'])
    test_labels.append(row['label'])

# Modeling

In [6]:
model = fasttext.train_supervised(path_train, label_prefix='__label__', thread=4, epoch = 10)
#model = fasttext.train_supervised(input=path_train, autotuneValidationFile=path_valid, autotuneDuration=600)
print(model.labels, 'are the labels or targets the model is predicting')

['__label__0', '__label__1'] are the labels or targets the model is predicting


# Prediction

In [7]:
test_list = [w.replace('\n', '') for w in test_list]

# Use the predict function 
pred = model.predict(test_list)

# check the first record outputs
print(pred[0][0], 'is the predicted label')

['__label__0'] is the predicted label


In [8]:
labels = [0 if x.split(' ')[0] == '__label__0' else 1 for x in test_labels]
pred_labels = [0 if x == ['__label__0'] else 1 for x in pred[0]]

# run the accuracy measure. 
print(roc_auc_score(labels, pred_labels))
print(f1_score(labels, pred_labels, average='macro'))

0.7672292587963854
0.82430389489019
