# Set up

## Package Loading

In [5]:
import numpy as np 
import pandas as pd 
!pip install fasttext
import fasttext
import bz2
import csv
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from google.colab import drive
drive.mount('/content/drive')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Set up paths

In [6]:
# Parent paths
google_path = '/content/drive/My Drive/'
parent_path = google_path
folder_path = 'Web_Mining_Project/fasttext/3labels'

path_shortend_dataset = parent_path + "Web_Mining_Project/data/shortend10000_dataset.csv"
path_train = parent_path + folder_path + 'train_text.txt'
path_test = parent_path + folder_path + 'test_text.txt'
path_valid = parent_path + folder_path + 'valid_text.txt'

## Set up functions

In [7]:
# function for sentiment label creating 
def create_labels_from_rating(score):
  score = int(score)
  if score <= 2:
    return 0 #negative
  elif score == 3:
    return 1 #neutral
  else:
    return 2 #positive

def convert_labels(label):
  label = int(label)
  if label == 1:
    return '__label__1'
  elif label == 2:
    return '__label__2'
  else:
    return '__label__0'

#  Train Test split and Data Preparation

In [8]:
# prepare data 
df = pd.read_csv(path_shortend_dataset)
df['label'] = df['overall'].apply(create_labels_from_rating)
df['text'] = df['reviewText']
df.drop(columns=['overall', 'verified', 'reviewTime', 'reviewerID', 'asin', 'reviewerName', 'summary', 'unixReviewTime', 'vote', 'style', 'image'], inplace = True)
df['label'] = df['label'].apply(convert_labels)
print(df.head())

#train, test, valid
train, test = train_test_split(df, test_size=0.2, random_state = 453, stratify=df['label'])
test, valid = train_test_split(test, test_size=0.5 , random_state = 453, stratify= test['label'])

   Unnamed: 0                                         reviewText       label  \
0           0  This game is a bit hard to get the hang of, bu...  __label__2   
1           1  I played it a while but it was alright. The st...  __label__2   
2           2                                           ok game.  __label__1   
3           3  found the game a bit too complicated, not what...  __label__0   
4           4  great game, I love it and have played it since...  __label__2   

                                                text  
0  This game is a bit hard to get the hang of, bu...  
1  I played it a while but it was alright. The st...  
2                                           ok game.  
3  found the game a bit too complicated, not what...  
4  great game, I love it and have played it since...  


# Transformation to txt

In [9]:
#create needed txts
train.to_csv(path_train, index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")
valid.to_csv(path_valid, index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")
test_labels = []
test_text = []
for index, row in test.iterrows():
    test_text.append(row['text'])
    test_labels.append(row['label'])

# Modeling

In [10]:
# without hyperparameter optimization
#model = fasttext.train_supervised(path_train, label_prefix='__label__', thread=4, epoch = 10)

# with hyperparameter optimization
model = fasttext.train_supervised(input=path_train, autotuneValidationFile=path_valid, autotuneDuration=600)


# Prediction

In [11]:
test_text = [x.replace('\n', '') for x in test_text]

# Use the predict function 
prediction = model.predict(test_text)

# check the first record outputs
print(prediction[0][0], 'prediction for first item')

['__label__2'] prediction for first item


In [12]:
labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in test_labels]
prediction_labels = [0 if x == ['__label__1'] else 1 for x in prediction[0]]

# run the accuracy measure. 
#print(roc_auc_score(labels, pred_labels))
print(f1_score(labels, prediction_labels, average = 'weighted'))

0.8945935145392645
