# Set up

## Package Loading

In [2]:
import numpy as np 
import pandas as pd 
!pip install fasttext
import fasttext
import bz2
import csv
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from google.colab import drive
drive.mount('/content/drive')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[K     |████████████████████████████████| 68 kB 3.7 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.9.2-py2.py3-none-any.whl (213 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3144353 sha256=bfedd0aecba8d734bf361e3cdde252250c6d7b3d7d1667fa4f0cd8858eb05722
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f7641801a6597a29c8f4f19e38f9c02a345bab9b
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.9.2
Mounted at /content/drive


## Set up paths

In [7]:
# Parent paths
google_path = '/content/drive/My Drive/'
parent_path = google_path
folder_path = 'Web_Mining_Project/fasttext/2labels'

path_shortend_dataset = parent_path + "Web_Mining_Project/data/shortend10000_dataset.csv"
path_train = parent_path + folder_path + 'train_text.txt'
path_test = parent_path + folder_path + 'test_text.txt'
path_valid = parent_path + folder_path + 'valid_text.txt'

## Set up functions

In [8]:
# function for sentiment label creating 
def create_labels_from_rating(score):
  score = int(score)
  if score <= 2:
    return 0 #negative
  elif score == 3:
    return 1 #neutral
  else:
    return 2 #positive

def convert_labels(label):
  label = int(label)
  if label == 1:
    return '__label__1'
  elif label == 2:
    return '__label__2'
  else:
    return '__label__0'

#  Train Test split and Data Preparation

In [9]:
# prepare data 
df = pd.read_csv(path_shortend_dataset)
print(df.head())
df['label'] = df['overall'].apply(create_labels_from_rating)
df = df[df.label != 1]
df['text'] = df['reviewText']
df.drop(columns=['overall', 'verified', 'reviewTime', 'reviewerID', 'asin', 'reviewerName', 'summary', 'unixReviewTime', 'vote', 'style', 'image'], inplace = True)
df['label'] = df['label'].apply(convert_labels)
print(df.head())

#train, test, valid
train, test = train_test_split(df, test_size=0.2, random_state = 453, stratify=df['label'])
test, valid = train_test_split(test, test_size=0.5 , random_state = 453, stratify= test['label'])

   Unnamed: 0  overall  verified   reviewTime      reviewerID        asin  \
0           0      5.0      True  10 17, 2015  A1HP7NVNPFMA4N  0700026657   
1           1      4.0     False  07 27, 2015  A1JGAP0185YJI6  0700026657   
2           2      3.0      True  02 23, 2015  A1YJWEXHQBWK2B  0700026657   
3           3      2.0      True  02 20, 2015  A2204E1TH211HT  0700026657   
4           4      5.0      True  12 25, 2014  A2RF5B5H74JLPE  0700026657   

        reviewerName                                         reviewText  \
0        Ambrosia075  This game is a bit hard to get the hang of, bu...   
1             travis  I played it a while but it was alright. The st...   
2  Vincent G. Mezera                                           ok game.   
3         Grandma KR  found the game a bit too complicated, not what...   
4                jon  great game, I love it and have played it since...   

                                       summary  unixReviewTime  vote style  \
0       

# Transformation to txt

In [10]:
#create needed txts
train.to_csv(path_train, index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")
valid.to_csv(path_valid, index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")
test_labels = []
test_text = []
for index, row in test.iterrows():
    test_text.append(row['text'])
    test_labels.append(row['label'])

# Modeling

In [11]:
# without hyperparameter optimization
#model = fasttext.train_supervised(path_train, label_prefix='__label__', thread=4, epoch = 10)

# with hyperparameter optimization
model = fasttext.train_supervised(input=path_train, autotuneValidationFile=path_valid, autotuneDuration=600)


# Prediction

In [12]:
test_text = [x.replace('\n', '') for x in test_text]

# Use the predict function 
prediction = model.predict(test_text)

# check the first record outputs
print(prediction[0][0], 'prediction for first item')

['__label__2'] prediction for first item


In [13]:
labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in test_labels]
prediction_labels = [0 if x == ['__label__1'] else 1 for x in prediction[0]]

# run the accuracy measure. 
#print(roc_auc_score(labels, pred_labels))
print(f1_score(labels, prediction_labels, average = 'weighted'))

1.0
