In [1]:
# Load all libraries and variables
import os,sys  
import pandas as pd
import numpy as np
import re
import fasttext

data_path = './data/amazon_review_polarity_csv/'

In [2]:
def clean_train_dataset(dataframe, shuffle = False, encode_ascii = False, clean_strings = False, label_prefix = '__label__'):
    # Transform train file
    df = dataframe[['name','description']].apply(lambda x: x.str.replace(',',' '))
    df['class'] = label_prefix + dataframe['class'].astype(str) + ' '
    if clean_strings:
        df[['name','description']] = df[['name','description']].apply(lambda x: x.str.replace('"',''))
        df[['name','description']] = df[['name','description']].apply(lambda x: x.str.replace('\'',' \' '))
        df[['name','description']] = df[['name','description']].apply(lambda x: x.str.replace('.',' . '))
        df[['name','description']] = df[['name','description']].apply(lambda x: x.str.replace('(',' ( '))
        df[['name','description']] = df[['name','description']].apply(lambda x: x.str.replace(')',' ) '))
        df[['name','description']] = df[['name','description']].apply(lambda x: x.str.replace('!',' ! '))
        df[['name','description']] = df[['name','description']].apply(lambda x: x.str.replace('?',' ? '))
        df[['name','description']] = df[['name','description']].apply(lambda x: x.str.replace(':',' '))
        df[['name','description']] = df[['name','description']].apply(lambda x: x.str.replace(';',' '))
        df[['name','description']] = df[['name','description']].apply(lambda x: x.str.lower())
    if shuffle:
        df.sample(frac=1).reset_index(drop=True)
    if encode_ascii :
        df[['name','description']] = df[['name','description']].apply(lambda x: x.str.normalize('NFKD').str.encode('ascii','ignore').str.decode('utf-8'))
    df['name'] = ' ' + df['name'] + ' '
    df['description'] = ' ' + df['description'] + ' '
    return df

In [3]:
%%time
#Load train set
train_file = data_path + 'train.csv'
df_sentiment_train = pd.read_csv(train_file, header = None, names = ['class','name','description'])

#Load test set
test_file = data_path + 'test.csv'
df_sentiment_test = pd.read_csv(test_file, header = None, names = ['class','name','description'])

# Transform datasets
df_train_clean = clean_train_dataset(df_sentiment_train, True, False)
df_test_clean = clean_train_dataset(df_sentiment_test, False, False)

# Write cleaned files to disk
train_file_clean = data_path + 'amazon.train'
df_train_clean.to_csv(train_file_clean, header = None, index = False, columns = ['class','name','description'] )

test_file_clean = data_path + 'amazon.test'
df_test_clean.to_csv(test_file_clean, header = None, index = False, columns = ['class','name','description'] )

CPU times: user 1min 16s, sys: 4.47 s, total: 1min 20s
Wall time: 1min 29s


In [4]:
%%time
# Parameters
dim = 10
lr = 0.1
epoch = 5
min_count = 1
word_ngrams = 2
bucket = 10000000
thread = 12
label_prefix = '__label__'

# Train a classifier
output_file = data_path + 'amazon_model'
classifier = fasttext.supervised(train_file_clean, output_file, dim = dim, lr = lr, epoch = epoch, min_count = min_count, word_ngrams = word_ngrams, bucket = bucket, thread = thread, label_prefix = label_prefix)


CPU times: user 14min 47s, sys: 11.9 s, total: 14min 59s
Wall time: 5min 1s


In [5]:
%%time
# Evaluate classifier
result = classifier.test(test_file_clean)
print('Precision:', result.precision)
print('Recall:', result.recall)
print ('Number of examples:', result.nexamples)

('Precision:', 0.941815)
('Recall:', 0.941815)
('Number of examples:', 400000)
CPU times: user 9.74 s, sys: 84 ms, total: 9.82 s
Wall time: 9.87 s


In [18]:
%%time
# Load real-world validation dataset 
validation_dataset = pd.read_csv(data_path + 'sentences.csv', encoding="UTF-8")
# Preprocessing validation dataset
# Remove b' unicode from comment_message column
validation_dataset['comment_message'] = validation_dataset['comment_message'].apply(lambda x: x[1:])
# Remove emoji characters
validation_dataset[['comment_message']] = validation_dataset[['comment_message']].apply(lambda x: x.str.replace(r'\\x[0-9A-Fa-f]*',''))


class_dict={
    1:"Negative",
    2:"Positive"
}

validation_dataset['label'] = validation_dataset['comment_message'].apply(lambda x: classifier.predict_proba(x)[0][0][0])
validation_dataset['label'] = validation_dataset['label'].apply(lambda x: class_dict[int(x)])
validation_dataset['certainty'] = validation_dataset['comment_message'].apply(lambda x: classifier.predict_proba(x)[0][0][1])

CPU times: user 17.3 s, sys: 60 ms, total: 17.4 s
Wall time: 17.3 s


In [19]:
validation_dataset.head(10)

Unnamed: 0,comment_id,comment_message,label,certainty
0,1538183672909402_1541895645871538,'Thanks big fan here',Positive,0.880859
1,1540621789332257_1540637829330653,'Reminded of trika butterfly',Positive,0.880859
2,1540621789332257_1540640095997093,'BEST WISHES FOR CELEBRATING... 34 YEARS..OF T...,Positive,0.880859
3,1540621789332257_1540672129327223,'Congrats',Positive,0.880859
4,1540621789332257_1540753779319058,'Congrats',Positive,0.880859
5,1540621789332257_1540781092649660,'Congratulations',Positive,0.880859
6,1540621789332257_1541729175888185,'Congratulations Apollo hospital...',Positive,0.880859
7,1540621789332257_1541754942552275,'I will never forget the first class treatment...,Positive,0.880859
8,1540621789332257_1542346075826495,"""Best wishes..and I'm so happy that I was a pa...",Negative,0.998047
9,1538180652909704_1539762569418179,'Good Information which will help in curing pr...,Positive,0.880859


In [17]:
# writing output files to disk
validattion_dataset_output_file = data_path + 'sentences_result.csv'
validation_dataset.to_csv(validattion_dataset_output_file, header = None, index = False, columns = ['comment_id', 'comment_message','label','certainity'])