# FastText Pre-Trained Model Testing

In [1]:
# data manipulation
import pandas as pd
from sklearn.model_selection import train_test_split # train-test split

# model
import fasttext

# model metrics
from sklearn.metrics import f1_score, precision_score, recall_score 

## Data Pre-Processing
### Conventional and Cryptonews Data

In [2]:
# load the conventional-news / cryptonews sample (first row is the header)
sample_crypto = pd.read_csv("data/sample_crypto.csv", header=0)


def _strip_newlines(value):
    """Return `value` with newline characters removed.

    Non-string values (e.g. NaN for a missing title or excerpt) are returned
    unchanged so that the `fillna('')` / `dropna` calls below decide how
    missing data is handled.
    """
    return value.replace("\n", "") if isinstance(value, str) else value


# remove newline characters from both text columns; a missing title no longer
# raises here, matching the null-tolerant fillna('') used when building "text"
# NOTE(review): newlines are deleted, not replaced by a space, so the words on
# either side of a line break are joined — kept as-is to leave results unchanged
sample_crypto["title"] = sample_crypto["title"].map(_strip_newlines)
sample_crypto["excerpt"] = sample_crypto["excerpt"].map(_strip_newlines)

# combine title and excerpt; a missing part contributes an empty string
sample_crypto["text"] = sample_crypto["title"].fillna('') + " " + sample_crypto["excerpt"].fillna('')

# title/text view keeps every row; excerpt view keeps only rows that have an excerpt
sample_crypto_text = sample_crypto[["title", "text", "label"]]
sample_crypto_excerpt = sample_crypto[["excerpt", "label"]].dropna(subset=["excerpt"])

# separate X and y
# X_crypto_text is a DataFrame (columns "title" and "text");
# X_crypto_excerpt is a Series of excerpt strings
X_crypto_text = sample_crypto_text[["title", "text"]]
y_crypto_text = sample_crypto_text["label"]
X_crypto_excerpt = sample_crypto_excerpt["excerpt"]
y_crypto_excerpt = sample_crypto_excerpt["label"]

### Reddit Data

In [3]:
# load the reddit sample, keeping only the columns used below
sample_reddit = pd.read_csv("data/sample_reddit.csv", header=0)[["title", "excerpt", "label"]]

# build "text" = title + " " + excerpt (missing parts become ''), then drop
# newline characters from the combined string
sample_reddit = sample_reddit.assign(
    text=lambda posts: (
        posts["title"].fillna('') + " " + posts["excerpt"].fillna('')
    ).map(lambda post: post.replace("\n", ""))
)

# features (combined text) and target (label)
X_reddit, y_reddit = sample_reddit["text"], sample_reddit["label"]

### Twitter Data

In [4]:
# load the twitter sample, keeping only the tweet text and its label
sample_twitter = pd.read_csv("data/sample_twitter.csv", header=0)[["text", "label"]]

# drop newline characters from every tweet
sample_twitter = sample_twitter.assign(
    text=lambda tweets: tweets["text"].map(lambda tweet: tweet.replace("\n", ""))
)

# features (tweet text) and target (label)
X_twitter, y_twitter = sample_twitter["text"], sample_twitter["label"]

### Combined Data

In [5]:
# stack the three sources into one evaluation set
# - every frame is reduced to the same two columns before stacking
# - ignore_index=True renumbers the rows 0..n-1; without it each source keeps
#   its own row labels, so the combined frame would contain repeated index values
sample_combined = pd.concat(
    [
        sample_crypto[["text", "label"]],
        sample_reddit[["text", "label"]],
        sample_twitter[["text", "label"]],
    ],
    ignore_index=True,
)

# separate X and y
X_combined = sample_combined["text"]
y_combined = sample_combined["label"]

## Amazon Polarity Review Model

In [6]:
# load the pre-trained fastText classifier stored in the Amazon review polarity
# model file (path is relative to the notebook's working directory)
model_amazon = fasttext.load_model("utils/fasttext/amazon_review_polarity.ftz")



### Conventional and Cryptonews Data
#### Title Only

In [7]:
# predict(...)[0][0] is the first predicted label; only its last character is kept
# (presumably the class digit of a "__label__<n>" string — TODO confirm label format)
title_classes = [model_amazon.predict(title)[0][0][-1] for title in X_crypto_text["title"]]
# class '1' is treated as risk (1), anything else as 0
y_crypto_title_pred = [int(cls == '1') for cls in title_classes]

# binary precision / recall / f1 with label 1 as the positive class
print("metrics")
for metric_name, metric in (("precision", precision_score), ("recall", recall_score), ("f1", f1_score)):
    print(f"{metric_name} score:", metric(y_crypto_text, y_crypto_title_pred, average="binary", pos_label=1))

metrics
precision score: 0.20654044750430292
recall score: 0.6798866855524079
f1 score: 0.3168316831683168


#### Excerpt Only

In [8]:
# X_crypto_excerpt is already the Series of excerpt strings, so iterate it directly
# (attribute access `.excerpt` on a Series raises AttributeError on a fresh kernel)
# predict(...)[0][0] is the first predicted label; only its last character is kept
y_crypto_excerpt_pred = [model_amazon.predict(x)[0][0][-1] for x in X_crypto_excerpt]
y_crypto_excerpt_pred = [1 if (x=='1') else 0 for x in y_crypto_excerpt_pred] # assign to 1 if risk, 0 otherwise

# binary precision / recall / f1 with label 1 as the positive class
print("metrics")
print("precision score:", precision_score(y_crypto_excerpt, y_crypto_excerpt_pred, average="binary", pos_label=1))
print("recall score:", recall_score(y_crypto_excerpt, y_crypto_excerpt_pred, average="binary", pos_label=1))
print("f1 score:", f1_score(y_crypto_excerpt, y_crypto_excerpt_pred, average="binary", pos_label=1))

metrics
precision score: 0.27906976744186046
recall score: 0.5944272445820433
f1 score: 0.37982195845697325


#### Title and Excerpt Combined

In [9]:
# predict(...)[0][0] is the first predicted label; only its last character is kept
# (presumably the class digit of a "__label__<n>" string — TODO confirm label format)
text_classes = [model_amazon.predict(text)[0][0][-1] for text in X_crypto_text["text"]]
# class '1' is treated as risk (1), anything else as 0
y_crypto_text_pred = [int(cls == '1') for cls in text_classes]

# binary precision / recall / f1 with label 1 as the positive class
print("metrics")
for metric_name, metric in (("precision", precision_score), ("recall", recall_score), ("f1", f1_score)):
    print(f"{metric_name} score:", metric(y_crypto_text, y_crypto_text_pred, average="binary", pos_label=1))

metrics
precision score: 0.2784313725490196
recall score: 0.603399433427762
f1 score: 0.3810375670840787


### Reddit Data

In [10]:
# predict(...)[0][0] is the first predicted label; only its last character is kept
# (presumably the class digit of a "__label__<n>" string — TODO confirm label format)
reddit_classes = [model_amazon.predict(post)[0][0][-1] for post in X_reddit]
# class '1' is treated as risk (1), anything else as 0
y_reddit_pred = [int(cls == '1') for cls in reddit_classes]

# binary precision / recall / f1 with label 1 as the positive class
print("metrics")
for metric_name, metric in (("precision", precision_score), ("recall", recall_score), ("f1", f1_score)):
    print(f"{metric_name} score:", metric(y_reddit, y_reddit_pred, average="binary", pos_label=1))

metrics
precision score: 0.4281263907432132
recall score: 0.8166383701188455
f1 score: 0.5617518248175183


### Twitter Data

In [11]:
# predict(...)[0][0] is the first predicted label; only its last character is kept
# (presumably the class digit of a "__label__<n>" string — TODO confirm label format)
tweet_classes = [model_amazon.predict(tweet)[0][0][-1] for tweet in X_twitter]
# class '1' is treated as risk (1), anything else as 0
y_twitter_pred = [int(cls == '1') for cls in tweet_classes]

# binary precision / recall / f1 with label 1 as the positive class
print("metrics")
for metric_name, metric in (("precision", precision_score), ("recall", recall_score), ("f1", f1_score)):
    print(f"{metric_name} score:", metric(y_twitter, y_twitter_pred, average="binary", pos_label=1))

metrics
precision score: 0.5852713178294574
recall score: 0.7365853658536585
f1 score: 0.6522678185745141


### Combined Data

In [12]:
# predict(...)[0][0] is the first predicted label; only its last character is kept
# (presumably the class digit of a "__label__<n>" string — TODO confirm label format)
combined_classes = [model_amazon.predict(document)[0][0][-1] for document in X_combined]
# class '1' is treated as risk (1), anything else as 0
y_combined_pred = [int(cls == '1') for cls in combined_classes]

# binary precision / recall / f1 with label 1 as the positive class
print("metrics")
for metric_name, metric in (("precision", precision_score), ("recall", recall_score), ("f1", f1_score)):
    print(f"{metric_name} score:", metric(y_combined, y_combined_pred, average="binary", pos_label=1))

metrics
precision score: 0.4055045871559633
recall score: 0.7638248847926268
f1 score: 0.5297642828605673


## Yelp Polarity Review Model 

In [13]:
# load the pre-trained fastText classifier stored in the Yelp review polarity
# model file (path is relative to the notebook's working directory)
model_yelp = fasttext.load_model("utils/fasttext/yelp_review_polarity.ftz")



### Conventional and Cryptonews Data
#### Title Only

In [14]:
# predict(...)[0][0] is the first predicted label; only its last character is kept
# (presumably the class digit of a "__label__<n>" string — TODO confirm label format)
title_classes = [model_yelp.predict(title)[0][0][-1] for title in X_crypto_text["title"]]
# class '1' is treated as risk (1), anything else as 0
y_crypto_title_pred = [int(cls == '1') for cls in title_classes]

# binary precision / recall / f1 with label 1 as the positive class
print("metrics")
for metric_name, metric in (("precision", precision_score), ("recall", recall_score), ("f1", f1_score)):
    print(f"{metric_name} score:", metric(y_crypto_text, y_crypto_title_pred, average="binary", pos_label=1))

metrics
precision score: 0.1981981981981982
recall score: 0.9348441926345609
f1 score: 0.3270564915758176


#### Excerpt Only

In [15]:
# X_crypto_excerpt is already the Series of excerpt strings, so iterate it directly
# (attribute access `.excerpt` on a Series raises AttributeError on a fresh kernel)
# predict(...)[0][0] is the first predicted label; only its last character is kept
y_crypto_excerpt_pred = [model_yelp.predict(x)[0][0][-1] for x in X_crypto_excerpt]
y_crypto_excerpt_pred = [1 if (x=='1') else 0 for x in y_crypto_excerpt_pred] # assign to 1 if risk, 0 otherwise

# binary precision / recall / f1 with label 1 as the positive class
print("metrics")
print("precision score:", precision_score(y_crypto_excerpt, y_crypto_excerpt_pred, average="binary", pos_label=1))
print("recall score:", recall_score(y_crypto_excerpt, y_crypto_excerpt_pred, average="binary", pos_label=1))
print("f1 score:", f1_score(y_crypto_excerpt, y_crypto_excerpt_pred, average="binary", pos_label=1))

metrics
precision score: 0.2357581069237511
recall score: 0.8328173374613003
f1 score: 0.36748633879781417


#### Title and Excerpt Combined

In [16]:
# predict(...)[0][0] is the first predicted label; only its last character is kept
# (presumably the class digit of a "__label__<n>" string — TODO confirm label format)
text_classes = [model_yelp.predict(text)[0][0][-1] for text in X_crypto_text["text"]]
# class '1' is treated as risk (1), anything else as 0
y_crypto_text_pred = [int(cls == '1') for cls in text_classes]

# binary precision / recall / f1 with label 1 as the positive class
print("metrics")
for metric_name, metric in (("precision", precision_score), ("recall", recall_score), ("f1", f1_score)):
    print(f"{metric_name} score:", metric(y_crypto_text, y_crypto_text_pred, average="binary", pos_label=1))

metrics
precision score: 0.23101018010963195
recall score: 0.8356940509915014
f1 score: 0.36196319018404904


### Reddit Data

In [17]:
# predict(...)[0][0] is the first predicted label; only its last character is kept
# (presumably the class digit of a "__label__<n>" string — TODO confirm label format)
reddit_classes = [model_yelp.predict(post)[0][0][-1] for post in X_reddit]
# class '1' is treated as risk (1), anything else as 0
y_reddit_pred = [int(cls == '1') for cls in reddit_classes]

# binary precision / recall / f1 with label 1 as the positive class
print("metrics")
for metric_name, metric in (("precision", precision_score), ("recall", recall_score), ("f1", f1_score)):
    print(f"{metric_name} score:", metric(y_reddit, y_reddit_pred, average="binary", pos_label=1))

metrics
precision score: 0.3744697261858851
recall score: 0.8242784380305602
f1 score: 0.514982763192787


### Twitter Data

In [18]:
# predict(...)[0][0] is the first predicted label; only its last character is kept
# (presumably the class digit of a "__label__<n>" string — TODO confirm label format)
tweet_classes = [model_yelp.predict(tweet)[0][0][-1] for tweet in X_twitter]
# class '1' is treated as risk (1), anything else as 0
y_twitter_pred = [int(cls == '1') for cls in tweet_classes]

# binary precision / recall / f1 with label 1 as the positive class
print("metrics")
for metric_name, metric in (("precision", precision_score), ("recall", recall_score), ("f1", f1_score)):
    print(f"{metric_name} score:", metric(y_twitter, y_twitter_pred, average="binary", pos_label=1))

metrics
precision score: 0.571875
recall score: 0.8926829268292683
f1 score: 0.6971428571428571


### Combined Data

In [19]:
# predict(...)[0][0] is the first predicted label; only its last character is kept
# (presumably the class digit of a "__label__<n>" string — TODO confirm label format)
combined_classes = [model_yelp.predict(document)[0][0][-1] for document in X_combined]
# class '1' is treated as risk (1), anything else as 0
y_combined_pred = [int(cls == '1') for cls in combined_classes]

# binary precision / recall / f1 with label 1 as the positive class
print("metrics")
for metric_name, metric in (("precision", precision_score), ("recall", recall_score), ("f1", f1_score)):
    print(f"{metric_name} score:", metric(y_combined, y_combined_pred, average="binary", pos_label=1))

metrics
precision score: 0.3458233890214797
recall score: 0.8346774193548387
f1 score: 0.48903138710766114
