In [1]:
# Import
import pandas as pd
import nltk
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import BertTokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report as clsr
from sklearn.metrics import confusion_matrix, cohen_kappa_score, plot_confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.manifold import TSNE

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Tokenizers already loaded in this kernel, keyed by pretrained model name.
_TOKENIZER_CACHE = {}


def _get_tokenizer(name='bert-base-uncased'):
    """Return the BertTokenizer for `name`, loading it at most once per kernel."""
    if name not in _TOKENIZER_CACHE:
        _TOKENIZER_CACHE[name] = BertTokenizer.from_pretrained(name)
    return _TOKENIZER_CACHE[name]


def tokenize(text):
    """Encode one string into a single-example batch of tensors.

    Parameters
    ----------
    text : str
        The text to encode.

    Returns
    -------
    dict
        ``{"input_ids": tensor, "attention_mask": tensor}``; each tensor has a
        leading batch dimension of 1. ``encode`` is asked to add special tokens
        and to truncate to ``max_length=128``.
    """
    # Loading the tokenizer is expensive; reuse one instance instead of calling
    # from_pretrained on every invocation (this function is applied row by row).
    tokenizer = _get_tokenizer()

    input_id = tokenizer.encode(text,
                                add_special_tokens=True,
                                max_length=128,
                                truncation=True)
    # Mask is 1 for every token id > 0. No padding is requested above, so this
    # is presumably all ones unless the encoder emits id 0 -- TODO confirm.
    att_mask = [int(token_id > 0) for token_id in input_id]

    return {"input_ids": torch.tensor([input_id]),
            "attention_mask": torch.tensor([att_mask])}

def predict(text):
    """Return the index of the highest-scoring class for a single text.

    Uses the module-level ``model1`` (switched to eval mode on every call) and
    the ``tokenize`` helper; the forward pass runs without gradient tracking.
    The returned value is the first element of a NumPy argmax result.
    """
    model1.eval()
    with torch.no_grad():
        encoded = tokenize(text)
        result = model1(token_type_ids=None, **encoded)

    # First element of the model output is treated as the logits.
    scores = result[0].detach().cpu().numpy()
    best_per_row = np.argmax(scores, axis=1).flatten()
    return best_per_row[0]

In [3]:
# Load the saved model onto the CPU.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code -- only load "trained_model.pt" if it comes from a trusted source.
model1 = torch.load("trained_model.pt", map_location="cpu")

In [4]:
# Sanity check: encode one sample string and display the resulting batch.
sample_text = "'risk creat last problem share unemploy jobless gt;number month rise number% februari number% novemb rise sharpli peopl lose contact labor market lose connect skill hope http t co qoznumbergwnumberxrh'"
tokenize(sample_text)

{'input_ids': tensor([[  101,  1005,  3891, 13675,  5243,  2102,  2197,  3291,  3745, 16655,
           8737,  4135,  2100,  3105,  3238, 14181,  1025,  2193,  3204,  4125,
           2193,  1003, 13114,  6820,  8486,  2193,  1003, 13292,  6633,  2497,
           4125,  4629,  3669, 21877,  7361,  2140,  4558,  3967,  4450,  3006,
           4558,  7532,  8066,  3246,  8299,  1056,  2522,  1053, 18153, 19172,
           4059,  7962, 29440,  2595, 25032,  1005,   102]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [5]:
# Read the dataset and pull out the two columns used below.
df = pd.read_csv('all_clean.csv')
text, usdx_mean_label = df['text'], df['usdx_mean_label']

In [6]:
# Build the label <-> integer-index lookup tables.
# The distinct labels are sorted so the mapping is the same on every kernel
# restart; iterating a bare set has no guaranteed order (and for string labels
# it changes between interpreter runs because of hash randomization).
# NOTE(review): the indices must match the mapping the saved model was trained
# with -- confirm against the training notebook.
label_list = sorted(set(usdx_mean_label))
label_to_index = {label: i for i, label in enumerate(label_list)}
index_to_label = dict(enumerate(label_list))

In [7]:
# Translate every raw label into its integer index (KeyError on unknown labels).
labels = list(map(label_to_index.__getitem__, usdx_mean_label))

In [8]:
# Hold out 20% of the rows as the test set; random_state is fixed so the
# split is repeatable.
train_text, test_text, train_labels, test_labels = train_test_split(
    df['text'],
    labels,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Test texts as a frame (the original row index is kept as an 'index' column)
# with the label indices attached positionally.
test_df = test_text.reset_index().assign(labels=test_labels)

In [10]:
# Predict only for the first 100 test rows. Slicing *before* .apply avoids
# running the model on every row and then discarding all but 100 results.
# The assignment aligns on the index, so rows beyond the first 100 get NaN
# (which is why the column is displayed as float).
test_df['predict_label'] = test_df['text'].iloc[:100].apply(predict)

In [11]:
# Rows with label index 0 that were also predicted as 0.
test_df.loc[test_df['labels'].eq(0) & test_df["predict_label"].eq(0)]

Unnamed: 0,index,text,labels,predict_label
3,5776,good read find glut offic build http t co znum...,0,0.0
5,3821,fix incom govern bond yield leg number year yi...,0,0.0
15,6357,retreat risk capit push price risk market bond...,0,0.0
17,5014,go withdraw cash discov atm card expir decemb ...,0,0.0
26,5733,induct rang curiou need bougi trend will switch,0,0.0
38,3793,thank charliebilello share chart earli tweet i...,0,0.0
41,469,pmi close number indic stabil normal right dif...,0,0.0
58,2763,invest quot day http t co dntrygnumberqnumberu,0,0.0
75,2091,datum consist strong stagflationari wind consu...,0,0.0
77,1615,strang write tic tri train economist tend use ...,0,0.0


In [12]:
# Every row with label index 1, regardless of prediction.
test_df.loc[test_df['labels'].eq(1)]

Unnamed: 0,index,text,labels,predict_label
0,1188,risk creat last problem share unemploy jobless...,1,1.0
2,247,like go http t co numberfjigpkzt,1,0.0
4,2836,differenti growth rate good import export rise...,1,0.0
7,1859,covid relat hospit nytim deltavari http t co h...,1,1.0
8,4446,look like amaz sport total buy season ticket h...,1,0.0
...,...,...,...,...
1331,881,effici mean public health advic direct trust e...,1,
1333,1253,econom crisi disproportion hurt poor extraordi...,1,
1335,2167,number initi jobless claim fall rel prior week...,1,
1336,3728,http t co fwnumbereikjvyx,1,


In [13]:
# Rows with label index 1 that were also predicted as 1.
test_df.loc[test_df['labels'].eq(1) & test_df["predict_label"].eq(1)]

Unnamed: 0,index,text,labels,predict_label
0,1188,risk creat last problem share unemploy jobless...,1,1.0
7,1859,covid relat hospit nytim deltavari http t co h...,1,1.0
9,217,mean thing account account glutton punish lond...,1,1.0
10,5369,earn save invest john wesley,1,1.0
14,1593,compani exposur countri risk come oper lazi am...,1,1.0
18,473,sureshkarur vote googl sharehold worth vote no...,1,1.0
27,2432,gm notabl market develop overnight includ reco...,1,1.0
28,3666,wsj inflat polit narr ahead elect season http ...,1,1.0
29,1703,consist compani signal includ hike wage offer ...,1,1.0
30,4773,regard news wide expect trigger cross default ...,1,1.0


In [14]:
# Rows with label index 0 that were predicted as 1.
test_df.loc[test_df['labels'].eq(0) & test_df["predict_label"].eq(1)]

Unnamed: 0,index,text,labels,predict_label
1,5711,hous random vacanc number tie hous jeopardi st...,0,1.0
6,334,look forward talk valuat number day seminar wa...,0,1.0
13,4169,lot econom polit social interest today cpi inf...,0,1.0
16,3825,exactli work econ number class take preominant...,0,1.0
19,3360,young un want add addit tomorrow sday http t c...,0,1.0
20,743,power vest cdc withdraw earli guidanc beer o'c...,0,1.0
22,2534,good morn wish good week mix econom activ datu...,0,1.0
24,6224,turkey central bank continu defianc econom fin...,0,1.0
33,5015,headlin cpi rise number number% june tad marke...,0,1.0
34,3023,speak rise yield number year govern bond germa...,0,1.0


In [15]:
# Rows with label index 1 that were predicted as 0.
test_df.loc[test_df['labels'].eq(1) & test_df["predict_label"].eq(0)]

Unnamed: 0,index,text,labels,predict_label
2,247,like go http t co numberfjigpkzt,1,0.0
4,2836,differenti growth rate good import export rise...,1,0.0
8,4446,look like amaz sport total buy season ticket h...,1,0.0
11,3767,esg utopian believ problem esg mi measur misus...,1,0.0
12,5963,number year ago crystal ball reveal russian de...,1,0.0
21,5048,think elon regret buy associ internet meme weirdo,1,0.0
23,4271,headlin today dailymailuk ft articl novemb inf...,1,0.0
25,6577,number weekend read http t co qaenumberznumber...,1,0.0
32,5477,new laguardia massiv improv decad ago http t c...,1,0.0
43,3662,spectacular view fli laguardia night http t co...,1,0.0
