In [45]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup, GPT2Tokenizer, RobertaTokenizer, XLMTokenizer, XLMModel, XLNetTokenizer, XLNetModel, SqueezeBertTokenizer, SqueezeBertModel
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report

In [46]:
# Training-data preparation starts here.

In [47]:
# Read the preprocessed training tweets and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="train_preprocessed.csv")
df.head(n=5)

Unnamed: 0,id,tweet,label
0,1,cdc currently reports NUM deaths general discr...,real
1,2,states reported NUM deaths small rise last tue...,real
2,3,politically correct woman almost uses pandemic...,fake
3,4,NUM testing laboratories india NUMth august NU...,real
4,5,populous states generate large case counts loo...,real


In [48]:
# Load the pre-trained SqueezeBERT tokenizer. Only the tokenizer is created here;
# no model object is instantiated in this cell.
tokenizer = SqueezeBertTokenizer.from_pretrained('squeezebert/squeezebert-mnli-headless')

In [49]:
# Sanity-check the tokenizer on a single example sentence.
sample_txt = 'When was I last outside? I am stuck at home for 2 weeks.'
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Show the sentence, its tokens and the matching vocabulary ids, one per line.
for report_line in (
    f' Sentence: {sample_txt}',
    f'   Tokens: {tokens}',
    f'Token IDs: {token_ids}',
):
    print(report_line)

# Cell result: both lengths, which should agree (one id per token).
(len(tokens), len(token_ids))

 Sentence: When was I last outside? I am stuck at home for 2 weeks.
   Tokens: ['when', 'was', 'i', 'last', 'outside', '?', 'i', 'am', 'stuck', 'at', 'home', 'for', '2', 'weeks', '.']
Token IDs: [2043, 2001, 1045, 2197, 2648, 1029, 1045, 2572, 5881, 2012, 2188, 2005, 1016, 3134, 1012]


(15, 15)

In [50]:
# Build the Xtrain feature matrix: one fixed-width row of token ids per training tweet.
MAX_LEN = 2859  # row width; presumably the longest sequence seen in the data -- TODO confirm
PAD_ID = 0      # filler id appended to rows shorter than MAX_LEN

fv = []
for txt in df.tweet:
    # Tokenize and map tokens to vocabulary ids. Anything beyond MAX_LEN is cut so
    # that an over-long tweet cannot yield a row wider than the others (a negative
    # pad count would silently add nothing and leave the matrix ragged).
    ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(txt))[:MAX_LEN]
    fv.append(ids + [PAD_ID] * (MAX_LEN - len(ids)))

In [51]:
# Encode the Ytrain labels for the training data: 'fake' -> 0, 'real' -> 1.
label_ids = df.label.map({'fake': 0, 'real': 1})

# Values outside the mapping become NaN. Fail loudly rather than letting a raw
# string (or NaN) slip into the target list next to the integer classes.
unmapped = label_ids.isna()
if unmapped.any():
    raise ValueError(
        f"Unexpected label value(s) in training data: {df.label[unmapped].unique().tolist()}"
    )

# Plain Python ints, one per row, in the original row order.
labels = label_ids.astype(int).tolist()

In [52]:
# At this point:
# Xtrain = fv      (one padded row of token ids per training tweet)
# Ytrain = labels  (0 = fake, 1 = real)

In [53]:
# Read the preprocessed validation tweets and preview the first five rows.
df_val = pd.read_csv(filepath_or_buffer="validation_preprocessed.csv")
df_val.head(n=5)

Unnamed: 0,id,tweet,label
0,1,chinese converting islam realising muslim affe...,fake
1,2,NUM NUM people diamond princess cruise ship in...,fake
2,3,covid NUM caused bacterium virus treated aspirin,fake
3,4,mike pence rnc speech praises donald trump cov...,fake
4,5,NUM NUM sky explains latest data government an...,real


In [54]:
# Build the Xval feature matrix: one fixed-width row of token ids per validation tweet.
MAX_LEN = 2859  # row width; presumably the longest sequence seen in the data -- TODO confirm
PAD_ID = 0      # filler id appended to rows shorter than MAX_LEN

fv_val = []
for txt in df_val.tweet:
    # Tokenize and map tokens to vocabulary ids. Anything beyond MAX_LEN is cut so
    # that an over-long tweet cannot yield a row wider than the others (a negative
    # pad count would silently add nothing and leave the matrix ragged).
    ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(txt))[:MAX_LEN]
    fv_val.append(ids + [PAD_ID] * (MAX_LEN - len(ids)))

In [55]:
# Encode the Yval labels for the validation data: 'fake' -> 0, 'real' -> 1.
label_ids_val = df_val.label.map({'fake': 0, 'real': 1})

# Values outside the mapping become NaN. Fail loudly rather than letting a raw
# string (or NaN) slip into the target list next to the integer classes.
unmapped_val = label_ids_val.isna()
if unmapped_val.any():
    raise ValueError(
        f"Unexpected label value(s) in validation data: {df_val.label[unmapped_val].unique().tolist()}"
    )

# Plain Python ints, one per row, in the original row order.
labels_val = label_ids_val.astype(int).tolist()

In [56]:
# At this point:
# Xval = fv_val      (one padded row of token ids per validation tweet)
# Yval = labels_val  (0 = fake, 1 = real)

In [57]:
# Read the preprocessed test tweets and preview the first five rows.
df_test = pd.read_csv(filepath_or_buffer="test_data_preprocessed.csv")
df_test.head(n=5)

Unnamed: 0,id,tweet
0,1,daily update published states reported NUMk te...
1,2,alfalfa cure covid NUM
2,3,president trump asked would catch coronavirus URL
3,4,states reported NUM deaths still seeing solid ...
4,5,sixth time global health emergency declared in...


In [58]:
# Build the Xtest feature matrix: one fixed-width row of token ids per test tweet.
MAX_LEN = 2859  # row width; presumably the longest sequence seen in the data -- TODO confirm
PAD_ID = 0      # filler id appended to rows shorter than MAX_LEN

fv_test = []
for txt in df_test.tweet:
    # Tokenize and map tokens to vocabulary ids. Anything beyond MAX_LEN is cut so
    # that an over-long tweet cannot yield a row wider than the others (a negative
    # pad count would silently add nothing and leave the matrix ragged).
    ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(txt))[:MAX_LEN]
    fv_test.append(ids + [PAD_ID] * (MAX_LEN - len(ids)))

In [59]:
# Sanity check: every row of every feature matrix must hold exactly 2859 entries;
# a single row of a different length would make the matrix non-rectangular.
# All three splits are checked, and a violation stops the run instead of only printing.
for split_name, rows in (("train", fv), ("validation", fv_val), ("test", fv_test)):
    bad_rows = [i for i, row in enumerate(rows) if len(row) != 2859]
    assert not bad_rows, (
        f"{split_name}: {len(bad_rows)} row(s) with wrong length, first at index {bad_rows[0]}"
    )

In [60]:
# Fit a random forest on the training token-id matrix and score it on the validation split.

from sklearn.ensemble import RandomForestClassifier
# A fixed random_state makes the forest (and therefore the report below) repeatable
# from run to run; without it every execution yields slightly different metrics.
rf = RandomForestClassifier(random_state=42)
rf = rf.fit(fv, labels)
y_pred = rf.predict(fv_val)
# Per-class precision / recall / F1 on the validation labels (0 = fake, 1 = real).
print(classification_report(labels_val, y_pred))

# NOTE: the alternative classifiers below are disabled. As written they score against
# `labels_test` and, in places, `y_pred1`, neither of which is defined in the cells shown
# here (the test CSV preview has no label column), so they need fixing before re-enabling.

# from sklearn.ensemble import AdaBoostClassifier
# clf = AdaBoostClassifier()
# clf = clf.fit(fv, labels)
# y_pred = clf.predict(fv_test)
# print(classification_report(labels_test, y_pred))

# from xgboost import XGBClassifier
# xgb = XGBClassifier()
# xgb = xgb.fit(np.array(fv), labels)
# y_pred3 = xgb.predict(np.array(fv_test))
# print(classification_report(labels_test, y_pred1))

# from sklearn.neighbors import KNeighborsClassifier
# clf = KNeighborsClassifier(n_neighbors=3)
# clf = clf.fit(fv, labels)
# y_predict = clf.predict(fv_test)
# print(classification_report(labels_test, y_pred1))

# from sklearn import svm
# clf = svm.SVC()
# clf = clf.fit(fv, labels)
# y_predict = clf.predict(fv_test)
# print(classification_report(labels_test, y_pred1))

              precision    recall  f1-score   support

           0       0.79      0.69      0.74      1019
           1       0.75      0.83      0.79      1120

    accuracy                           0.77      2139
   macro avg       0.77      0.76      0.76      2139
weighted avg       0.77      0.77      0.77      2139



In [24]:
# Persist the training feature matrix as CSV. The entries are integer token ids, so
# write them with an integer format instead of savetxt's default float notation.
np.savetxt("fv_train_squeezeBERT.csv", fv, fmt="%d", delimiter=",")

In [25]:
# Persist the validation feature matrix as CSV. The entries are integer token ids, so
# write them with an integer format instead of savetxt's default float notation.
np.savetxt("fv_val_squeezeBERT.csv", fv_val, fmt="%d", delimiter=",")

In [26]:
# Persist the test feature matrix as CSV. The entries are integer token ids, so
# write them with an integer format instead of savetxt's default float notation.
np.savetxt("fv_test_squeezeBERT.csv", fv_test, fmt="%d", delimiter=",")