# Module imports

In [None]:
# Mount Google Drive into the Colab runtime; the data files and the *_grid.py
# helper scripts used below are read from the mounted MyDrive folder.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 6.7MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 26.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 38.5MB/s 
Installing collected packages: tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


In [None]:
import json
import warnings
from collections import Counter

import numpy as np
import pandas as pd
import torch
import transformers as ppb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC,LinearSVC

warnings.filterwarnings('ignore')

In [None]:
# reference: https://machinelearningmastery.com/combine-oversampling-and-undersampling-for-imbalanced-classification/
# reference: https://imbalanced-learn.org/stable/auto_examples/combine/plot_comparison_combine.html#sphx-glr-auto-examples-combine-plot-comparison-combine-py
# the imports for dealing with the imbalance dataset
import re
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours 
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from numpy import mean
# Never truncate cell contents when a DataFrame/Series is displayed,
# so that whole tweets are visible in the previews below.
pd.set_option('display.max_colwidth', None)

In [None]:
# customed package for exploring and cleaning the tweet data
!pip install git+https://github.com/laxmimerit/preprocess_kgptalkie.git --upgrade --force-reinstall
import preprocess_kgptalkie as kgp

Collecting git+https://github.com/laxmimerit/preprocess_kgptalkie.git
  Cloning https://github.com/laxmimerit/preprocess_kgptalkie.git to /tmp/pip-req-build-nbjjzkc6
  Running command git clone -q https://github.com/laxmimerit/preprocess_kgptalkie.git /tmp/pip-req-build-nbjjzkc6
Building wheels for collected packages: preprocess-kgptalkie
  Building wheel for preprocess-kgptalkie (setup.py) ... [?25l[?25hdone
  Created wheel for preprocess-kgptalkie: filename=preprocess_kgptalkie-0.1.3-cp37-none-any.whl size=11743 sha256=42208973264e65ffba99b381ec49215d96b5b2ea6b1a693a73cc5e932940c590
  Stored in directory: /tmp/pip-ephem-wheel-cache-fk275u9o/wheels/a8/18/22/90afa4bd43247fb9a75b710a4a3fcd94966c022ce9e3c7d0a6
Successfully built preprocess-kgptalkie
Installing collected packages: preprocess-kgptalkie
Successfully installed preprocess-kgptalkie-0.1.3


# Data loading

In [None]:
def load_sentence(file):
    """Load the first record of every line of a JSON-lines file into a DataFrame.

    Every non-empty line of ``file`` must hold a JSON array; only the first
    element of that array is kept and becomes one row of the result.

    Parameters
    ----------
    file : str or path-like
        Path of the JSON-lines file to read.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty input line, built from the first array element.
    """
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # default encoding, and skip blank lines (e.g. a trailing empty line),
    # which json.loads would otherwise reject.
    with open(file, encoding='utf-8') as f:
        records = [json.loads(line)[0] for line in f if line.strip()]
    return pd.DataFrame(records)

def load_label(file):
    """Read a JSON object of ``{id: label}`` pairs into a two-column DataFrame.

    Parameters
    ----------
    file : str or path-like
        Path of a JSON file whose top-level value is an object.

    Returns
    -------
    pandas.DataFrame
        Columns ``id_str`` (the object's keys) and ``rumor_or_not`` (its values),
        one row per key/value pair.
    """
    with open(file) as handle:
        id_to_label = json.load(handle)
    pairs = list(id_to_label.items())
    return pd.DataFrame(pairs, columns=['id_str','rumor_or_not'])

In [None]:
# loading training data
df_train = load_sentence('gdrive/MyDrive/NLP-project1/train.data.jsonl')
df_label = load_label('gdrive/MyDrive/NLP-project1/train.label.json')
# attach the label to each tweet via its id (pd.merge defaults to an inner join)
df_full_train = pd.merge(df_train,df_label,on=['id_str'])

# only the tweet text and its label are analyzed; take an explicit copy because
# the 'text' column of this frame is overwritten by the cleaning step later on
df_train_data = df_full_train[['text','rumor_or_not']].copy()

In [None]:
# loading validation data
df_dev = load_sentence('gdrive/MyDrive/NLP-project1/dev.data.jsonl')
df_label_dev = load_label('gdrive/MyDrive/NLP-project1/dev.label.json')

# attach the label to each tweet via its id (pd.merge defaults to an inner join)
df_full_dev = pd.merge(df_dev,df_label_dev,on=['id_str'])
# explicit copy: the 'text' column of this frame is overwritten by the cleaning step later on
df_dev_data = df_full_dev[['text','rumor_or_not']].copy()

labels_dev = df_dev_data['rumor_or_not']

In [None]:
# loading test data (no label file is read for the test set)
df_test = load_sentence('gdrive/MyDrive/NLP-project1/test.data.jsonl')
# explicit copy so this frame is clearly independent of df_test
df_test_data = df_test[['text']].copy()

In [None]:
# Preview the first training tweets as loaded, before the cleaning step below is applied
df_train_data['text'].head(10)

0          How to respond to the murderous attack on Charlie Hebdo? Every newspaper in the free world should print this. http://t.co/sC2ot63F6j
1            You can't condemn an entire race, nation or religion based on the actions of a few radicals, please keep that in mind #sydneysiege
2    Attempts to extend blame for this to all Muslims should be treated with the same disgust as attempts to justify the attacks. #CharlieHebdo
3                            Rest in Peace, Cpl. Nathan Cirillo. Killed today in #OttawaShooting\nhttp://t.co/YzLXYX5JJt http://t.co/8F0qAcj9sg
4     People DEBATING whether #MikeBrown shoplifted or not-- IT DOESN'T MATTER.\nShoplifting isn't punishable by DEATH IN THE STREET. #Ferguson
5                                                                        Update - PA: gunman holding hostages in #Paris grocery has been killed
6     Here's a recap of the key points so far in the #GermanWings Alps plane crash. Live updates: http://t.co/8UPMsinQkX http://t.co/hNj

In [None]:
def normalize_user(x):
  """Replace every whitespace-delimited token that starts with ``@`` by ``@USER``.

  The input is converted with ``str()`` first. Only whole tokens are replaced:
  an ``@`` in the middle of a token (e.g. an e-mail address) is left alone, and
  a handle that is a prefix of another handle (``@ab`` / ``@abc``) no longer
  corrupts the longer one. Whitespace between tokens is preserved.

  Parameters
  ----------
  x : object
      Text to normalize.

  Returns
  -------
  str
      The text with each ``@...`` token replaced by ``@USER``.
  """
  # (?<!\S) anchors the match at the start of a token (start of string or
  # right after whitespace); @\S* then consumes the whole token.
  return re.sub(r'(?<!\S)@\S*', '@USER', str(x))
def remove_symbol(x):
  """Strip ``#`` (and newline characters) from every token and re-join with single spaces.

  The input is converted with ``str()`` and split on whitespace; each token has
  its ``"\\n"`` and ``"#"`` characters removed, and the tokens are joined back
  with one space. A token consisting only of ``#`` becomes an empty string and
  still takes part in the join.
  """
  stripped_tokens = (
      word.replace("\n","").replace("#","")
      for word in str(x).split()
  )
  return ' '.join(stripped_tokens)

def uppercase_httpurl(x):
  """Replace every whitespace-delimited token that starts with ``http`` by ``HTTPURL``.

  The input is converted with ``str()`` first. The match is case-sensitive
  (an existing ``HTTPURL`` token is untouched) and works on whole tokens only,
  so a short token such as ``http`` no longer rewrites part of a longer token
  such as ``httpurl``. Whitespace between tokens is preserved.

  Parameters
  ----------
  x : object
      Text to normalize.

  Returns
  -------
  str
      The text with each ``http...`` token replaced by ``HTTPURL``.
  """
  # (?<!\S) anchors the match at the start of a token (start of string or
  # right after whitespace); http\S* then consumes the whole token.
  return re.sub(r'(?<!\S)http\S*', 'HTTPURL', str(x))
# reference: https://stackoverflow.com/questions/258390/python-filter-remove-urls-from-a-list
def get_data_without_urls(text):
    """Return ``text`` with every URL matched by the pattern below replaced by ``HTTPURL``.

    The pattern is case-insensitive and recognizes ``http(s)://`` links,
    ``www.`` hosts and bare ``domain.tld/`` forms; the final character class
    keeps trailing punctuation (``.``, ``,``, quotes, ...) out of the match.
    """
    url_pattern = re.compile(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''')
    return url_pattern.sub("HTTPURL", text)

In [None]:
def get_clean(x):
    """Normalize one tweet and return the cleaned string.

    Steps, in order: lower-case; delete backslashes and turn the punctuation
    marks ``_ . ! - : , ?`` into spaces; run the ``kgp`` helpers and the local
    token normalizers; finally collapse any character repeated three or more
    times in a row into a single occurrence.
    """
    # backslashes are deleted, the seven listed punctuation marks become spaces
    punctuation_map = str.maketrans('_.!-:,?', ' ' * 7, '\\')
    x = str(x).lower().translate(punctuation_map)

    cleaning_steps = (
        kgp.cont_exp,              # e.g. you're -> you are; i'm -> i am
        kgp.remove_emails,
        normalize_user,            # @handle tokens -> @USER
        uppercase_httpurl,         # http... tokens -> HTTPURL
        remove_symbol,             # drops '#'
        kgp.remove_rt,
        kgp.remove_accented_chars,
    )
    for step in cleaning_steps:
        x = step(x)

    # a run of 3+ identical characters is reduced to one: 'looove' -> 'love'
    return re.sub("(.)\\1{2,}", "\\1", x)

In [None]:
# replace URLs with HTTPURL, user mentions with @USER, and normalize the rest of the text
df_train_data['text'] = df_train_data['text'].apply(get_data_without_urls)
df_train_data['text'] = df_train_data['text'].apply(lambda x: get_clean(x))

df_dev_data['text'] = df_dev_data['text'].apply(get_data_without_urls)
df_dev_data['text'] = df_dev_data['text'].apply(lambda x: get_clean(x))

df_test['text'] = df_test['text'].apply(get_data_without_urls)
df_test['text'] = df_test['text'].apply(lambda x: get_clean(x))
# df_test_data was derived from df_test before cleaning and is the frame the test
# features are built from, so it must receive the cleaned text as well; otherwise
# the test tweets would be embedded raw while train/dev tweets are embedded cleaned.
df_test_data['text'] = df_test['text']

In [None]:
# Preview the same training tweets again, after the cleaning step above
df_train_data['text'].head(10)

0                                How to respond to the murderous attack on Charlie Hebdo? Every newspaper in the free world should print this. 
1           You cannot condemn an entire race  nation or religion based on the actions of a few radicals  please keep that in mind #sydneysiege
2    Attempts to extend blame for this to all Muslims should be treated with the same disgust as attempts to justify the attacks. #CharlieHebdo
3                                                                        Rest in Peace  Cpl. Nathan Cirillo. Killed today in #OttawaShooting\n 
4    People DEBATING whether #MikeBrown shoplifted or not   IT DOESN'T MATTER.\nShoplifting is not punishable by DEATH IN THE STREET. #Ferguson
5                                                                        Update   PA  gunman holding hostages in #Paris grocery has been killed
6                                                 Here's a recap of the key points so far in the #GermanWings Alps plane crash. Live upd

# Model loading

In [None]:
# DistilBERT is the default encoder:
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

## To use BERT instead of DistilBERT, uncomment the following line:
# model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Fetch the pretrained tokenizer and model weights for the selected checkpoint
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




In [None]:
def get_feature_vector(dataframe_column, batch_size=64):
    """Embed every text with the notebook-level ``tokenizer`` and ``model``.

    Each text is encoded with special tokens, all sequences are right-padded
    with 0 to the length of the longest one, and the model is run without
    gradient tracking. The feature of a text is the hidden state at sequence
    position 0 of the model's first output.

    Parameters
    ----------
    dataframe_column : pandas.Series or iterable
        The texts to embed; every item is converted with ``str()``.
    batch_size : int, optional
        Number of texts sent through the model per forward pass. Batching only
        bounds peak memory; all batches share the same padded length.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per input text, in input order.

    Raises
    ------
    ValueError
        If ``dataframe_column`` is empty or ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # add the special tokens ([CLS], [SEP]) and tokenize each sentence
    tokenized = [tokenizer.encode(str(text), add_special_tokens=True)
                 for text in dataframe_column]
    if not tokenized:
        raise ValueError("get_feature_vector() needs at least one text")

    # pad every sequence with 0 up to the longest sequence in the column
    max_len = max(len(ids) for ids in tokenized)
    padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized])
    # masking: positions holding the padding value 0 are ignored by attention
    attention_mask = np.where(padded != 0, 1, 0)

    input_ids = torch.tensor(padded)
    attention_mask = torch.tensor(attention_mask)

    # run the model chunk by chunk so memory use does not grow with the corpus size
    chunks = []
    with torch.no_grad():
        for start in range(0, len(tokenized), batch_size):
            stop = start + batch_size
            last_hidden_states = model(input_ids[start:stop],
                                       attention_mask=attention_mask[start:stop])
            # keep only the vector at position 0 of each sequence
            chunks.append(last_hidden_states[0][:, 0, :].numpy())

    features = np.concatenate(chunks, axis=0)
    return features

In [None]:
# Embed the train and dev tweets; every text is run through the transformer,
# so this cell is slow.
features = get_feature_vector(df_train_data['text'])
features_dev = get_feature_vector(df_dev_data['text'])
# sanity check: (number of training texts, embedding size)
features.shape

(4641, 768)

In [None]:
# Target labels for the train and dev sets, in the same row order as the feature matrices
labels = df_train_data['rumor_or_not']
labels_dev = df_dev_data['rumor_or_not']
# labels = df_train_data['rumour']

In [None]:
# The provided train set is used for fitting and the provided dev set plays the
# role of the held-out "test" split in the evaluations below.
train_features = features
test_features = features_dev
train_labels = labels
test_labels = labels_dev

# Data resampling

In [None]:
# resampling the training data for the imbalanced dataset -> improve F1 score
# (SMOTE over-sampling followed by Edited Nearest Neighbours cleaning).
# random_state is fixed so the resampled set, and every "resampled" score below,
# is the same on each run.
resample = SMOTEENN(
    enn=EditedNearestNeighbours(sampling_strategy='not majority'),
    random_state=42,
)
X, y = resample.fit_resample(train_features, train_labels)

In [None]:
# class counts before and after resampling
# (Counter is already imported in the imports cell at the top of the notebook)
print(f"counts before resample: {Counter(train_labels)}")
print(f"counts after resample: {Counter(y)}")

Counter({'non-rumour': 3058, 'rumour': 2695}) Counter({'non-rumour': 3058, 'rumour': 1583})


# Logistic Regression
- %run logistic_grid.py if using local setting

In [None]:
# Run the helper script from Drive; the call below relies on it defining
# grid_search_logistic() in the notebook namespace.
%run /content/gdrive/MyDrive/NLP-project1/logistic_grid.py

# uncomment below to run locally
# %run logistic_grid.py
# The first returned value is used as C for the models below; the second is printed as the F1 score.
c, f1_score_logistic= grid_search_logistic(train_features, train_labels)
print(f1_score_logistic)

0.8198679521918265


In [None]:
# Baseline logistic regression on the original (not resampled) training features.
# Solver and penalty were already chosen by an earlier grid search, so only C is tuned.
lr_clf = LogisticRegression(C=c, solver='lbfgs', penalty='l2').fit(train_features, train_labels)

# evaluate on the dev split
predictions = lr_clf.predict(test_features)
print(classification_report(test_labels,predictions))

              precision    recall  f1-score   support

  non-rumour       0.86      0.90      0.88       393
      rumour       0.76      0.71      0.73       187

    accuracy                           0.83       580
   macro avg       0.81      0.80      0.81       580
weighted avg       0.83      0.83      0.83       580



In [None]:
# Logistic regression fitted on the resampled data (X, y).
# Stored as lr_clf_resampled so that the baseline model kept in lr_clf (fitted on
# the original training data) is not silently replaced by this cell.
lr_clf_resampled = LogisticRegression(
    C=c,
    # already grid search the solver and penalty -> search for C only
    solver='lbfgs',
    penalty='l2')
lr_clf_resampled.fit(X, y)

predictions = lr_clf_resampled.predict(test_features)
print(classification_report(test_labels,predictions))

In [None]:
# Logistic regression with text-preprocessing
#              precision    recall    f1-score   support

#   non-rumour       0.87      0.92      0.90       393
#     rumour       0.81      0.72      0.76       187

#accuracy                           0.86       580
#   macro avg       0.84      0.82      0.83       580
# weighted avg       0.85      0.86      0.85       580

In [None]:
# Logistic regression fitted on the resampled training data (X, y).
# Solver and penalty were already chosen by an earlier grid search, so only C is tuned.
lr_clf_resampled = LogisticRegression(C=c, solver='lbfgs', penalty='l2').fit(X, y)

# evaluate on the dev split
predictions = lr_clf_resampled.predict(test_features)
print(classification_report(test_labels,predictions))

              precision    recall  f1-score   support

  non-rumour       0.88      0.89      0.89       393
      rumour       0.77      0.74      0.76       187

    accuracy                           0.84       580
   macro avg       0.82      0.82      0.82       580
weighted avg       0.84      0.84      0.84       580



In [None]:
# Logistic regression with raw text
#              precision    recall    f1-score    support

#   non-rumour       0.86      0.90      0.88       393
#     rumour       0.77      0.68      0.72       187

#accuracy                           0.83       580
#   macro avg       0.81      0.79      0.80       580
# weighted avg       0.83      0.83      0.83       580

# KNN
- automatically fit in the best parameters
- %run knn_grid.py if using local setting

In [None]:
# Run the helper script from Drive; the call below relies on it defining
# grid_search_knn() in the notebook namespace.
%run /content/gdrive/MyDrive/NLP-project1/knn_grid.py
# uncomment the code below to run the file locally
# %run knn_grid.py
# the search can take up to 30 minutes
best_parameter, f1_score = grid_search_knn(train_features,train_labels)

# best_parameter is used below as a dict with the keys 'metric', 'n_neighbors' and 'weights'
print(best_parameter, f1_score)

{'metric': 'manhattan', 'n_neighbors': 11, 'weights': 'distance'} 0.8205147359043836


In [None]:
# KNN with the grid-search winners, fitted on the original training features.
# Recorded search result: {'metric': 'manhattan', 'n_neighbors': 11, 'weights': 'distance'}
# with F1_score 0.8205147359043836
kn_clf = KNeighborsClassifier(
    metric=best_parameter['metric'],
    n_neighbors=best_parameter['n_neighbors'],
    weights=best_parameter['weights'],
).fit(train_features, train_labels)

# evaluate on the dev split
predictions = kn_clf.predict(test_features)
print(classification_report(test_labels,predictions))

              precision    recall  f1-score   support

  non-rumour       0.89      0.86      0.87       393
      rumour       0.72      0.78      0.75       187

    accuracy                           0.83       580
   macro avg       0.81      0.82      0.81       580
weighted avg       0.84      0.83      0.83       580



In [None]:
# KNN classifier with text-preprocessing
#             precision    recall      f1-score   support

#   non-rumour       0.89      0.86      0.87       393
#     rumour       0.72      0.78      0.75       187

#   accuracy                        0.83       580
#   macro avg       0.81      0.82      0.81       580
# weighted avg       0.84      0.83      0.83       580

In [None]:
# KNN with the same grid-search winners, fitted on the resampled training data (X, y)
kn_clf_resampled = KNeighborsClassifier(
    metric=best_parameter['metric'],
    n_neighbors=best_parameter['n_neighbors'],
    weights=best_parameter['weights'],
).fit(X, y)

# evaluate on the dev split
predictions = kn_clf_resampled.predict(test_features)
print(classification_report(test_labels,predictions))

              precision    recall  f1-score   support

  non-rumour       0.95      0.73      0.82       393
      rumour       0.61      0.91      0.73       187

    accuracy                           0.79       580
   macro avg       0.78      0.82      0.78       580
weighted avg       0.84      0.79      0.79       580



In [None]:
# KNN classifier with text-preprocessing & resampling
#             precision    recall  f1-score   support
# non-rumour       0.95      0.73      0.82       393
#   rumour       0.61      0.91      0.73       187

#   accuracy                      0.79       580
# macro avg       0.78      0.82      0.78       580
#weighted avg       0.84      0.79      0.79       580

# SVC

In [None]:
%run /content/gdrive/MyDrive/NLP-project1/svc_grid.py
# uncomment the code below to run it locally
# %run svc_grid.py
best_parameters, best_score = grid_search_svc(train_features, train_labels)
print(best_paramters, best_score)

{'C': 10, 'gamma': 'scale', 'kernel': 'poly'} 0.8371060836643036


In [None]:
# SVC with the grid-search winners, fitted on the original training features
svc_clf = SVC(
    C=best_parameters['C'],
    gamma=best_parameters['gamma'],
    kernel=best_parameters['kernel'],
).fit(train_features, train_labels)

# evaluate on the dev split
predictions = svc_clf.predict(test_features)
print(classification_report(test_labels,predictions))

              precision    recall  f1-score   support

  non-rumour       0.90      0.92      0.91       393
      rumour       0.83      0.79      0.81       187

    accuracy                           0.88       580
   macro avg       0.86      0.85      0.86       580
weighted avg       0.88      0.88      0.88       580



In [None]:
# SVC classifier with text-preprocessing
# precision    recall  f1-score   support

#   non-rumour       0.90      0.92      0.91       393
#     rumour       0.83      0.79      0.81       187

#   accuracy                        0.88       580
#   macro avg       0.86      0.85      0.86       580
# weighted avg       0.88      0.88      0.88       580

In [None]:
# SVC with the grid-search winners, fitted on the resampled training data (X, y).
# Reads best_parameters, the name assigned by the SVC grid-search cell above;
# the misspelled `best_paramters` is never assigned anywhere in this notebook.
svc_clf_resampled = SVC(
    C=best_parameters['C'],
    gamma=best_parameters['gamma'],
    kernel=best_parameters['kernel']
)
svc_clf_resampled.fit(X, y)
# evaluate on the dev split
predictions = svc_clf_resampled.predict(test_features)
print(classification_report(test_labels,predictions))

              precision    recall  f1-score   support

  non-rumour       0.91      0.91      0.91       393
      rumour       0.81      0.82      0.81       187

    accuracy                           0.88       580
   macro avg       0.86      0.86      0.86       580
weighted avg       0.88      0.88      0.88       580



In [None]:
# SVC classifier with text-preprocessing & resampling
# precision    recall  f1-score   support

#   non-rumour       0.91      0.91      0.91       393
#     rumour       0.81      0.82      0.81       187

#   accuracy                        0.88       580
#   macro avg       0.86      0.86      0.86       580
# weighted avg       0.88      0.88      0.88       580

In [None]:
# # resampling must be after the data split
# # just for increasing the overall performance, not let the model learn the wrong data
# resample = SMOTETomek(tomek=TomekLinks(sampling_strategy='not minority'))
# steps = [('r', resample) , ('model', svc_clf)]
# pipeline = Pipeline(steps=steps)
# pipeline.fit(train_features,train_labels)

# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# scores = cross_val_score(pipeline, train_features, train_labels, cv=cv, scoring='f1_micro', n_jobs=-1) # roc_auc, f1-micro
# score = mean(scores)
# print('F1 Score: %.3f' % score)

# Random forest 

In [None]:
# Run the helper script from Drive; the call below relies on it defining
# grid_search_rforest() in the notebook namespace.
%run /content/gdrive/MyDrive/NLP-project1/rforest_grid.py
# uncomment below to run it locally
# %run rforest_grid.py

# NOTE(review): this reuses the names best_parameters / best_score that the SVC
# section also assigns, so after this cell the SVC search result is gone and the
# SVC cells cannot be re-run without repeating their grid search.
best_parameters, best_score = grid_search_rforest(train_features, train_labels)
print(best_parameters, best_score)

{'max_features': 'sqrt', 'n_estimators': 1000} 0.810816924019153


In [None]:
# Random forest with the grid-search winners, fitted on the original training features.
# random_state is fixed so the forest, and the report below, is the same on each run.
rforest_clf = RandomForestClassifier(
    n_estimators=best_parameters['n_estimators'],
    max_features=best_parameters['max_features'],
    random_state=42
)
rforest_clf.fit(train_features, train_labels)
# evaluate on the dev split
predictions = rforest_clf.predict(test_features)
print(classification_report(test_labels,predictions))

              precision    recall  f1-score   support

  non-rumour       0.86      0.90      0.88       393
      rumour       0.77      0.68      0.72       187

    accuracy                           0.83       580
   macro avg       0.81      0.79      0.80       580
weighted avg       0.83      0.83      0.83       580



In [None]:
# random forest classifier with text-preprocessing
# precision    recall  f1-score   support

#   non-rumour       0.86      0.90      0.88       393
#     rumour       0.77      0.68      0.72       187

#   accuracy                        0.83       580
#   macro avg       0.81      0.79      0.80       580
# weighted avg       0.83      0.83      0.83       580

In [None]:
# Random forest with the grid-search winners, fitted on the resampled training data (X, y).
# random_state is fixed so the forest, and the report below, is the same on each run.
rforest_clf_resampled = RandomForestClassifier(
    n_estimators=best_parameters['n_estimators'],
    max_features=best_parameters['max_features'],
    random_state=42
)
rforest_clf_resampled.fit(X, y)
# evaluate on the dev split
predictions = rforest_clf_resampled.predict(test_features)
print(classification_report(test_labels,predictions))

              precision    recall  f1-score   support

  non-rumour       0.85      0.90      0.87       393
      rumour       0.75      0.67      0.71       187

    accuracy                           0.82       580
   macro avg       0.80      0.78      0.79       580
weighted avg       0.82      0.82      0.82       580



In [None]:
# random forest classifier with text-preprocessing & resampling
#              precision    recall     f1-score    support

#   non-rumour       0.85      0.90      0.87       393
#     rumour       0.75      0.67      0.71       187

#     accuracy                      0.82       580
#   macro avg       0.80      0.78      0.79       580
# weighted avg       0.82      0.82      0.82       580

# Predict the labels of the test data in "test.data.jsonl" and write them to "test-output.json"

In [None]:
# Embed the test tweets and predict their labels
features_test = get_feature_vector(df_test_data['text'])
# change the classifier here
predictions = svc_clf.predict(features_test)

# Give the predictions the index of the test rows so the column assignment below
# pairs each prediction with its own id explicitly.
prediction_series = pd.Series(predictions, index=df_test.index, name='rumor_or_not')
df_prediction = prediction_series.to_frame()
df_prediction_with_id = df_test[['id_str']].copy()

df_prediction_with_id['rumor_or_not'] = df_prediction['rumor_or_not']
df_prediction_with_id = df_prediction_with_id.set_index("id_str")

# Write {id_str: label} to Drive. The result of to_json() is deliberately not
# bound to a variable: it returns None when given a path, and binding it to the
# name `json` would replace the json module that the loader functions rely on.
df_prediction_with_id['rumor_or_not'].to_json(path_or_buf="gdrive/MyDrive/NLP-project1/test-output.json")

In [None]:
# Bert model reference
# https://colab.research.google.com/github/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb#scrollTo=To9ENLU90WGl
# https://colab.research.google.com/drive/1sfAypJA0r8DEaDmTGWD8FCrvpQZ33TVl?usp=sharing

In [None]:
# imbalance classification
# https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/

# combine  oversampling and undersampling for imbalance classification
# https://machinelearningmastery.com/combine-oversampling-and-undersampling-for-imbalanced-classification/

# hyperparameters tuning for ML algorithms
# https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

# Hyperparameter Optimization With Random Search and Grid Search
# https://machinelearningmastery.com/hyperparameter-optimization-with-random-search-and-grid-search/

# SMOTE + ENN : A sampling method that solves modeling with an imbalanced dataset (In Chinese)
# https://medium.com/%E6%95%B8%E5%AD%B8-%E4%BA%BA%E5%B7%A5%E6%99%BA%E6%85%A7%E8%88%87%E8%9F%92%E8%9B%87/smote-enn-%E8%A7%A3%E6%B1%BA%E6%95%B8%E6%93%9A%E4%B8%8D%E5%B9%B3%E8%A1%A1%E5%BB%BA%E6%A8%A1%E7%9A%84%E6%8E%A1%E6%A8%A3%E6%96%B9%E6%B3%95-cdb6324b711e

# SMOTEENN
# https://imbalanced-learn.org/stable/references/generated/imblearn.combine.SMOTEENN.html#imblearn.combine.SMOTEENN.get_params