# Rumor Detection

Requirements: `scikit-learn`, `nltk`, `pandas`, `matplotlib`, `numpy`, `tqdm`

## Prepare the dataset

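The dataset consists of tweets collected from Twitter. `source_tweets.txt` holds one tab-separated `id`/`text` pair per line, and `label.txt` holds one `label:id` pair per line.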

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

def get_dataset(xpath="source_tweets.txt", ypath="label.txt"):
    """Load the labelled tweets and split them into training and test sets.

    Parameters
    ----------
    xpath : str
        Path to the tweet file. Each non-blank line holds a tweet id and the
        tweet text separated by a tab character.
    ypath : str
        Path to the label file. Each non-blank line has the form
        ``<label>:<id>``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``: the tweet texts and their
        binary labels, indexed by tweet id, split 75% / 25% with a fixed seed.
        A label is 0 when the raw label contains ``'non'`` and 1 otherwise.

    Notes
    -----
    Tweets whose id does not appear in the label file are ignored; labelled
    ids that have no tweet keep an empty text.
    """
    labels, ids = [], []
    # Explicit encoding so the result does not depend on the platform default.
    with open(ypath, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            label, tweet_id = line.split(':')
            labels.append(label.strip())
            ids.append(tweet_id.strip())

    # Collect the texts in a plain dict first and build the column in one go,
    # instead of writing through chained indexing (frame['text'].loc[...] = ...),
    # which pandas does not guarantee to propagate back to the frame.
    texts = {}
    with open(xpath, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            # Split on the first tab only, so a tab inside the text is kept.
            tweet_id, text = line.split('\t', 1)
            texts[tweet_id.strip()] = text.strip()

    raw_data = pd.DataFrame(
        {
            # Treat unverified, true and false as the same class (1);
            # only labels containing 'non' map to 0.
            'label': [0 if 'non' in label else 1 for label in labels],
            'text': [texts.get(tweet_id, '') for tweet_id in ids],
        },
        index=ids,
    )

    X_train, X_test, y_train, y_test = train_test_split(
        raw_data['text'], raw_data['label'], test_size=0.25, random_state=666
    )

    return X_train, X_test, y_train, y_test


# Load both files from their default paths and keep the four splits as
# notebook-level variables used by the cells below.
X_train, X_test, y_train, y_test = get_dataset()

## Feature Engineering & Feature Extraction

Feature Engineering Methods:
  1. drop non-ASCII characters such as non-English letters and emoji (code point > 127)
  2. remove stop words (NLTK stopwords corpus)
  3. apply maximum-absolute scaling (`MaxAbsScaler`) after vectorization

In [2]:
def clean(x):
    """Return ``x`` with every non-ASCII character removed.

    Parameters
    ----------
    x : str
        The text to filter.

    Returns
    -------
    str
        The characters of ``x`` whose code point is below 128, in their
        original order.
    """
    # Single pass over the string; avoids the quadratic cost of looking up
    # each position in a list of dropped indices.
    return ''.join(ch for ch in x if ord(ch) < 128)

# Strip non-ASCII characters from every tweet in both splits.
X_train, X_test = (texts.apply(clean) for texts in (X_train, X_test))

Feature Extraction Method: TF-IDF vectorization

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
from nltk.corpus import stopwords
import nltk
# Make sure the NLTK stop-word corpus is available locally before reading it.
nltk.download('stopwords')

# TF-IDF features over lower-cased text with English stop words removed.
english_stop_words = list(stopwords.words('english'))
vector_transformer = TfidfVectorizer(lowercase=True, stop_words=english_stop_words)
# Rescale each feature by its maximum absolute value.
scaler_transformer = MaxAbsScaler()

# Both transformers are fitted on the training split only; the test split is
# only transformed with the fitted vocabulary and scale factors.
training_vector = scaler_transformer.fit_transform(
    vector_transformer.fit_transform(X_train)
)
testing_vector = scaler_transformer.transform(
    vector_transformer.transform(X_test)
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RainBoltz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Train a model

In [4]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with default hyper-parameters, fitted on the
# scaled TF-IDF training features. `fit` returns the estimator itself, which
# is left as the cell's last expression so its repr is displayed.
clf = LogisticRegression().fit(training_vector, y_train)
clf



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

## Test the model

In [5]:
from sklearn.metrics import classification_report

# Score the baseline on the held-out split and print per-class
# precision / recall / F1.
pred = clf.predict(testing_vector)
print(classification_report(y_true=y_test, y_pred=pred))

              precision    recall  f1-score   support

           0       0.87      0.52      0.65       100
           1       0.85      0.97      0.90       273

    accuracy                           0.85       373
   macro avg       0.86      0.75      0.78       373
weighted avg       0.85      0.85      0.84       373



In [8]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

# Seed for every estimator that draws random numbers, so the reports printed
# below are reproducible across runs (same value as the train/test split seed).
RANDOM_STATE = 666

# Display names, in the same order as `classifiers`.
names = ["Decision Tree", "Random Forest", "Neural Net", "AdaBoost",'Logistic Regression', 'SVM Classifier']

classifiers = [
    DecisionTreeClassifier(random_state=RANDOM_STATE),
    RandomForestClassifier(n_estimators=1000, random_state=RANDOM_STATE),
    MLPClassifier(random_state=RANDOM_STATE),
    AdaBoostClassifier(random_state=RANDOM_STATE),
    LogisticRegression(solver='lbfgs'),
    SVC(gamma='scale')
]

# Fit each model on the training features and report its held-out metrics.
# NOTE: `clf` and `pred` are rebound on every iteration, so after this cell
# they refer to the last classifier in the list, not the earlier baseline.
for name, clf in zip(names, classifiers):
    clf.fit(training_vector, y_train)
    pred = clf.predict(testing_vector)
    print(name)
    print(classification_report(y_test, pred))
    print('\n\n')

Decision Tree
              precision    recall  f1-score   support

           0       0.73      0.44      0.55       100
           1       0.82      0.94      0.88       273

    accuracy                           0.81       373
   macro avg       0.78      0.69      0.71       373
weighted avg       0.80      0.81      0.79       373




Random Forest
              precision    recall  f1-score   support

           0       1.00      0.24      0.39       100
           1       0.78      1.00      0.88       273

    accuracy                           0.80       373
   macro avg       0.89      0.62      0.63       373
weighted avg       0.84      0.80      0.75       373




Neural Net
              precision    recall  f1-score   support

           0       0.74      0.58      0.65       100
           1       0.86      0.93      0.89       273

    accuracy                           0.83       373
   macro avg       0.80      0.75      0.77       373
weighted avg       0.83      