## Task 1 (English) System \#1

The baseline provided by CLEF achieves the following average precision:

In [2]:
# Execute the baseline script; its printed output (below) reports the average
# precision of the random and n-gram baselines.
%run baselines/baselines.py

Random Baseline AVGP: 0.4378835232034028
Ngram Baseline AVGP: 0.8178620442105289


### 1. Preprocessing

Read in data:

In [32]:
import pandas as pd
import numpy as np
import warnings
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import precision_score, recall_score, average_precision_score

# Suppress all library warnings and fix NumPy's global seed for the notebook.
warnings.filterwarnings('ignore')
np.random.seed(42)


def _read_split(path):
    """Read one tab-separated split indexed by tweet_id, without the URL and topic columns."""
    frame = pd.read_csv(path, sep='\t', header=0, index_col='tweet_id')
    return frame.drop(['tweet_url', 'topic_id'], axis=1)


train = _read_split('data/training.tsv')
dev = _read_split('data/dev.tsv')
train.head()

Unnamed: 0_level_0,tweet_text,claim,claim_worthiness
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1234964653014384644,Since this will never get reported by the medi...,1,1
1234869939720216578,"Thanks, #MichaelBloomberg. Here’s a handy litt...",0,0
1234873136304267267,"Folks, when you say ""The corona virus isn't a ...",0,0
1235071285027147776,Just 1 case of Corona Virus in India and peop...,1,1
1234911110861594624,President @realDonaldTrump made a commitment...,1,1


Use functionality from the Arabic task to preprocess the tweets:

In [33]:
# Make sure the NLTK stopword corpus is available locally before it is read.
nltk.download('stopwords')

# English stopword list, used as the default `stopwords` argument of preprocess_text.
# The corpus is read through `nltk.corpus` instead of the imported `stopwords` name:
# this assignment rebinds `stopwords` to a plain list, so calling
# `stopwords.words(...)` again on a re-run of this cell would raise AttributeError.
stopwords = nltk.corpus.stopwords.words('english')
# Matches URLs that start with http://, https:// or www.
url_pattern = r"https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,}"
# Matches tokens made of two or more ASCII letters (digits and punctuation are dropped).
token_pattern = r"\b[A-Za-z][A-Za-z]+\b"

# function of preprocessing text, tokenization, stopwords removal, stemming
def preprocess_text(text, url_pattern=url_pattern, token_pattern=token_pattern,
                    with_urlrm=True, with_stopwordsrm=True, stopwords=stopwords, with_stemming=False):
    """Normalise a piece of text into a space-separated string of tokens.

    Steps, in order: optional URL removal, lower-casing, regex tokenisation,
    optional stopword removal, optional Porter stemming.

    Parameters
    ----------
    text : str
        Raw text to process.
    url_pattern : str
        Regular expression whose matches are deleted when ``with_urlrm`` is set.
    token_pattern : str
        Regular expression used with ``re.findall`` to extract tokens from the
        lower-cased text.
    with_urlrm : bool
        Remove URLs before tokenising.
    with_stopwordsrm : bool
        Drop tokens that appear in ``stopwords``.
    stopwords : iterable of str
        Words to discard when ``with_stopwordsrm`` is set.
    with_stemming : bool
        Apply ``PorterStemmer`` to every remaining token.

    Returns
    -------
    str
        The processed tokens joined by single spaces (empty string if no token
        survives).
    """
    if with_urlrm:
        text = re.sub(url_pattern, "", text)

    tokens = re.findall(token_pattern, text.lower())

    if with_stopwordsrm:
        # Set membership is O(1) per token; the stopword list would be scanned linearly.
        stopword_set = frozenset(stopwords)
        tokens = [token for token in tokens if token not in stopword_set]

    if with_stemming:
        stem = PorterStemmer().stem
        tokens = [stem(token) for token in tokens]

    return " ".join(tokens)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tommcdonald/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
# Preprocess (with stemming) every tweet in each split. The splits are handled
# independently so the dev set is always processed in full, whatever its size
# relative to the training set.
train_with_stemming = [preprocess_text(text, with_stemming=True) for text in train["tweet_text"]]
dev_with_stemming = [preprocess_text(text, with_stemming=True) for text in dev["tweet_text"]]

train["processed_text"] = train_with_stemming
dev["processed_text"] = dev_with_stemming

### 2. TFIDF Features

Create TFIDF vectors from processed text, using unigrams, bigrams and trigrams:

In [37]:
# Fit the TF-IDF vocabulary (unigrams to trigrams) on the training tweets only,
# then project the dev tweets onto that same vocabulary.
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
X_tr = vectorizer.fit_transform(train_with_stemming)
X_dev = vectorizer.transform(dev_with_stemming)

# get_feature_names() was deprecated in scikit-learn 1.0 and later removed in
# favour of get_feature_names_out(); prefer the new method and fall back to the
# old one on older installations. The names are computed once and shared.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

X_tr_df = pd.DataFrame.sparse.from_spmatrix(X_tr, columns=feature_names)
X_dev_df = pd.DataFrame.sparse.from_spmatrix(X_dev, columns=feature_names)

X_tr_df.head()

Unnamed: 0,aag,aag badh,aag badh raha,aaron,aaron ramsey,aaron ramsey score,ab,ab cbn,ab cbn news,abandon,...,zero children,zero children year,zero school,zero school close,zhangyix,zika,zika corona,zika corona elect,zika ebola,zika ebola lyme
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 3. Modelling

Get tweet labels and use them with the training data to fit some models. 'claim' is 1 if the tweet is a claim and 0 otherwise, whilst 'claim_worthiness' (abbreviated to 'worthy' in the variable names) is 1 if the tweet is worth fact-checking and 0 otherwise.

In [40]:
# Labels as (n_samples, 1) column vectors. The dev labels are taken from `dev`
# (not `train`) so they line up row-for-row with the dev features.
y_tr_claim = train.claim.to_numpy().reshape(-1, 1)
y_tr_worthy = train.claim_worthiness.to_numpy().reshape(-1, 1)
y_dev_claim = dev.claim.to_numpy().reshape(-1, 1)
y_dev_worthy = dev.claim_worthiness.to_numpy().reshape(-1, 1)

In [55]:
# Candidate classifiers and their display names (kept in matching order).
models = [
    GaussianNB(),
    RandomForestClassifier(n_estimators=100),
    SGDClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
]
models_str = ["Naive Bayes", "Random Forest", "SGD", "AdaBoost", "Gradient Boosting"]
best_models = []

# Same metric for every model, so it is defined once outside the loop.
scoring = {'map': 'average_precision'}

for model_name, model in zip(models_str, models):
    # 5-fold cross-validation on the check-worthiness labels.
    scores = cross_validate(model, X_tr_df, y_tr_worthy, scoring=scoring, cv=5,
                            return_train_score=True, return_estimator=True)
    fold_maps = scores["test_map"]
    # Keep the fitted estimator from the fold with the highest held-out score.
    best_models.append(scores["estimator"][np.argmax(fold_maps)])
    print(model_name)
    print("Test MAP Score:", round(np.mean(fold_maps), 3), '\n')

Naive Bayes
Test MAP Score: 0.537 

Random Forest
Test MAP Score: 0.77 

SGD
Test MAP Score: 0.755 

AdaBoost
Test MAP Score: 0.626 

Gradient Boosting
Test MAP Score: 0.724 



Train on full training set using best model (Random Forest) and predict probabilities of check-worthiness for each tweet.

In [198]:
# Fit a 200-tree random forest on the whole training set (fit() returns the
# estimator itself), then get per-class probabilities for the dev tweets.
rf = RandomForestClassifier(n_estimators=200).fit(X_tr_df, y_tr_worthy)
preds_proba = rf.predict_proba(X_dev_df)

Create correct format for data as specified in README.md and use 'scorer' to compare the system to the gold standard data.

In [199]:
# One row per dev tweet, columns in the order topic_id, tweet_id, score, run_id.
# The score is the second column of predict_proba's output; the topic and run
# identifiers are constants broadcast to every row.
results = pd.DataFrame({
    'topic_id': 'covid-19',
    'tweet_id': list(dev.index),
    'score': [row[1] for row in preds_proba],
    'run_id': 'Model_1',
})

In [200]:
results

Unnamed: 0,topic_id,tweet_id,score,run_id
0,covid-19,1235714275752267776,0.575,Model_1
1,covid-19,1235256530728972290,0.180,Model_1
2,covid-19,1235648554338791427,0.270,Model_1
3,covid-19,1235674258858061825,0.425,Model_1
4,covid-19,1235663306246860800,0.330,Model_1
...,...,...,...,...
145,covid-19,1235914080931766274,0.150,Model_1
146,covid-19,1235770706765451264,0.200,Model_1
147,covid-19,1235973416995315712,0.475,Model_1
148,covid-19,1235675024738185239,0.165,Model_1


In [201]:
# Write the predictions as a tab-separated file with no header row and no index column.
results.to_csv('golf_system_results.tsv', sep='\t', header=False, index=False)

An average precision of 0.8104 is just slightly below CLEF's n-gram baseline of 0.818, and is considerably better than the random baseline of ~0.44.