## Final System 1 - Random Forest w/ TFIDF on V2 Data

### 1. Preprocessing

Read in data:

In [78]:
import pandas as pd
import numpy as np
import warnings
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import precision_score, recall_score, average_precision_score

# Silence library warnings to keep cell output readable, and seed NumPy's global RNG.
warnings.filterwarnings('ignore')
np.random.seed(42)


def _load_split(path):
    """Read one tab-separated split indexed by tweet_id, without the tweet_url and topic_id columns."""
    return pd.read_csv(path, sep='\t', header=0, index_col='tweet_id').drop(['tweet_url', 'topic_id'], axis=1)


train = _load_split('final_data/training_v2.tsv')
dev = _load_split('final_data/dev_v2.tsv')
test = _load_split('final_data/test-input.tsv')

# Merge the dev split into the training data. DataFrame.append was removed in
# pandas 2.0; pd.concat is the equivalent call. ignore_index=True replaces the
# tweet_id index of the merged frame with a fresh 0..n-1 range.
train = pd.concat([train, dev], ignore_index=True)
print(len(train), 'training instances')
print(len(test), 'test instances')
train.head()

822 training instances
140 test instances


Unnamed: 0,tweet_text,claim,check_worthiness
0,Since this will never get reported by the medi...,1,1
1,"Thanks, #MichaelBloomberg. Here’s a handy litt...",0,0
2,"Folks, when you say ""The corona virus isn't a ...",0,0
3,Just 1 case of Corona Virus in India and peop...,1,0
4,President @realDonaldTrump made a commitment...,1,1


Reuse the preprocessing functionality from the Arabic task to preprocess the tweets:

In [79]:
# Download the NLTK stopword corpus; it must be available before the stopword list is read.
nltk.download('stopwords')

# Read the corpus through the nltk package rather than through the imported
# name `stopwords`: this assignment rebinds `stopwords` to a plain list, so
# `stopwords.words(...)` would raise AttributeError if this cell were re-run.
stopwords = nltk.corpus.stopwords.words('english')
# Matches http(s):// and bare www. URLs.
url_pattern = r"https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,}"
# A token is a run of at least two ASCII letters.
token_pattern = r"\b[A-Za-z][A-Za-z]+\b"

# function of preprocessing text, tokenization, stopwords removal, stemming
def preprocess_text(text, url_pattern = url_pattern, token_pattern=token_pattern, 
                    with_urlrm=True, with_stopwordsrm=True, stopwords=stopwords, with_stemming=False):
    """Normalise a piece of text into a space-separated string of tokens.

    Steps, in order: optional URL removal, lower-casing, regex tokenisation,
    optional stopword removal, optional Porter stemming.

    Parameters
    ----------
    text : str
        Raw text to process.
    url_pattern : str
        Regular expression whose matches are deleted when ``with_urlrm`` is set.
    token_pattern : str
        Regular expression used with ``re.findall`` to extract tokens from the
        lower-cased text.
    with_urlrm : bool
        Remove URLs before tokenising.
    with_stopwordsrm : bool
        Drop tokens contained in ``stopwords``.
    stopwords : collection of str
        Tokens to discard; only membership tests are performed on it.
    with_stemming : bool
        Apply ``PorterStemmer`` to every remaining token.

    Returns
    -------
    str
        The processed tokens joined by single spaces (empty string if none remain).
    """
    # URLs are stripped on the raw text, before lower-casing and tokenisation.
    if with_urlrm:
        text = re.sub(url_pattern, "", text)

    words = re.findall(token_pattern, text.lower())

    # Stopwords are filtered before stemming, i.e. on the unstemmed tokens.
    if with_stopwordsrm:
        words = [word for word in words if word not in stopwords]

    if with_stemming:
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]

    return " ".join(words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tommcdonald/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [80]:
# Preprocess each split on its own. Doing the test tweets inside the training
# loop (guarded by the loop index) only covers the whole test set while it is
# no longer than the training set; independent passes have no such coupling.
train_with_stemming = [preprocess_text(text, with_stemming=True) for text in train["tweet_text"]]
test_with_stemming = [preprocess_text(text, with_stemming=True) for text in test["tweet_text"]]

# Keep the processed text alongside the raw tweets.
train["processed_text"] = train_with_stemming
test["processed_text"] = test_with_stemming

### 2. TFIDF Features

Create TFIDF vectors from processed text, using unigrams, bigrams and trigrams:

In [81]:
# Fit the vocabulary and IDF weights on the training text only, then reuse them for the test text.
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
X_tr = vectorizer.fit_transform(train_with_stemming)
X_test = vectorizer.transform(test_with_stemming)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Wrap the sparse matrices in sparse-backed DataFrames labelled by n-gram.
X_tr_df = pd.DataFrame.sparse.from_spmatrix(X_tr, columns=feature_names)
X_test_df = pd.DataFrame.sparse.from_spmatrix(X_test, columns=feature_names)

X_tr_df.head()

Unnamed: 0,aag,aag badh,aag badh raha,aaron,aaron ramsey,aaron ramsey score,ab,ab cbn,ab cbn news,abandon,...,zero self,zero self awar,zhangyix,zika,zika corona,zika corona elect,zika ebola,zika ebola lyme,zombi,zombi apocalyps
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 3. Modelling

Extract the tweet labels that are used, together with the training data, to fit the models. `claim` is 1 if the tweet is a claim and 0 otherwise, whilst `check_worthiness` (stored as `y_tr_worthy`) is 1 if the tweet is worth fact-checking and 0 otherwise.

In [82]:
# Label arrays as (n, 1) column vectors: selecting with a one-element column
# list yields a single-column frame, which converts to a 2-D array.
y_tr_claim = train[["claim"]].to_numpy()
y_tr_worthy = train[["check_worthiness"]].to_numpy()

Train on full training set using best model (Random Forest) and predict probabilities of check-worthiness for each tweet.

In [83]:
# No random_state is passed, so the forest draws from NumPy's global RNG.
rf = RandomForestClassifier(n_estimators=200)
# scikit-learn expects a 1-D target; flatten the (n, 1) label column here
# instead of relying on its implicit conversion (and the warning it emits).
rf.fit(X_tr_df, np.ravel(y_tr_worthy))
# One row per test tweet, one probability column per class.
preds_proba = rf.predict_proba(X_test_df)

Put the predictions into the format specified in README.md, so that the 'scorer' can be used to compare the system against the gold standard data.

In [87]:
# Assemble the submission table in one step; the dict order fixes the column
# order (topic_id, tweet_id, score, run_id) and the scalars are broadcast to
# every row. The score is the second column of predict_proba.
submission = pd.DataFrame({
    'topic_id': 'covid-19',
    'tweet_id': list(test.index),
    'score': [row[1] for row in preds_proba],
    'run_id': 'TeamGolfModel1',
})
# Highest score first.
results = submission.sort_values(['score'], ascending=False)
# Rank column is currently disabled:
#results['rank'] = [x for x in range(1, len(test)+1)]

In [88]:
# Display the predictions, ordered by descending score.
results

Unnamed: 0,topic_id,tweet_id,score,run_id
53,covid-19,1237435123072667649,0.715,TeamGolfModel1
113,covid-19,1236987957405536256,0.645,TeamGolfModel1
77,covid-19,1237216806068051968,0.640,TeamGolfModel1
116,covid-19,1237182053843689474,0.625,TeamGolfModel1
34,covid-19,1237512557088260097,0.610,TeamGolfModel1
...,...,...,...,...
33,covid-19,1236911907027791873,0.010,TeamGolfModel1
66,covid-19,1236859724353716224,0.005,TeamGolfModel1
132,covid-19,1237384116141768704,0.000,TeamGolfModel1
80,covid-19,1237179694815969280,0.000,TeamGolfModel1


In [89]:
# Write the predictions as a tab-separated file with no header row and no index column.
results.to_csv('final_results/system_1_results.tsv', sep='\t', header=False, index=False)