## 1. English: 

### 1. Load dataset

In [1]:
import pandas as pd
train = pd.read_csv('training.tsv', sep='\t')
dev = pd.read_csv('dev.tsv', sep='\t')

In [2]:
train.loc[:3]

Unnamed: 0,topic_id,tweet_id,tweet_url,tweet_text,claim,claim_worthiness
0,covid-19,1234964653014384644,https://twitter.com/EricTrump/status/123496465...,Since this will never get reported by the medi...,1,1
1,covid-19,1234869939720216578,https://twitter.com/RealJamesWoods/status/1234...,"Thanks, #MichaelBloomberg. Here’s a handy litt...",0,0
2,covid-19,1234873136304267267,https://twitter.com/hayxsmith/status/123487313...,"Folks, when you say ""The corona virus isn't a ...",0,0
3,covid-19,1235071285027147776,https://twitter.com/ipspankajnain/status/12350...,Just 1 case of Corona Virus in India and peop...,1,1


### 2. Text pre-processing: tokenisation, POS, lemmanisation, stop words, punctuation, emoji, lower case.

In [3]:
from nltk.stem.lancaster import LancasterStemmer
from nltk import word_tokenize,pos_tag
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
from nltk.corpus import stopwords 
import emoji

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zhangyingji/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
def remove_emoji(text):
    return emoji.get_emoji_regexp().sub(u'', text)

In [5]:
def text_processor(df):
    for i in range(len(df)):
        # 1. tokenisation.
        token_word = word_tokenize(df.loc[i, 'tweet_text'])
        # 2. POS.
        token_words = pos_tag(token_word)
        # 3. WordNetLemmatizer
        words_lematizer = []
        wordnet_lematizer = WordNetLemmatizer()
        for word, tag in token_words:
            if tag.startswith('NN'):
                word_lematizer =  wordnet_lematizer.lemmatize(word, pos='n')  
            elif tag.startswith('VB'): 
                word_lematizer =  wordnet_lematizer.lemmatize(word, pos='v')  
            elif tag.startswith('JJ'): 
                word_lematizer =  wordnet_lematizer.lemmatize(word, pos='a')  
            elif tag.startswith('R'): 
                word_lematizer =  wordnet_lematizer.lemmatize(word, pos='r')
            else: 
                word_lematizer =  wordnet_lematizer.lemmatize(word)
            words_lematizer.append(word_lematizer)
        # 4. remove stop words.
        cleaned_words = [word for word in words_lematizer if word not in stopwords.words('english')]
        # 5. remove punctuation.
        characters = [',', '.','DBSCAN', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%','-','...','^','{','}']
        words_list = [word for word in cleaned_words if word not in characters]
        # 6. lower case.
        words_lists = [x.lower() for x in words_list ]
        # to string.
        text = str()
        for w in words_lists:
            text+=w
            text+=' '
        df.loc[i, 'tweet_text'] = remove_emoji(text)
    return df

In [6]:
train = text_processor(train)
dev = text_processor(dev)

In [7]:
train.loc[:3]

Unnamed: 0,topic_id,tweet_id,tweet_url,tweet_text,claim,claim_worthiness
0,covid-19,1234964653014384644,https://twitter.com/EricTrump/status/123496465...,since never get report medium i want share cop...,1,1
1,covid-19,1234869939720216578,https://twitter.com/RealJamesWoods/status/1234...,thanks michaelbloomberg here ’ handy little un...,0,0
2,covid-19,1234873136304267267,https://twitter.com/hayxsmith/status/123487313...,folks say `` the corona virus n't big deal kil...,0,0
3,covid-19,1235071285027147776,https://twitter.com/ipspankajnain/status/12350...,just 1 case corona virus india people crazy ma...,1,1


In [8]:
dev.loc[:3]

Unnamed: 0,topic_id,tweet_id,tweet_url,tweet_text,claim,claim_worthiness
0,covid-19,1235714275752267776,https://twitter.com/julialindau/status/1235714...,i land jfk report coronavirus milan lombardy —...,1,1
1,covid-19,1235256530728972290,https://twitter.com/stayfrea_/status/123525653...,alert️️️ the corona virus spread money if mone...,1,0
2,covid-19,1235648554338791427,https://twitter.com/A6Asap/status/123564855433...,covid-19 health advice️ http //t.co/xssao52smu,0,0
3,covid-19,1235674258858061825,https://twitter.com/DrDenaGrayson/status/12356...,️chinese doctor say autopsy coronavirus victim...,1,1


### 3. Extract N-gram based on tf.idf

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

count_vec = CountVectorizer(ngram_range=(1, 3), max_features=100)
tf = count_vec.fit_transform(train['tweet_text'].values.astype('U')).toarray()
vocab = count_vec.get_feature_names()

# td.idf
tfidf_vec = TfidfVectorizer(ngram_range=(1, 3), min_df=1, vocabulary=vocab)
tfidf= tfidf_vec.fit_transform(train['tweet_text'].values.astype('U')).toarray()

In [10]:
train_eng_tf = pd.DataFrame(tf, columns=vocab)
train_eng_tf.to_csv("English_ngram_train_tf.csv")
train_eng_tfidf = pd.DataFrame(tfidf, columns=vocab)
train_eng_tfidf.to_csv("English_ngram_train_tfidf.csv")

In [11]:
train_eng_tf.loc[:2]

Unnamed: 0,000,19,also,amp,and,breaking,call,care,case,china,...,us,virus,want,we,week,who,work,world,would,year
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [12]:
count_vec = CountVectorizer(ngram_range=(1, 3), max_features=100)
tf = count_vec.fit_transform(dev['tweet_text'].values.astype('U')).toarray()
vocab = count_vec.get_feature_names()

tfidf_vec = TfidfVectorizer(ngram_range=(1, 3), min_df=1, vocabulary=vocab)
tfidf= tfidf_vec.fit_transform(dev['tweet_text'].values.astype('U')).toarray()

In [13]:
dev_eng_tf = pd.DataFrame(tf, columns=vocab)
dev_eng_tf.to_csv("English_ngram_dev_tfidf.csv")
dev_eng_tfidf = pd.DataFrame(tfidf, columns=vocab)
dev_eng_tfidf.to_csv("English_ngram_dev_tfidf.csv")

### 4. classifier:

In [14]:
import numpy as np

train_tfidf_X = np.array(train_eng_tfidf)
test_tfidf_X = np.array(dev_eng_tfidf)

train_y = np.array(train['claim_worthiness'])
test_y = np.array(dev['claim_worthiness'])

In [15]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0).fit(train_tfidf_X, train_y)
print("tfidf score: ", clf.score(test_tfidf_X, test_y))

tfidf score:  0.48




In [16]:
from sklearn.svm import SVC

clf_tfidf = SVC(gamma='auto').fit(train_tfidf_X, train_y)
print("tfidf score: ", clf_tfidf.score(test_tfidf_X, test_y))

tfidf score:  0.52


In [17]:
from sklearn.neighbors import KNeighborsClassifier

neigh_tfidf = KNeighborsClassifier(n_neighbors=4).fit(train_tfidf_X, train_y)
print("tfidf score: ", neigh_tfidf.score(test_tfidf_X, test_y))

tfidf score:  0.49333333333333335


### 5. word embedding: mean & min & max using pre-trained word2vec model

In [18]:
from gensim.models import KeyedVectors
# need to download GoogleNews-vectors-negative300.bin. 
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [19]:
train_mean_X = list()
dev_mean_X = list()

for i in range(len(train)):
    train_text = [model[x] for x in train.loc[i, 'tweet_text'].split(' ') if x in model]
    vector = np.array(train_text).mean(axis=0)
    train_mean_X.append(vector)
    
for i in range(len(dev)):
    dev_text = [model[x] for x in dev.loc[i, 'tweet_text'].split(' ') if x in model]
    vector = np.array(dev_text).mean(axis=0)
    dev_mean_X.append(vector) 
    
train_mean_X = np.array(train_mean_X)
dev_mean_X = np.array(dev_mean_X)

In [20]:
train_max_X = list()
dev_max_X = list()

for i in range(len(train)):
    train_text = [model[x] for x in train.loc[i, 'tweet_text'].split(' ') if x in model]
    vector = np.array(train_text).max(axis=0)
    train_max_X.append(vector)
    
for i in range(len(dev)):
    dev_text = [model[x] for x in dev.loc[i, 'tweet_text'].split(' ') if x in model]
    vector = np.array(dev_text).max(axis=0)
    dev_max_X.append(vector)
    
train_max_X = np.array(train_max_X)
dev_max_X = np.array(dev_max_X)

In [21]:
train_min_X = list()
dev_min_X = list()

for i in range(len(train)):
    train_text = [model[x] for x in train.loc[i, 'tweet_text'].split(' ') if x in model]
    vector = np.array(train_text).min(axis=0)
    train_min_X.append(vector)
    
for i in range(len(dev)):
    dev_text = [model[x] for x in dev.loc[i, 'tweet_text'].split(' ') if x in model]
    vector = np.array(dev_text).min(axis=0)
    dev_min_X.append(vector)
    
train_min_X = np.array(train_min_X)
dev_min_X = np.array(dev_min_X)

In [22]:
print(train_mean_X.shape)
print(dev_mean_X.shape)

(487, 300)
(150, 300)


### 6. classifier: logistic regression & random Forest

In [23]:
from sklearn.linear_model import LogisticRegression

clf_mean = LogisticRegression(random_state=0, solver='lbfgs').fit(train_mean_X, train_y)
print("mean score: ", clf_mean.score(dev_mean_X, test_y))

clf_max = LogisticRegression(random_state=0, solver='lbfgs').fit(train_max_X, train_y)
print("max score: ", clf_max.score(dev_max_X, test_y))

clf_min = LogisticRegression(random_state=0, solver='lbfgs').fit(train_min_X, train_y)
print("min score: ", clf_min.score(dev_min_X, test_y))

mean score:  0.7
max score:  0.7
min score:  0.66


In [34]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=200)
rf.fit(train_mean_X, train_y)
preds_proba = rf.predict_proba(dev_mean_X)

In [35]:
results = pd.DataFrame(columns=['topic_id', 'tweet_id', 'score', 'run_id'])
results['tweet_id'] = list(dev['tweet_id'])
results['score'] = [x[1] for x in preds_proba]
results['topic_id'] = 'covid-19'
results['run_id'] = 'Model_1'

results.loc[:10]

Unnamed: 0,topic_id,tweet_id,score,run_id
0,covid-19,1235714275752267776,0.34,Model_1
1,covid-19,1235256530728972290,0.435,Model_1
2,covid-19,1235648554338791427,0.2,Model_1
3,covid-19,1235674258858061825,0.57,Model_1
4,covid-19,1235663306246860800,0.555,Model_1
5,covid-19,1235436227140055040,0.65,Model_1
6,covid-19,1235602629247537154,0.16,Model_1
7,covid-19,1235566351093137408,0.175,Model_1
8,covid-19,1235620307534258176,0.365,Model_1
9,covid-19,1235758466784014337,0.68,Model_1


In [36]:
results.to_csv('golf_system_results_1.tsv', sep='\t', header=False, index=False)

### mean:

INFO : ======================== RESULTS for golf_system_results_1.tsv ==========================

INFO : AVERAGE PRECISION:            0.7929    
INFO : ================================================================================

INFO : RECIPROCAL RANK:              1.0000    
INFO : ================================================================================

INFO : R-PRECISION (R=72):           0.7083    
INFO : ================================================================================

INFO : PRECISION@N:                  @1        @3        @5        @10       @20       @50       
INFO :                               1.0000    1.0000    1.0000    1.0000    0.9500    0.7600    
INFO : ================================================================================
