In [42]:
import pandas as pd
# Base names of the three dataset splits; the matching CSV sits next to the
# notebook under "<name>.csv".
train_file, dev_file, test_file = "train.data.txt", "dev.data.txt", "test.data.txt"

# keep_default_na=False makes pandas keep empty cells as '' instead of NaN.
train_data, dev_data, test_data = (
    pd.read_csv('./%s.csv' % split_name, keep_default_na=False)
    for split_name in (train_file, dev_file, test_file)
)


In [61]:
from transformers import BertTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import numpy as np
import json
from sklearn.preprocessing import LabelEncoder
import re
import scipy
import os
import keras
import string
import tensorflow as tf
# Fetch the NLTK resources this notebook relies on (no-ops when already cached).
nltk.download('stopwords')
# tokenize_tweet() lemmatises with WordNetLemmatizer, which raises LookupError
# on a fresh machine unless the WordNet corpus has been downloaded as well.
nltk.download('wordnet')
stopwords = nltk.corpus.stopwords.words('english')

# Shared, stateful transformers: both are fitted on the training split and then
# reused (transform only) for the dev and test splits.
le = LabelEncoder()
vectorizer = TfidfVectorizer()
def tokenize_tweet(string_data:str):
    """Normalise a raw tweet into a space-separated string of lemmatised tokens.

    Steps, in order:
      1. turn newlines into spaces and lower-case the text,
      2. strip URLs (http://..., https://..., www....),
      3. delete every character in ``string.punctuation`` (so "#covid_19"
         becomes "covid19" and "don't" becomes "dont"),
      4. split into runs of word characters,
      5. drop tokens found in the module-level ``stopwords`` list,
      6. lemmatise each remaining token with ``WordNetLemmatizer``.

    Args:
        string_data: the raw tweet text.

    Returns:
        The cleaned tokens joined by single spaces ('' if nothing survives).
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    # Raw string: '\w' in a normal literal is an invalid escape sequence.
    word_tokenizer = nltk.RegexpTokenizer(r'\w+')
    # Set membership instead of scanning the stopword list for every token.
    stop_words = set(stopwords)

    # Newlines become spaces (not ''), otherwise the last word of one line is
    # glued to the first word of the next, and a URL at the end of a line
    # would swallow the following word when the URL pattern is applied.
    data = string_data.replace('\n', ' ').lower()
    data = re.sub(r'https?://\S+|www\.\S+', '', data)
    data = re.sub('[%s]' % re.escape(string.punctuation), '', data)

    tokens = [tok for tok in word_tokenizer.tokenize(data) if tok not in stop_words]
    return ' '.join(wordnet_lemmatizer.lemmatize(tok) for tok in tokens)
    
def preprocess_token(df, dropNa=True):
    """Return a cleaned copy of ``df`` ready for feature extraction.

    The input frame is not modified. On the copy:
      * missing ``main_tweet`` values are replaced by '',
      * when ``dropNa`` is true, rows whose ``main_tweet`` is empty are removed
        (the original index labels of the surviving rows are kept),
      * ``main_tweet`` is normalised with ``tokenize_tweet``,
      * ``verified`` is cast to ``int``.

    Args:
        df: frame with at least the columns ``main_tweet`` and ``verified``.
        dropNa: drop rows without tweet text (default) or keep them with ''.

    Returns:
        The processed copy of ``df``.
    """
    data = df.copy()
    data['main_tweet'] = data['main_tweet'].fillna('')
    if dropNa:
        # Filter on the tweet column only. Replacing '' with NaN across the
        # whole frame would also rewrite empty strings in unrelated columns.
        data = data[data['main_tweet'] != ''].copy()
    data['main_tweet'] = data['main_tweet'].apply(tokenize_tweet)
    data['verified'] = data['verified'].astype(int)
    return data

# Preprocessing the training data

# ---- Training split -------------------------------------------------------
# The TF-IDF vocabulary and the label encoding are both fitted here.
train = preprocess_token(train_data)
traintfidf = scipy.sparse.csr_matrix(
    vectorizer.fit_transform(train['main_tweet'].tolist())
).todense()
d = pd.DataFrame(traintfidf)
verified = pd.DataFrame({"verified": train['verified'].tolist()})
followers = pd.DataFrame({"followers": train['followers'].tolist()})
train_label = le.fit_transform(train['label'].tolist())
# Model input = TF-IDF columns followed by the 0/1 `verified` flag.
train = tf.convert_to_tensor(pd.concat([d, verified], axis=1))

# ---- Dev split ------------------------------------------------------------
# Reuses the fitted vectorizer / label encoder (transform only).
dev = preprocess_token(dev_data)
devtfidf = scipy.sparse.csr_matrix(
    vectorizer.transform(dev['main_tweet'].tolist())
).todense()
devpd = pd.DataFrame(devtfidf)
dev_label = le.transform(dev['label'].tolist())
dev_verified = pd.DataFrame({"verified": dev['verified'].tolist()})
dev_followers = pd.DataFrame({"followers": dev['followers'].tolist()})
dev = tf.convert_to_tensor(pd.concat([devpd, dev_verified], axis=1))

# ---- Test split -----------------------------------------------------------
# dropNa=False: rows with an empty tweet are kept, so every test row gets a
# feature vector.
test = preprocess_token(test_data, dropNa=False)
testtfidf = scipy.sparse.csr_matrix(
    vectorizer.transform(test['main_tweet'].tolist())
).todense()
testpd = pd.DataFrame(testtfidf)
test_verified = pd.DataFrame({"verified": test['verified'].tolist()})
test_followers = pd.DataFrame({"followers": test['followers'].tolist()})
test = tf.convert_to_tensor(pd.concat([testpd, test_verified], axis=1))


[nltk_data] Downloading package stopwords to /home/kan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
import spacy
import scattertext as st

# Build a scattertext corpus from the training tweets, split by label.
nlp = spacy.load("en_core_web_sm")
corpus = st.CorpusFromPandas(
    train_data,
    category_col='label',
    text_col='main_tweet',
    nlp=nlp,
).build()

# Rank terms by their scaled F-score for the 'nonrumour' category and show
# the ten highest-scoring ones.
term_freq_df = corpus.get_term_freq_df()
term_freq_df['nonrumour score'] = corpus.get_scaled_f_scores('nonrumour')
print(list(term_freq_df.sort_values(by='nonrumour score', ascending=False).index[:10]))

# Render the interactive nonrumour-vs-rumour explorer as a standalone page.
html = st.produce_scattertext_explorer(
    corpus,
    category='nonrumour',
    category_name='Nonrumour',
    not_category_name='Rumour',
    width_in_pixels=1000,
    metadata=train_data['label'],
)

# Context manager so the file handle is flushed and closed deterministically
# (a bare open(...).write(...) leaves the handle open).
with open("./Convention-Visualization.html", 'wb') as html_file:
    bytes_written = html_file.write(html.encode('utf-8'))
bytes_written



['new coronaviru', 'prevent treat', 'herd immun', 'disea', 'herd', 'symptom', 'treat', 'effect', 'transmit', 'infect new']


746576

In [9]:
# Quick sanity check of the tweet normaliser on one example.
# NOTE(review): the output recorded for this cell may come from an earlier
# version of tokenize_tweet - re-run after a kernel restart to confirm.
sample_tweet = "Does the new coronavirus affect older people, or are younger people also susceptible? #Covid_19 #health https://t.co/Kg0Qb5oMs8"
tokenize_tweet(sample_tweet)

'new coronaviru affect older peopl younger peopl also suscept health'

In [62]:

from tensorflow.keras import layers
# Feed-forward binary classifier: three ReLU hidden layers (128 -> 64 -> 32)
# followed by a single sigmoid output unit. The input width is taken from the
# number of columns of the `train` tensor.
model = tf.keras.Sequential()
model.add(layers.Dense(128, activation='relu', input_shape=(train.shape[1],)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Train for 20 epochs in mini-batches of 10, evaluating on the dev split after
# every epoch.
history = model.fit(
    train,
    train_label,
    batch_size=10,
    epochs=20,
    validation_data=(dev, dev_label),
)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 128)               665728    
                                                                 
 dense_13 (Dense)            (None, 64)                8256      
                                                                 
 dense_14 (Dense)            (None, 32)                2080      
                                                                 
 dense_15 (Dense)            (None, 1)                 33        
                                                                 
Total params: 676,097
Trainable params: 676,097
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/

In [50]:
# Score the test tensor, threshold the sigmoid outputs at 0.5 to get 0/1
# labels, and flatten them to one dimension.
prediction = model.predict(test)
prediction = (prediction > 0.5).astype("int32").flatten()

# One row per prediction; the row index is written out as the "Id" column.
submission = pd.DataFrame({"Predicted": prediction})
submission.to_csv('submission.csv', index_label="Id")

In [1]:
import os


def update_data(df, filename):
    """Back-fill rows that have no tweet text from the per-tweet JSON dumps.

    For every row of ``df`` whose ``main_tweet`` is NaN, looks for
    ``./project-data/tweet-objects/tweet-objects/<main_tweet_id>.json``. When
    that file exists, the row's ``verified``, ``followers`` and ``main_tweet``
    are filled from the JSON fields ``user.verified``,
    ``user.followers_count`` and ``text``. Rows without a JSON file are left
    as they are. ``df`` itself is not modified; the updated copy is written to
    ``filename`` as CSV without the index.

    Prints the number of rows that were missing tweet text before the update.

    Args:
        df: frame with columns ``main_tweet_id``, ``main_tweet``, ``verified``
            and ``followers``.
        filename: path of the CSV file to write.
    """
    updated = df.copy()
    missing_ids = updated.loc[updated['main_tweet'].isna(), 'main_tweet_id'].tolist()
    print(len(missing_ids))

    for tweet_id in missing_ids:
        json_path = './project-data/tweet-objects/tweet-objects/%s.json' % tweet_id
        if not os.path.exists(json_path):
            continue
        # Explicit UTF-8 so decoding does not depend on the platform's locale.
        with open(json_path, 'r', encoding='utf-8') as json_file:
            tweet = json.load(json_file)

        # Build the row selector once and reuse it for all three columns.
        rows = updated['main_tweet_id'] == tweet_id
        updated.loc[rows, 'verified'] = tweet['user']['verified']
        updated.loc[rows, 'followers'] = tweet['user']['followers_count']
        updated.loc[rows, 'main_tweet'] = tweet['text']

    updated.to_csv(filename, index=False)

    

In [6]:
import pandas as pd
import json
# Raw splits. Default NA handling is used here so that empty tweet cells are
# read as NaN, which is what update_data() looks for.
t2 = pd.read_csv('./project-data/train.data.txt.csv')
t3 = pd.read_csv('./project-data/dev.data.txt.csv')
t4 = pd.read_csv('./project-data/test.data.txt.csv')

# Attach the gold labels (one label per line, same order as the rows).
# Context managers close the label files; they used to be left open.
with open('./project-data/train.label.txt', 'r') as label:
    label_data = [x.strip() for x in label]
t2['label'] = label_data

with open('./project-data/dev.label.txt', 'r') as label:
    label_data = [x.strip() for x in label]
t3['label'] = label_data

# A leftover second readlines() on the already-exhausted dev label file used
# to sit here; it only overwrote label_data with [] and has been dropped.

# Back-fill missing tweets and overwrite each split's CSV in place.
update_data(t2, './project-data/train.data.txt.csv')
update_data(t3, './project-data/dev.data.txt.csv')
update_data(t4, './project-data/test.data.txt.csv')


328
96
0
