In [1]:
import pandas as pd # provide sql-like data manipulation tools. very handy.
pd.options.mode.chained_assignment = None
import numpy as np # high dimensional vector computing library.
from copy import deepcopy
from cleaner import *
import re
import string
from string import punctuation
from random import shuffle
import warnings
warnings.filterwarnings('ignore')

import gensim
from gensim.models.word2vec import Word2Vec 
LabeledSentence = gensim.models.doc2vec.LabeledSentence 

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
tokenizer = TweetTokenizer()

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from generator import*
def ingest(path='meningitis/data.csv'):
    """Load the raw tweet CSV and keep only the rows that have tweet text.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the path this notebook has
        always read from, so existing ``ingest()`` calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        The file's contents without the 'date', 'username' and
        'retweets numbers' columns, without rows whose 'text' is null,
        and with a fresh 0..n-1 index.
    """
    data = pd.read_csv(path)
    data = data.drop(['date', 'username', 'retweets numbers'], axis=1)
    # The null-text filter used to be applied twice; one pass is sufficient.
    data = data[data['text'].notnull()]
    # drop=True discards the old index directly instead of materialising an
    # 'index' column and deleting it afterwards.
    data = data.reset_index(drop=True)
    print('dataset loaded with shape', data.shape)
    return data

data = ingest()

dataset loaded with shape (87215, 3)


In [2]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    """First cleaning pass over a piece of text.

    Lower-cases the text, removes anything enclosed in square brackets,
    removes every character listed in ``string.punctuation``, and removes
    every word that contains at least one digit.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        The cleaned text. Whitespace around removed pieces is left in place.
    """
    text = text.lower()
    # Raw strings: '\[' and '\w' are regex escapes, not valid Python string
    # escapes, and trigger warnings when written in ordinary string literals.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [3]:
# Apply a second round of cleaning
def clean_text_round2(text):
    """Second cleaning pass: strip curly quotes, ellipsis characters and newlines.

    Intended to catch leftovers that the first cleaning round does not remove.
    Returns the text with those characters deleted.
    """
    for pattern in ('[‘’“”…]', '\n'):
        text = re.sub(pattern, '', text)
    return text

In [4]:
from generator import*
# Collects one space-joined token string per tweet; tokenize() appends to it.
data_xml=[]

# Lower-case words that tokenize() removes from every tweet's token list.
stop_words = ['ourselves', 'hers', 'between', 'yourself', 
              'but', 'again', 'there', 'about', 'once', 
              'during', 'out', 'very', 'with', 'they',
              'own', 'an', 'be', 'some', 'for', 'do', 'its', 
              'yours', 'such', 'into', 'of', 'most', 'itself',
              'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 
              'from', 'him', 'each', 'the', 'themselves', 'until', 
              'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don',
              'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 
              'should', 'our', 'their', 'while', 'above', 'both', 'up', 'to', 'ours', 
              'she', 'all', 'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 
              'been', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 'because',
              'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he', 'you',
              'herself', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i', 'after',
              'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 
              'how', 'further', 'was', 'here', 'than']

def tokenize(tweet):
    """Turn one raw tweet into a list of cleaned tokens.

    Steps, in order: ``nlp`` -> ``negation_tag`` -> join on spaces ->
    ``clean_text_round1`` -> ``clean_text_round2`` -> ``TweetTokenizer`` ->
    stop-word removal -> removal of tokens starting with '@', '#', 'http'
    or 'www'.

    ``nlp`` and ``negation_tag`` come from the notebook's star imports; the
    only thing relied on here is that ``negation_tag`` returns an iterable of
    strings, since its result is passed to ``' '.join``.

    Side effect: the space-joined surviving tokens are appended to the
    module-level ``data_xml`` list, so calling this repeatedly (e.g. re-running
    the cell) keeps growing that list.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        The filtered tokens, or the string ``'NC'`` if the final filtering
        step raised an exception.
    """
    tweet = nlp(tweet)
    tweet = negation_tag(tweet)
    tweet = ' '.join(tweet)
    tweet = clean_text_round1(tweet)
    tweet = clean_text_round2(tweet)
    tokens = tokenizer.tokenize(tweet.lower())

    # Set membership instead of scanning the stop-word list for every token.
    stop_set = frozenset(stop_words)
    tokens = [t for t in tokens if t not in stop_set]

    # NOTE(review): clean_text_round1, as defined in this notebook, already
    # strips every string.punctuation character (including '@' and '#'), so
    # the '@'/'#' prefixes below can presumably no longer occur by this point.
    # Confirm whether mentions/hashtags were meant to be dropped before cleaning.
    skip_prefixes = ('@', '#', 'http', 'www')
    try:
        tokens = [t for t in tokens if not t.startswith(skip_prefixes)]
        data_xml.append(' '.join(tokens))
        return tokens
    except Exception:
        # Same 'NC' sentinel as before, but no longer swallows
        # KeyboardInterrupt / SystemExit the way a bare ``except:`` did.
        return 'NC'

In [5]:
vector = []


def feature_extraction(tweet):
    """Compute the feature row for one cleaned tweet and store it in ``vector``.

    The tweet is passed through ``convert_to_doc`` and the result through
    ``generator_vec`` (both supplied by the notebook's star imports); whatever
    ``generator_vec`` returns is appended to the module-level ``vector`` list.
    Nothing is returned.
    """
    vector.append(generator_vec(convert_to_doc(tweet)))


In [6]:
def postprocess(data, n=87215):
    """Tokenize the first ``n`` tweets and drop the ones that failed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'text' column.
    n : int, optional
        Number of leading rows to process. The default, 87215, equals the
        row count printed when the dataset was loaded.

    Returns
    -------
    pandas.DataFrame
        A copy of the first ``n`` rows with an added 'tokens' column, without
        the rows whose tokens equal the 'NC' sentinel, and with a fresh index.
        The previous index is kept as an 'index' column, as before.
    """
    # Explicit copy: assigning a column to the slice returned by head() is a
    # chained assignment whose warning is only hidden by the global
    # pd.options.mode.chained_assignment = None setting.
    data = data.head(n).copy()
    data['tokens'] = data['text'].progress_map(tokenize)
    data = data[data.tokens != 'NC']
    # No drop=True on purpose: the original in-place reset also kept the old
    # index as an 'index' column, and the returned schema is unchanged.
    return data.reset_index()

data = postprocess(data)

progress-bar: 100%|██████████| 87215/87215 [26:08<00:00, 55.60it/s]    


In [7]:
# One row per entry of data_xml, i.e. the space-joined tokens that tokenize()
# appended for each tweet, stored in a single 'Tweet' column.
df=pd.DataFrame(data=data_xml, columns=['Tweet'])

In [8]:
df.shape

(87215, 1)

In [9]:
def postprocess(data, n=87215):
    """Run ``feature_extraction`` over the 'Tweet' column of the first ``n`` rows.

    Called purely for its side effect (``feature_extraction`` appends to the
    module-level ``vector`` list); the mapped result is discarded and the
    function returns None.

    NOTE: this shares its name with the tokenizing ``postprocess`` defined
    earlier in the notebook and replaces it once this cell has run.
    """
    tweets = data.head(n)['Tweet']
    tweets.progress_map(feature_extraction)

postprocess(df)

progress-bar: 100%|██████████| 87215/87215 [17:22<00:00, 83.68it/s]  


In [10]:
# Each entry of `vector` becomes one row; the nine column names are applied
# positionally (assumes generator_vec returns a 9-item sequence — TODO confirm).
my_dataframe=pd.DataFrame(data=vector, columns=['Tweet','INFE','TOTH','TSEL','NEWS','CAMP','CONC','VACC','NEGA'])

## Infection

In [11]:
# Rows where INFE is 'Yes' and NEWS, CAMP, CONC, VACC and NEGA are all 'No'.
# .copy() makes the selection an independent frame, so the "Category" column
# added afterwards is not a chained assignment on a filtered slice.
infection_dataframe = my_dataframe[
    (my_dataframe.INFE == 'Yes') & (my_dataframe.NEWS == 'No') & (my_dataframe.CAMP == 'No') &
    (my_dataframe.CONC == 'No') & (my_dataframe.VACC == 'No') & (my_dataframe.NEGA == 'No')
].copy()

In [12]:
infection_dataframe.shape

(3607, 9)

In [13]:
infection_dataframe["Category"] = 'Infection'

## Concern

In [14]:
# Rows where INFE and CONC are both 'Yes' and NEWS, CAMP, VACC and NEGA are
# all 'No'. .copy() makes the selection an independent frame, so the
# "Category" column added afterwards is not a chained assignment on a slice.
concern_dataframe = my_dataframe[
    (my_dataframe.INFE == 'Yes') & (my_dataframe.NEWS == 'No') & (my_dataframe.CAMP == 'No') &
    (my_dataframe.CONC == 'Yes') & (my_dataframe.VACC == 'No') & (my_dataframe.NEGA == 'No')
].copy()

In [15]:
concern_dataframe.shape

(94, 9)

In [16]:
concern_dataframe["Category"] = 'Concern'

## News

In [17]:
# Rows where NEWS is 'Yes' and INFE, CAMP, CONC, VACC and NEGA are all 'No'.
# .copy() makes the selection an independent frame, so the "Category" column
# added afterwards is not a chained assignment on a filtered slice.
news_dataframe = my_dataframe[
    (my_dataframe.INFE == 'No') & (my_dataframe.NEWS == 'Yes') & (my_dataframe.CAMP == 'No') &
    (my_dataframe.CONC == 'No') & (my_dataframe.VACC == 'No') & (my_dataframe.NEGA == 'No')
].copy()

In [18]:
news_dataframe.shape

(1394, 9)

In [19]:
news_dataframe["Category"] = 'News'

## Campaign

In [20]:
# Rows where CAMP is 'Yes' and INFE, NEWS, CONC, VACC and NEGA are all 'No'.
# .copy() makes the selection an independent frame, so the "Category" column
# added afterwards is not a chained assignment on a filtered slice.
campaign_dataframe = my_dataframe[
    (my_dataframe.INFE == 'No') & (my_dataframe.NEWS == 'No') & (my_dataframe.CAMP == 'Yes') &
    (my_dataframe.CONC == 'No') & (my_dataframe.VACC == 'No') & (my_dataframe.NEGA == 'No')
].copy()

In [21]:
campaign_dataframe.shape

(971, 9)

In [22]:
campaign_dataframe["Category"] = 'Campaign'

## Vaccine

In [23]:
# Rows where VACC is 'Yes' and TOTH, INFE, NEWS, CAMP, CONC and NEGA are all
# 'No'. Unlike the other category filters, this one also constrains TOTH.
# .copy() makes the selection an independent frame, so the "Category" column
# added afterwards is not a chained assignment on a filtered slice.
vaccine_dataframe = my_dataframe[
    (my_dataframe.TOTH == 'No') & (my_dataframe.INFE == 'No') & (my_dataframe.NEWS == 'No') &
    (my_dataframe.CAMP == 'No') & (my_dataframe.CONC == 'No') & (my_dataframe.VACC == 'Yes') &
    (my_dataframe.NEGA == 'No')
].copy()

In [24]:
vaccine_dataframe.shape

(1908, 9)

In [25]:
vaccine_dataframe["Category"] = 'Vaccine'

### Merging the categories

In [26]:
# Stack the Concern rows on top of the Campaign rows. pd.concat replaces
# DataFrame.append, which is deprecated and no longer exists in pandas 2.x.
# NOTE(review): the infection, news and vaccine frames are not part of this
# merge — confirm that is intended.
df_f = pd.concat([concern_dataframe, campaign_dataframe])

In [27]:
# Load the previously saved results.
# NOTE(review): this rebinds `df`, which until now held the one-column 'Tweet'
# frame, and the same file is written again at the end of the notebook, so
# re-running these cells appends the new rows to it once more.
df = pd.read_csv('data/final_data.csv')

In [28]:
df.shape

(92105, 12)

In [29]:
# Add the newly categorised rows below the previously saved ones. pd.concat
# replaces DataFrame.append, which is deprecated and gone in pandas 2.x.
df_final = pd.concat([df, df_f])

In [30]:
df_final.shape

(93170, 12)

In [31]:
# index=False: the row index of the concatenated frame carries no information,
# and writing it makes every read_csv -> concat -> to_csv cycle add another
# unnamed index column to the file (presumably why the reloaded frame has
# more columns than the frames built in this notebook — confirm).
df_final.to_csv('data/final_data.csv', index=False)