# Data ingestion

In [2]:
import tensorflow as tf
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import cufflinks as cf
import plotly
import plotly.graph_objs as go
import datetime
import boto3
import s3fs
pd.set_option('display.max_colwidth', -1)
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

In [3]:
bucketname = 'ops-vw-interns-climate-perception-tweets'
dataframe = pd.DataFrame(columns=('creation date', 'tweet', 
                                  'username'))
s3 = boto3.resource('s3')

s3_tweets = s3.Bucket(bucketname).objects.filter(Prefix='raw-data/', Delimiter='/').all()
for file in s3_tweets:
    temp_df = pd.read_json('s3://{}/{}'.format(bucketname, file.key))
    dataframe = dataframe.append(temp_df, sort=False, ignore_index=True)
dataframe_tweets = dataframe['tweet']
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183957 entries, 0 to 183956
Data columns (total 3 columns):
creation date    183957 non-null object
tweet            183957 non-null object
username         183957 non-null object
dtypes: object(3)
memory usage: 4.2+ MB


In [4]:
pattern = 'climate change|climatechange|global warming|globalwarming'
dataframe = dataframe[dataframe.tweet.str.contains('(?i)'+pattern)]
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 157533 entries, 0 to 183956
Data columns (total 3 columns):
creation date    157533 non-null object
tweet            157533 non-null object
username         157533 non-null object
dtypes: object(3)
memory usage: 4.8+ MB


In [5]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ViMs\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Preprocessing of tweet texts
def format_tweet(tweet):
    processed_tweet=''
    for word in tweet.split():
        
        #Remove stopwords
        if not word in stop_words:
        
            # Removing URL from tweet
            processed_word = re.sub('([^0-9A-Za-z \t])|(\w+:\/\/\S+)', ' ', word)

            # remove all single characters
            processed_word = re.sub('\s+[a-zA-Z]\s+', ' ', processed_word)

            # Remove single characters from the start
            processed_word = re.sub('\^[a-zA-Z]\s+', ' ', processed_word) 

            # Substituting multiple spaces with single space
            processed_word = re.sub('\s+', '', processed_word, flags=re.I)

            # Removing prefixed 'b'
            processed_word = re.sub('^b\s+', '', processed_word)

            # Removing &amp
            processed_word = re.sub('&amp', '&', processed_word)
            processed_word = re.sub('amp', '', processed_word)

            # Removing breaks
            processed_word = re.sub('<br/>', '', processed_word)

            # converts to lower
            processed_word = processed_word.lower()

            processed_tweet= processed_tweet+' '+processed_word

    return processed_tweet        
    
dataframe['tweet'] = [format_tweet(tweet) for tweet in dataframe['tweet']]
print('unprocessed tweet: \n'+ dataframe['tweet'][15])
print('\nprocessed tweet: \n' + dataframe['tweet'][15])

unprocessed tweet: 
 aoc my dreams motherhood now bittersweet because global warming  

processed tweet: 
 aoc my dreams motherhood now bittersweet because global warming  


In [7]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ViMs\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [8]:
def get_sentiment(polarity):
    if polarity > 0:
        return 'positive'
    if polarity == 0:
        return 'neutral'
    else:
        return 'negative'

word_count = [len(tweet.split()) for tweet in dataframe['tweet']]

polarity_tweets = [round(sid.polarity_scores(tweet)['compound'], 2)
                   for tweet in dataframe['tweet']]

sentiment_tweets = [get_sentiment(polarity) for polarity in polarity_tweets]

zipped_list = list(zip(sentiment_tweets, polarity_tweets,
                       dataframe['tweet'], word_count, dataframe['username'], 
                       dataframe['creation date']))


# Store to new dataframe with sentiment values and polarities
sentiment_df = pd.DataFrame(zipped_list, 
                            columns=['Sentiment', 'Polarity', 
                                     'Tweet', 'Word count', 'Username', 
                                     'Creation Date'])
sentiment_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157533 entries, 0 to 157532
Data columns (total 6 columns):
Sentiment        157533 non-null object
Polarity         157533 non-null float64
Tweet            157533 non-null object
Word count       157533 non-null int64
Username         157533 non-null object
Creation Date    157533 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 7.2+ MB


In [9]:
def get_sentiment(polarity):
    if polarity > 0:
        return 'positive'
    if polarity == 0:
        return 'neutral'
    else:
        return 'negative'

word_count = [len(tweet.split()) for tweet in dataframe['tweet']]

polarity_tweets = [round(sid.polarity_scores(tweet)['compound'], 2)
                   for tweet in dataframe['tweet']]

sentiment_tweets = [get_sentiment(polarity) for polarity in polarity_tweets]

dataframe['word count'] = word_count
dataframe['polarity'] = polarity_tweets
dataframe['sentiment'] = sentiment_tweets
dataframe

Unnamed: 0,creation date,tweet,username,word count,polarity,sentiment
0,2019-10-10 11:33:39,im democracy freedom speech unless course were talking climate change protesters guys beaten up pepper sprayed locked andor ran cops inconvenience me,7c1f90bd9bdc70cc059640a7a6209389,22,-0.25,negative
1,2019-10-10 11:33:21,trees bans new technology arsenal chelsea spurs climate change coyg afc,445730c4efd881eb829d96f12e4319d2,11,0.00,neutral
2,2019-10-10 11:33:17,doug fords conservative government made next progress plan cut carbon emissions spent millions fighting real plan cut emissions onpoli cdnpoli climatechange elxn43,9bd8e0dfed263f561f5ddde60c39e700,22,-0.44,negative
3,2019-10-10 11:33:02,the saudi kings oil company chevron russias mob run gazprom top three producers greenhouse gases driving entire worlds climate change trump republicans best people trumpgenocide climatestrike republicangenocide,6422870a3db0164455e8a89116207558,27,0.72,positive
4,2019-10-12 21:00:26,global warming joke its bad science its high jacking public policy its greatest scam history,c54d49297625ed6860a07d9cba0f875d,15,-0.05,negative
6,2019-10-12 21:00:22,terrorism killer heatwaves deadly drought predicted spain climate change study reveals mediterranean warming 20 faster global average,3b1b1f77ed7014eef44d54954d7187d9,17,-0.85,negative
7,2019-10-12 21:00:19,salem thanks listening rant socioeconomic implications climate change red wine something weird me hereforyou lyftreviews winewednesday,6fad50840dd41242695277924e4a6f16,16,-0.05,negative
9,2019-10-12 21:00:16,is climate change doomsday coming,243923b928994c1d2bc22897ffbd8719,5,-0.59,negative
11,2019-10-12 21:00:14,someone research paper much carbon footprint wastage metal straws created everyone buys dont use still use plastic straws talk saving world you making worst climate change inevitable,77ba5036eccabee6c58dee4411c8135f,27,-0.48,negative
13,2019-10-12 21:00:08,yeah fing fools want school climate change depopulation estimate 227 million 2025 lets hope theyre included that,705da23959fa17c5d11d7a53a6157a19,17,0.30,positive


In [10]:
sentiment_df['Sentiment'].iplot(
    kind='hist',
    linecolor='black',
    theme='ggplot',
    xTitle='Sentiment',
    yTitle='count',
    title='Class distribution')
plt.show()

In [99]:
neu_tweets = dataframe[dataframe['sentiment']=='neutral'][['sentiment', 'tweet']]
pos_tweets = dataframe[dataframe['sentiment']=='positive'][['sentiment', 'tweet']][:len(neu_tweets)]
neg_tweets = dataframe[dataframe['sentiment']=='negative'][:len(neu_tweets)]
print(len(pos_tweets))
print(len(neu_tweets))
print(len(neg_tweets))

balanced_features = pos_tweets['tweet'].append(neu_tweets['tweet']).append(neg_tweets['tweet'])
balanced_labels = pos_tweets['sentiment'].append(neu_tweets['sentiment']).append(neg_tweets['sentiment'])
balanced_features = [str(feature) for feature in balanced_features]

26975
26975
26975


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(balanced_features,
                                                    balanced_labels, test_size=0.2,
                                                    random_state=1)

# Tensorflow

## Text classification with an RNN

### Embedding layer 
Converts sequence of word in sequences of vectors. Words with similar meanings often have similar vectors

In [100]:
import tensorflow_datasets as tfds

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_lenght = max([len(tweet.split()) for tweet in balanced_features])

tf_tokenizer = Tokenizer()
tf_tokenizer.fit_on_texts(balanced_features)

voc_size = len(tf_tokenizer.word_index)+1

balanced_train_features = tf_tokenizer.texts_to_sequences(X_train)
balanced_test_features = tf_tokenizer.texts_to_sequences(X_test)

converted_train_features = pad_sequences(balanced_train_features, 
                                         maxlen=max_lenght, padding='post')

converted_test_features = pad_sequences(balanced_test_features, 
                                         maxlen=max_lenght, padding='post'

def convert_label(label):
    if label == 'negative':
        return 0
    elif label == 'neutral':
        return 1
    else:
        return 2
    
converted_train_labels = [convert_label(label) for label in y_train]
converted_train_labels = tf.keras.utils.to_categorical(converted_train_labels, 3, dtype='float32')

                                        
converted_test_labels = [convert_label(label) for label in y_test]
converted_test_labels = tf.keras.utils.to_categorical(converted_test_labels, 3, dtype='float32')

In [None]:
model = tf.keras.Sequential([
    # Layer for wordvector training
    tf.keras.layers.Embedding(voc_size, 64, input_length=max_lenght),
    # Layer to improve performance based on hardware and prevent overfitting
    tf.keras.layers.GRU(units=32, dropout=0.2, recurrent_dropout=0.2)
    # Recurrent layer with long short term memory cells
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    # Layers for cells with activation functions
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')
])

In [87]:
model.compile(loss='categorical_crossentropy', optimizer='adam', 
              metrics=['accuracy'])

model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 58, 64)            4577472   
_________________________________________________________________
bidirectional_8 (Bidirection (None, 128)               66048     
_________________________________________________________________
dense_16 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_17 (Dense)             (None, 3)                 195       
Total params: 4,651,971
Trainable params: 4,651,971
Non-trainable params: 0
_________________________________________________________________


In [88]:
history = model.fit(x=converted_train_features,y=converted_train_labels,
                    epochs=4, validation_data=(converted_test_features, converted_test_labels), verbose=2)

Train on 64740 samples, validate on 16185 samples
Epoch 1/10
64740/64740 - 297s - loss: 0.3717 - accuracy: 0.8631 - val_loss: 0.5914 - val_accuracy: 0.8157
Epoch 2/10
64740/64740 - 308s - loss: 0.1369 - accuracy: 0.9563 - val_loss: 0.6207 - val_accuracy: 0.8263
Epoch 3/10
64740/64740 - 299s - loss: 0.0790 - accuracy: 0.9729 - val_loss: 0.8990 - val_accuracy: 0.7975
Epoch 4/10
64740/64740 - 295s - loss: 0.0474 - accuracy: 0.9831 - val_loss: 0.9023 - val_accuracy: 0.8062
Epoch 5/10
64740/64740 - 305s - loss: 0.0262 - accuracy: 0.9907 - val_loss: 1.3375 - val_accuracy: 0.7621
Epoch 6/10
64740/64740 - 305s - loss: 0.0132 - accuracy: 0.9956 - val_loss: 1.8042 - val_accuracy: 0.7240
Epoch 7/10
64740/64740 - 279s - loss: 0.0070 - accuracy: 0.9979 - val_loss: 1.2435 - val_accuracy: 0.7993
Epoch 8/10
64740/64740 - 298s - loss: 0.0049 - accuracy: 0.9986 - val_loss: 2.0687 - val_accuracy: 0.7347
Epoch 9/10
64740/64740 - 317s - loss: 0.0042 - accuracy: 0.9986 - val_loss: 1.9030 - val_accuracy: 0.7

In [96]:
test_samples = ['I hate global warming', 'I love it']
test_samples = tf_tokenizer.texts_to_sequences(test_samples)
test_samples = pad_sequences(test_samples, maxlen=max_lenght)

model.predict(test_samples)

array([[3.5180591e-07, 9.9998605e-01, 1.3562473e-05],
       [1.0537495e-06, 9.9996877e-01, 3.0128394e-05]], dtype=float32)