In [1]:
from datetime import datetime
from datetime import timedelta
from textblob import TextBlob
import GetOldTweets3 as got
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns # advanced vizs
from gensim import models
import keras
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import collections
import re
import string
import csv
import nltk
import sklearn.metrics
import joblib
import random
from string import punctuation 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from matplotlib.lines import Line2D
%matplotlib inline

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def process(tweet):
    tweet = tweet.lower() # convert text to lower-case
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet) # remove URLs
    tweet = re.sub('@[^\s]+', 'AT_USER', tweet) # remove usernames
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) # remove the # in #hashtag
    return tweet   

def tokenize(tweet):
    _stopwords = set(stopwords.words('english') + list(punctuation) + ['AT_USER','URL'])
    tweet = word_tokenize(tweet) # remove repeated characters (helloooooooo into hello)
    return [word for word in tweet if word not in _stopwords]   

#Processing Tweets
def preprocessTweets(tweet):
    
    #Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    
    #Convert @username to __HANDLE
    tweet = re.sub('@[^\s]+','__HANDLE',tweet)  
    
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    
    #trim
    tweet = tweet.strip('\'"')
    
    # Repeating words like happyyyyyyyy
    rpt_regex = re.compile(r"(.)\1{1,}", re.IGNORECASE)
    tweet = rpt_regex.sub(r"\1\1", tweet)
    
    #Emoticons
    emoticons = \
    [
     ('__positive__',[ ':-)', ':)', '(:', '(-:', \
                       ':-D', ':D', 'X-D', 'XD', 'xD', \
                       '<3', ':\*', ';-)', ';)', ';-D', ';D', '(;', '(-;', ] ),\
     ('__negative__', [':-(', ':(', '(:', '(-:', ':,(',\
                       ':\'(', ':"(', ':((', ] ),\
    ]

    def replace_parenth(arr):
        return [text.replace(')', '[)}\]]').replace('(', '[({\[]') for text in arr]
    
    def regex_join(arr):
        return '(' + '|'.join( arr ) + ')'

    emoticons_regex = [ (repl, re.compile(regex_join(replace_parenth(regx))) ) \
            for (repl, regx) in emoticons ]
    
    for (repl, regx) in emoticons_regex :
        tweet = re.sub(regx, ' '+repl+' ', tweet)

     #Convert to lower case
    tweet = tweet.lower()
    
    return tweet

#Stemming of Tweets

def stem(tweet):
    stemmer = nltk.stem.PorterStemmer()
    tweet_stem = ''
    words = [word if(word[0:2]=='__') else word.lower() \
                for word in tweet.split() \
                if len(word) >= 3]
    words = [stemmer.stem(w) for w in words] 
    tweet_stem = ' '.join(words)
    return tweet_stem


#Predict the sentiment

def predict(tweet,classifier):

    tweet_processed = stem(preprocessTweets(tweet))

    if ( ('__positive__') in (tweet_processed)):
        sentiment  = 1
        return sentiment

    elif ( ('__negative__') in (tweet_processed)):
        sentiment  = 0
        return sentiment       
    else:
        X =  [tweet_processed]
        sentiment = classifier.predict(X)
        return (sentiment[0])

def processTweets(X_train, X_test):
    X_train = [stem(preprocessTweets(tweet)) for tweet in X_train]
    X_test = [stem(preprocessTweets(tweet)) for tweet in X_test]
    return X_train,X_test
        
# SVM classifier

def classifier(X_train,y_train):
    vec = TfidfVectorizer(min_df=5, max_df=0.95, sublinear_tf = True,use_idf = True,ngram_range=(1, 2))
    svm_clf =svm.LinearSVC(C=0.1)
    vec_clf = Pipeline([('vectorizer', vec), ('pac', svm_clf)])
    vec_clf.fit(X_train,y_train)
    joblib.dump(vec_clf, 'svmClassifier.pkl', compress=3)
    return vec_clf

def getTrainingAndTestData():
    X = []
    y = []

    pos = []
    neg = []

    #Training data 1: Sentiment 140
    f=open(r'./trainingandtestdata/training_sentiment140.csv','r', encoding='ISO-8859-1')
    reader = csv.reader(f)

    for row in reader:
        X.append(row[5])
        y.append(1 if (row[0]=='4') else 0)

    #Training data 2: bonzanini
    trainData = pd.read_csv("https://raw.githubusercontent.com/Vasistareddy/sentiment_analysis/master/data/train.csv")
    trainData['class'] = [1 if x == 'pos' else 0 for x in trainData['Label']]

    for index, row in trainData.iterrows():
        X.append(row['Content'])
        y.append(row['class'])

    df = pd.DataFrame(list(zip(X,y)), columns = ['text','class'])

    for l in df['class']:
        if l == 0:
            pos.append(0)
            neg.append(1)
        elif l == 1:
            pos.append(1)
            neg.append(0)
    df['Pos']= pos
    df['Neg']= neg
    df = df[['text', 'class', 'Pos', 'Neg']]
    return df

def get_tweet_sentiment(text): 
    ''' 
    Utility function to classify sentiment of passed tweet 
    using textblob's sentiment method 
    '''
    # create TextBlob object of passed tweet text 
    analysis = TextBlob(clean_tweet(text)) 
    return analysis.sentiment.polarity

def clean_tweet(text):
    ''' 
    Utility function to clean tweet text by removing links, special characters 
    using simple regex statements. 
    '''
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t]) |(\w+:\/\/\S+)", " ", text).split()) 

In [3]:
df = getTrainingAndTestData()

In [16]:
df['Text_Clean'] = df['text'].apply(lambda x: process(x))
filtered_words = [tokenize(sen) for sen in df.Text_Clean]

df['Text_Final'] = [' '.join(sen) for sen in filtered_words]
df['tokens'] = filtered_words

In [17]:
data = df[['text','Text_Final', 'tokens', 'class', 'Pos', 'Neg']]
data.head()

Unnamed: 0,text,Text_Final,tokens,class,Pos,Neg
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww 's bummer shoulda got david carr third day,"[awww, 's, bummer, shoulda, got, david, carr, ...",0,0,1
1,is upset that he can't update his Facebook by ...,upset ca n't update facebook texting ... might...,"[upset, ca, n't, update, facebook, texting, .....",0,0,1
2,@Kenichan I dived many times for the ball. Man...,dived many times ball managed save 50 rest go ...,"[dived, many, times, ball, managed, save, 50, ...",0,0,1
3,my whole body feels itchy and like its on fire,whole body feels itchy like fire,"[whole, body, feels, itchy, like, fire]",0,0,1
4,"@nationwideclass no, it's not behaving at all....",'s behaving 'm mad ca n't see,"['s, behaving, 'm, mad, ca, n't, see]",0,0,1


In [18]:
data_train, data_test = train_test_split(data, 
                                         test_size=0.10, 
                                         random_state=42)

In [19]:
data_train.head()

Unnamed: 0,text,Text_Final,tokens,class,Pos,Neg
369790,feels a headache brewing.,feels headache brewing,"[feels, headache, brewing]",0,0,1
582390,Cancelled plans to make other plans then those...,cancelled plans make plans plans got cancelled...,"[cancelled, plans, make, plans, plans, got, ca...",0,0,1
1509622,@Uncucumbered Been so busy reposting proxies a...,busy reposting proxies searching forgot basic ...,"[busy, reposting, proxies, searching, forgot, ...",1,1,0
1034412,"He is so cute, and he seems so sweet. I wish ...",cute seems sweet wish man like 'm fucked anyon...,"[cute, seems, sweet, wish, man, like, 'm, fuck...",1,1,0
378484,@leas sux maybe u shoulda try a martini?,sux maybe u shoulda try martini,"[sux, maybe, u, shoulda, try, martini]",0,0,1


### Split data for SVM

In [20]:
X_train = data_train['text'].tolist()
X_test = data_test['text'].tolist()
y_train = data_train['class'].tolist()
y_test = data_test['class'].tolist()

### Train SVM

In [25]:
X_train, X_test = processTweets(X_train, X_test)
vec_clf = classifier(X_train,y_train)
y_pred = vec_clf.predict(X_test)

NameError: name 'sklearn' is not defined

In [27]:
print(sklearn.metrics.classification_report(y_test, y_pred)) 

              precision    recall  f1-score   support

           0       0.82      0.80      0.81     79848
           1       0.81      0.83      0.82     80332

    accuracy                           0.82    160180
   macro avg       0.82      0.82      0.82    160180
weighted avg       0.82      0.82      0.82    160180



#### Text Blob

In [30]:
x = data_test['text'].tolist()
sentiment_list = [get_tweet_sentiment(clean_tweet(t)) for t in x]

[0.0, -0.43333333333333335, -0.75, 0.0, 0.5, 0.17045454545454544, 0.0571428571428572, 0.2777777777777778, 0.0, -0.05, 0.0, 0.23611111111111108, 0.0, -0.3, 0.10000000000000002, -0.30000000000000004, 0.29444444444444445, -0.3, 0.0, 0.39583333333333337, 0.3, 0.0, 0.1, 0.0, 0.0, 0.5, 0.8, 0.225, 0.425, 0.0, -0.5, -0.8, 1.0, 0.0, 0.2, 0.0, 0.13636363636363635, 1.0, -0.01666666666666668, 0.0, 0.6375000000000001, 0.012500000000000039, -0.6, 0.0, 0.0, -0.6, 0.25833333333333336, 0.75, -0.2, 0.0, 0.16818181818181818, 0.425, 0.19285714285714284, 0.0, 0.0, 0.2, 0.25, -0.3, 0.9, 0.1, 0.0, 0.0, -0.5, -0.25, 0.0, 0.25, 0.5952380952380952, 0.0, 0.0, -0.04583333333333334, 0.43333333333333335, 0.0, -0.6095238095238096, -0.8, 0.0, 0.0, 0.0, -0.375, 0.022222222222222213, 0.5, 0.0, 0.0, 0.11499999999999999, 0.15, 0.0, 0.0, -0.3047619047619048, 1.0, 0.0, 0.0, 0.0, -0.35000000000000003, 0.35, 0.2, 0.06666666666666667, -0.2, 0.375, 0.425, -0.4, 0.22348484848484848, 0.2857142857142857, 0.0, 0.01481481481481479

In [32]:
pred = []
for sent in sentiment_list:
    if sent > 0:
        pred.append(1)
    elif sent < 0:
        pred.append(0)
    else:
        k = random.randint(0, 1)
        pred.append(k)

Unnamed: 0,0,1
0,0.0,0
1,-0.433333,0
2,-0.75,0
3,0.0,0
4,0.5,1


In [33]:
print(sklearn.metrics.classification_report(y_test, pred)) 

              precision    recall  f1-score   support

           0       0.65      0.50      0.57     79848
           1       0.60      0.73      0.66     80332

    accuracy                           0.62    160180
   macro avg       0.62      0.62      0.61    160180
weighted avg       0.62      0.62      0.61    160180



### CNN

In [34]:
## build training vocabulary and get maximum training sentence length and total number of words training data
all_training_words = [word for tokens in data_train["tokens"] for word in tokens]
training_sentence_lengths = [len(tokens) for tokens in data_train["tokens"]]
TRAINING_VOCAB = sorted(list(set(all_training_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

11819881 words total, with a vocabulary size of 353145
Max sentence length is 1477


In [35]:
all_test_words = [word for tokens in data_test['tokens'] for word in tokens]
test_sentence_lengths = [len(tokens) for tokens in data_test['tokens']]
TEST_VOCAB = sorted(list(set(all_test_words)))
print('%s words total, with a vocabulary size of %s' % (len(all_test_words), len(TEST_VOCAB)))
print('Max sentence length is %s' % max(test_sentence_lengths))

1312494 words total, with a vocabulary size of 85910
Max sentence length is 1072


### Loading Word2vec from googlenews
* Download word2vec from here https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz

In [36]:
word2vec_path = './word2vec/GoogleNews-vectors-negative300.bin.gz'
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [37]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    if len(tokens_list)<1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    embeddings = clean_comments['tokens'].apply(lambda x: get_average_word2vec(x, vectors, 
                                                                                generate_missing=generate_missing))
    return list(embeddings)

In [38]:
training_embeddings = get_word2vec_embeddings(word2vec, data_train, generate_missing=True)
MAX_SEQUENCE_LENGTH = 50
EMBEDDING_DIM = 300

### Tokenize and Pad sequences
Each word is assigned an integer and that integer is placed in a list. As all the training sentences must have same input shape we pad the sentences.

In [39]:
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(data_train["Text_Final"].tolist())
training_sequences = tokenizer.texts_to_sequences(data_train["Text_Final"].tolist())

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

Found 287239 unique tokens.


In [40]:
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [41]:
train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word,index in train_word_index.items():
    train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

(287240, 300)


In [42]:
test_sequences = tokenizer.texts_to_sequences(data_test["Text_Final"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

### Define CNN
Text as a sequence is passed to a CNN. The embeddings matrix is passed to embedding_layer. Five different filter sizes are applied to each comment, and GlobalMaxPooling1D layers are applied to each layer. All the outputs are then concatenated. A Dropout layer then Dense then Dropout and then Final Dense layer is applied.

In [43]:
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    
    embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embeddings],
                            input_length=max_sequence_length,
                            trainable=False)
    
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    convs = []
    filter_sizes = [2,3,4,5,6]

    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=200, kernel_size=filter_size, activation='relu')(embedded_sequences)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)


    l_merge = concatenate(convs, axis=1)

    x = Dropout(0.1)(l_merge)  
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    preds = Dense(labels_index, activation='sigmoid')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model

In [44]:
label_names = ['Pos', 'Neg']

In [45]:
y_tr = data_train[label_names].values

In [46]:
x_train = train_cnn_data

In [47]:
model = ConvNet(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, 
                len(list(label_names)))

W0804 09:25:51.068963 46448 deprecation_wrapper.py:119] From C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0804 09:25:51.338022 46448 deprecation_wrapper.py:119] From C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0804 09:25:51.382022 46448 deprecation_wrapper.py:119] From C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0804 09:25:51.436024 46448 deprecation_wrapper.py:119] From C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\keras\backend\tensorflow_backe

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 50, 300)      86172000    input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 49, 200)      120200      embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 48, 200)      180200      embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_3 (

### Train CNN

In [48]:
num_epochs = 10
batch_size = 34

In [49]:
hist = model.fit(x_train, y_tr, epochs=num_epochs, validation_split=0.1, shuffle=True, batch_size=batch_size)

Train on 1297458 samples, validate on 144162 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Test CNN and Score

In [50]:
predictions = model.predict(test_cnn_data, batch_size=1024, verbose=1)



In [51]:
labels = [1, 0]
prediction_labels=[]
for p in predictions:
    prediction_labels.append(labels[np.argmax(p)])

In [52]:
print(sklearn.metrics.classification_report(y_test, prediction_labels)) 

              precision    recall  f1-score   support

           0       0.77      0.79      0.78     79848
           1       0.79      0.77      0.78     80332

    accuracy                           0.78    160180
   macro avg       0.78      0.78      0.78    160180
weighted avg       0.78      0.78      0.78    160180



## LSTM