In [1]:
from datetime import datetime
from datetime import timedelta
from textblob import TextBlob
import GetOldTweets3 as got
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns # advanced vizs
from gensim import models
import keras
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import collections
import re
import string
import csv
import nltk
import sklearn.metrics
import joblib
import random
from string import punctuation 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from matplotlib.lines import Line2D
%matplotlib inline

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def process(tweet):
    """Normalise a raw tweet before tokenisation.

    The text is lower-cased first, then links are replaced by the
    placeholder ``URL``, @-mentions by ``AT_USER``, and the leading ``#`` is
    dropped from hashtags (the tag word itself is kept).

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The normalised text. The placeholders stay upper-case because they
        are inserted after lower-casing.
    """
    tweet = tweet.lower()
    # Raw strings keep the regex escapes (\., \s) from being read as
    # (invalid) Python string escapes.
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)  # links -> URL
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)  # @mentions -> AT_USER
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # "#tag" -> "tag"
    return tweet

def tokenize(tweet):
    """Split a processed tweet into word tokens and drop uninformative ones.

    Tokens are produced by NLTK's ``word_tokenize``. English stop words,
    single punctuation characters and the ``AT_USER`` / ``URL`` placeholders
    are filtered out.

    Parameters
    ----------
    tweet : str
        Tweet text, normally the output of ``process``.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # The stop list never changes, so build it on first use and keep it on
    # the function object instead of rebuilding it for every tweet.
    stop_set = getattr(tokenize, '_stop_set', None)
    if stop_set is None:
        stop_set = set(stopwords.words('english') + list(punctuation) + ['AT_USER', 'URL'])
        tokenize._stop_set = stop_set
    return [word for word in word_tokenize(tweet) if word not in stop_set]

#Processing Tweets
# Emoticon spellings grouped by the placeholder token that replaces them.
# '):' and ')-:' are the mirrored sad faces; the list previously repeated the
# mirrored *happy* faces here, which the positive pass had already consumed.
_EMOTICONS = [
    ('__positive__', [':-)', ':)', '(:', '(-:',
                      ':-D', ':D', 'X-D', 'XD', 'xD',
                      '<3', r':\*', ';-)', ';)', ';-D', ';D', '(;', '(-;']),
    ('__negative__', [':-(', ':(', '):', ')-:', ':,(',
                      ":'(", ':"(', ':((']),
]


def _emoticon_regex(spellings):
    """Compile one alternation regex matching any of the given emoticons.

    Each closing/opening parenthesis is widened to a character class so that
    bracket and brace variants (e.g. ``:]`` or ``:}``) match as well.
    """
    widened = [text.replace(')', r'[)}\]]').replace('(', r'[({\[]') for text in spellings]
    return re.compile('(' + '|'.join(widened) + ')')


# Compiled once at definition time instead of on every call.
_EMOTICON_REGEXES = [(token, _emoticon_regex(spellings)) for (token, spellings) in _EMOTICONS]

# A run of two or more identical characters (case-insensitive).
_REPEATED_CHAR_REGEX = re.compile(r"(.)\1{1,}", re.IGNORECASE)


def preprocessTweets(tweet):
    """Normalise a raw tweet for the TF-IDF / SVM pipeline.

    Steps, in order:

    1. links become ``URL`` and @-mentions become ``__HANDLE``;
    2. the ``#`` is removed from hashtags;
    3. leading/trailing quote characters are stripped;
    4. runs of a repeated character are cut down to two
       (``happyyyy`` -> ``happyy``);
    5. emoticons are replaced by ``__positive__`` / ``__negative__``
       (positive patterns are applied first);
    6. the whole string is lower-cased, so the placeholders from step 1 end
       up as ``url`` and ``__handle``.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The normalised, lower-cased text.
    """
    # Convert www.* or https?://* to URL
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)

    # Convert @username to __HANDLE
    tweet = re.sub(r'@[^\s]+', '__HANDLE', tweet)

    # Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)

    # Strip surrounding quote characters only (whitespace is left alone)
    tweet = tweet.strip('\'"')

    # Collapse repeated characters, e.g. happyyyyyyyy -> happyy
    tweet = _REPEATED_CHAR_REGEX.sub(r"\1\1", tweet)

    # Emoticons are matched case-sensitively, so this must run before lower()
    for (token, regex) in _EMOTICON_REGEXES:
        tweet = regex.sub(' ' + token + ' ', tweet)

    # Convert to lower case
    tweet = tweet.lower()

    return tweet

#Stemming of Tweets

def stem(tweet):
    """Porter-stem every whitespace-separated token of a tweet.

    Tokens shorter than three characters are discarded. Tokens that do not
    start with ``__`` are lower-cased first; every kept token, including the
    ``__``-prefixed ones, is then passed through the stemmer.

    Parameters
    ----------
    tweet : str
        Pre-processed tweet text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    porter = nltk.stem.PorterStemmer()
    stemmed = []
    for token in tweet.split():
        if len(token) < 3:
            continue
        if not token.startswith('__'):
            token = token.lower()
        stemmed.append(porter.stem(token))
    return ' '.join(stemmed)


#Predict the sentiment

def predict(tweet,classifier):
    """Return the sentiment of a single tweet (1 = positive, 0 = negative).

    The tweet is pre-processed and stemmed. If an emoticon placeholder is
    present it decides the outcome directly (positive is checked first);
    otherwise the supplied classifier is asked.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    classifier : object
        Fitted model whose ``predict`` accepts a list of strings.

    Returns
    -------
    The literal 1 or 0 for the emoticon shortcuts, otherwise the first
    element of ``classifier.predict``.
    """
    processed = stem(preprocessTweets(tweet))

    # Emoticon shortcuts: no model call needed.
    if '__positive__' in processed:
        return 1
    if '__negative__' in processed:
        return 0

    return classifier.predict([processed])[0]

def processTweets(X_train, X_test):
    """Pre-process and stem every tweet of the training and test sets.

    Parameters
    ----------
    X_train, X_test : iterable of str
        Raw tweet texts.

    Returns
    -------
    tuple of (list of str, list of str)
        The processed training texts and the processed test texts.
    """
    def _prepare(tweets):
        # Same two-step cleaning that predict() applies to a single tweet.
        return [stem(preprocessTweets(text)) for text in tweets]

    return _prepare(X_train), _prepare(X_test)
        
# SVM classifier

def classifier(X_train,y_train):
    """Fit a TF-IDF + linear SVM pipeline and save it to disk.

    The fitted pipeline is also written to ``svmClassifier.pkl`` in the
    current working directory (joblib, compression level 3).

    Parameters
    ----------
    X_train : list of str
        Processed training texts.
    y_train : list
        Class label for each training text.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline (steps ``'vectorizer'`` and ``'pac'``).
    """
    pipeline = Pipeline([
        # Unigrams and bigrams, ignoring terms seen in fewer than 5 documents
        # or in more than 95% of them.
        ('vectorizer', TfidfVectorizer(min_df=5,
                                       max_df=0.95,
                                       sublinear_tf=True,
                                       use_idf=True,
                                       ngram_range=(1, 2))),
        ('pac', svm.LinearSVC(C=0.1)),
    ])
    pipeline.fit(X_train, y_train)
    joblib.dump(pipeline, 'svmClassifier.pkl', compress=3)
    return pipeline

def getTrainingAndTestData():
    """Load and merge the two labelled corpora into one DataFrame.

    Source 1 is the local Sentiment140 CSV: the text is read from column
    index 5 and the label is 1 when column 0 equals ``'4'``, otherwise 0.
    Source 2 is a CSV fetched over HTTP: the text is read from ``Content``
    and the label is 1 when ``Label`` equals ``'pos'``, otherwise 0.

    Returns
    -------
    pandas.DataFrame
        Columns ``text``, ``class`` (0/1), ``Pos`` (equal to ``class``) and
        ``Neg`` (``1 - class``), with the Sentiment140 rows first.

    Raises
    ------
    OSError
        If the local CSV cannot be opened. Errors from the remote download
        propagate from ``pandas.read_csv``.
    """
    texts = []
    labels = []

    # Training data 1: Sentiment 140
    # `with` guarantees the handle is closed; newline='' is what the csv
    # module expects so that quoted fields containing newlines parse correctly.
    with open(r'./trainingandtestdata/training_sentiment140.csv', 'r',
              encoding='ISO-8859-1', newline='') as handle:
        for row in csv.reader(handle):
            texts.append(row[5])
            labels.append(1 if (row[0] == '4') else 0)

    # Training data 2: bonzanini
    trainData = pd.read_csv("https://raw.githubusercontent.com/Vasistareddy/sentiment_analysis/master/data/train.csv")
    texts.extend(trainData['Content'].tolist())
    labels.extend(1 if label == 'pos' else 0 for label in trainData['Label'])

    df = pd.DataFrame({'text': texts, 'class': labels})

    # `class` is always 0 or 1, so the one-hot columns follow directly.
    df['Pos'] = df['class']
    df['Neg'] = 1 - df['class']
    return df[['text', 'class', 'Pos', 'Neg']]

def get_tweet_sentiment(text):
    '''
    Score a tweet with TextBlob.

    The text is first passed through clean_tweet; the polarity of the
    resulting TextBlob's sentiment is returned.
    '''
    cleaned = clean_tweet(text)
    blob = TextBlob(cleaned)
    return blob.sentiment.polarity

def clean_tweet(text):
    '''
    Utility function to clean tweet text by removing @-mentions, links and
    special characters using a single regex substitution.

    Every match is replaced by a space and the result is re-joined with
    single spaces, so the returned string has no leading, trailing or
    repeated whitespace.
    '''
    # Three alternatives: @mention | any char that is not alphanumeric, space
    # or tab | scheme://link. The middle alternative must not be followed by
    # a literal space, or punctuation glued to a word ("great!!!") or at the
    # end of the text would be left in place.
    pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"
    return ' '.join(re.sub(pattern, " ", text).split())

In [3]:
# Load the combined labelled corpus (reads the local Sentiment140 CSV and
# downloads the second training CSV) into one DataFrame.
df = getTrainingAndTestData()

In [4]:
# Normalise each tweet, then tokenise it and drop stop words/punctuation.
df['Text_Clean'] = df['text'].apply(process)
filtered_words = [tokenize(sentence) for sentence in df['Text_Clean']]

# Keep both the token lists and a space-joined version of them.
df['Text_Final'] = [' '.join(tokens) for tokens in filtered_words]
df['tokens'] = filtered_words

In [5]:
# Keep the raw text, the cleaned text/tokens and the label columns
# (the intermediate Text_Clean column is left out), then preview the result.
data = df[['text','Text_Final', 'tokens', 'class', 'Pos', 'Neg']]
data.head()

Unnamed: 0,text,Text_Final,tokens,class,Pos,Neg
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww 's bummer shoulda got david carr third day,"[awww, 's, bummer, shoulda, got, david, carr, ...",0,0,1
1,is upset that he can't update his Facebook by ...,upset ca n't update facebook texting ... might...,"[upset, ca, n't, update, facebook, texting, .....",0,0,1
2,@Kenichan I dived many times for the ball. Man...,dived many times ball managed save 50 rest go ...,"[dived, many, times, ball, managed, save, 50, ...",0,0,1
3,my whole body feels itchy and like its on fire,whole body feels itchy like fire,"[whole, body, feels, itchy, like, fire]",0,0,1
4,"@nationwideclass no, it's not behaving at all....",'s behaving 'm mad ca n't see,"['s, behaving, 'm, mad, ca, n't, see]",0,0,1


In [6]:
# Hold out 10% of the rows for testing; the fixed random_state makes the
# split reproducible.
data_train, data_test = train_test_split(data, 
                                         test_size=0.10, 
                                         random_state=42)

In [7]:
# Preview the training split (rows keep their original index labels).
data_train.head()

Unnamed: 0,text,Text_Final,tokens,class,Pos,Neg
369790,feels a headache brewing.,feels headache brewing,"[feels, headache, brewing]",0,0,1
582390,Cancelled plans to make other plans then those...,cancelled plans make plans plans got cancelled...,"[cancelled, plans, make, plans, plans, got, ca...",0,0,1
1509622,@Uncucumbered Been so busy reposting proxies a...,busy reposting proxies searching forgot basic ...,"[busy, reposting, proxies, searching, forgot, ...",1,1,0
1034412,"He is so cute, and he seems so sweet. I wish ...",cute seems sweet wish man like 'm fucked anyon...,"[cute, seems, sweet, wish, man, like, 'm, fuck...",1,1,0
378484,@leas sux maybe u shoulda try a martini?,sux maybe u shoulda try martini,"[sux, maybe, u, shoulda, try, martini]",0,0,1


### Split data for SVM

In [8]:
# Raw tweet text and 0/1 class labels as plain Python lists for the
# scikit-learn pipeline.
X_train = data_train['text'].tolist()
X_test = data_test['text'].tolist()
y_train = data_train['class'].tolist()
y_test = data_test['class'].tolist()

### Train SVM

In [9]:
# Pre-process and stem both splits, fit the TF-IDF + LinearSVC pipeline
# (which also writes svmClassifier.pkl), then predict the test split.
# NOTE(review): X_train / X_test are overwritten with their processed
# versions, so re-running this cell processes already-processed text. The
# later LSTM cells also read these overwritten lists.
X_train, X_test = processTweets(X_train, X_test)
vec_clf = classifier(X_train,y_train)
y_pred = vec_clf.predict(X_test)

In [10]:
# Per-class precision / recall / F1 of the SVM on the held-out split.
svm_report = classification_report(y_test, y_pred)
print(svm_report)

              precision    recall  f1-score   support

           0       0.82      0.80      0.81     79848
           1       0.81      0.83      0.82     80332

    accuracy                           0.82    160180
   macro avg       0.82      0.82      0.82    160180
weighted avg       0.82      0.82      0.82    160180



## LSTM

In [12]:
# NOTE(review): several of these names are already imported in the first
# cell; the imports belong in that single top-of-notebook import cell.
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping

# Length every encoded tweet is padded/truncated to.
max_review_length = 280
tokenizer = Tokenizer(num_words=10000,  # max no. of unique words to keep
                      # The backslash is written as "\\" so the literal holds
                      # the same characters without an invalid "\]" escape.
                      filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
                      lower=True  # convert to lower case
                     )
# X_train here is the processed/stemmed text produced in the SVM section.
tokenizer.fit_on_texts(X_train)

In [13]:
# Encode the training texts as integer sequences and pad them to a fixed length.
X = sequence.pad_sequences(tokenizer.texts_to_sequences(X_train),
                           maxlen=max_review_length)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (1441620, 280)


In [26]:
# Recode the 0/1 class labels as -1/+1.
y = np.array(y_train, dtype='int8')
y = 2*y - 1

# One-hot targets. get_dummies orders its columns by value, so column 0 is
# the -1 (negative) class and column 1 is the +1 (positive) class.
# The dtype is pinned because newer pandas versions return booleans by default.
Y = pd.get_dummies(y, dtype='uint8').values

# Set aside a small balanced sample: 250 negatives and 250 positives.
np.random.seed(0)
test_inds = np.append(np.random.choice((np.where(y==-1))[0], 250, replace=False),
                      np.random.choice((np.where(y==1))[0], 250, replace=False))
# sorted() gives the remaining row indices in a guaranteed, reproducible order.
train_inds = sorted(set(range(len(y_train))) - set(test_inds))
train_data = X[train_inds]
train_labels = Y[train_inds]
test_data = X[test_inds]
test_labels = Y[test_inds]

### Create LSTM Network

In [27]:
EMBEDDING_DIM = 200
model = Sequential()
# The embedding's vocabulary size must match the tokenizer's word cap, so it
# is read from the tokenizer instead of being repeated as a literal.
model.add(Embedding(tokenizer.num_words, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
# Two stacked LSTMs: the first returns the full sequence for the second.
model.add(LSTM(250, dropout=0.2,return_sequences=True))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# Two softmax outputs, one per one-hot target column.
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 280, 200)          2000000   
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 280, 200)          0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 280, 250)          451000    
_________________________________________________________________
lstm_6 (LSTM)                (None, 100)               140400    
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 202       
Total params: 2,591,602
Trainable params: 2,591,602
Non-trainable params: 0
_________________________________________________________________
None


In [29]:
# Train for a single epoch in mini-batches of 40; validation_split=0.1
# reserves 10% of the training arrays for validation.
# NOTE(review): the recorded run below was interrupted partway through the
# first epoch, so no fully trained model was produced by this execution.
epochs = 1
batch_size = 40

model.fit(train_data, train_labels, 
          epochs=epochs, 
          batch_size=batch_size,
          validation_split=0.1)

Train on 1297008 samples, validate on 144112 samples
Epoch 1/1
 118440/1297008 [=>............................] - ETA: 39:12:20 - loss: 0.4503 - acc: 0.7879

KeyboardInterrupt: 

In [None]:
# Encode the held-out texts with the tokenizer fitted on the training text,
# pad them to the same length as the training sequences, and get the
# per-class outputs from the network.
seq = tokenizer.texts_to_sequences(X_test)
padded = sequence.pad_sequences(seq, maxlen=max_review_length)
pred = model.predict(padded)

In [None]:
# Map each row's arg-max output index back to a 0/1 class label.
# The one-hot training targets were built with pd.get_dummies on -1/+1
# labels, so output index 0 is the negative class (0) and index 1 is the
# positive class (1). The previous mapping [1, 0] inverted every prediction.
labels = [0, 1]
prediction_labels = [labels[np.argmax(p)] for p in pred]

In [None]:
# Per-class precision / recall / F1 of the LSTM on the held-out split.
lstm_report = classification_report(y_test, prediction_labels)
print(lstm_report)