In [1]:
#NLP with Twitter

#This dataset contains 1.6 million tweets

#Import Libraries
from __future__ import print_function
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
from datetime import datetime
from matplotlib.colors import ListedColormap
from sklearn.datasets import make_classification, make_moons, make_circles
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Activation
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
import keras.backend as K
from keras.wrappers.scikit_learn import KerasClassifier
import gensim

Using TensorFlow backend.


In [2]:
#Loading the dataset
train = pd.read_csv('C:\\Users\\sagi\\Desktop\\Learning\\ML\\Datasets\\Twitter.csv', encoding = "ISO-8859-1", names = ['1','2','3','4', "tweet"],header=None)

In [3]:
#Only interested in the tweets
train.drop(['1','2','3','4'],axis=1,inplace=True)

In [4]:
#Explore data
train.head()

Unnamed: 0,tweet
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,is upset that he can't update his Facebook by ...
2,@Kenichan I dived many times for the ball. Man...
3,my whole body feels itchy and like its on fire
4,"@nationwideclass no, it's not behaving at all...."


In [5]:
#Number of Words
train['word_count'] = train['tweet'].apply(lambda x: len(str(x).split(" ")))
train[['tweet','word_count']].head()

Unnamed: 0,tweet,word_count
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",20
1,is upset that he can't update his Facebook by ...,22
2,@Kenichan I dived many times for the ball. Man...,19
3,my whole body feels itchy and like its on fire,11
4,"@nationwideclass no, it's not behaving at all....",22


In [6]:
#Number of characters
train['char_count'] = train['tweet'].str.len() # also includes spaces
train[['tweet','char_count']].head()

Unnamed: 0,tweet,char_count
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",115
1,is upset that he can't update his Facebook by ...,111
2,@Kenichan I dived many times for the ball. Man...,89
3,my whole body feels itchy and like its on fire,47
4,"@nationwideclass no, it's not behaving at all....",111


In [7]:
#Average Word Length
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))

train['avg_word'] = train['tweet'].apply(lambda x: avg_word(x))
train[['tweet','avg_word']].head()

Unnamed: 0,tweet,avg_word
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",5.052632
1,is upset that he can't update his Facebook by ...,4.285714
2,@Kenichan I dived many times for the ball. Man...,3.944444
3,my whole body feels itchy and like its on fire,3.7
4,"@nationwideclass no, it's not behaving at all....",4.285714


In [8]:
#Number of stopwords

#First get the stopwords from NLTK
from nltk.corpus import stopwords
stop = stopwords.words('english')

train['stopwords'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x in stop]))
train[['tweet','stopwords']].head()

Unnamed: 0,tweet,stopwords
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",4
1,is upset that he can't update his Facebook by ...,8
2,@Kenichan I dived many times for the ball. Man...,5
3,my whole body feels itchy and like its on fire,4
4,"@nationwideclass no, it's not behaving at all....",10


In [9]:
#Number of special characters (hashtags for example)
train['hastags'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
train[['tweet','hastags']].head()

Unnamed: 0,tweet,hastags
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0


In [10]:
#Number of numerics
train['numerics'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
train[['tweet','numerics']].head()

Unnamed: 0,tweet,numerics
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0


In [11]:
#Number of Uppercase words
train['upper'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
train[['tweet','upper']].head()

Unnamed: 0,tweet,upper
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",1
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,1
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",1


In [12]:
#Preprocess the data

#Lower case everything
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train['tweet'].head()

0    @switchfoot http://twitpic.com/2y1zl - awww, t...
1    is upset that he can't update his facebook by ...
2    @kenichan i dived many times for the ball. man...
3       my whole body feels itchy and like its on fire
4    @nationwideclass no, it's not behaving at all....
Name: tweet, dtype: object

In [13]:
#Removing Punctuation
train['tweet'] = train['tweet'].str.replace('[^\w\s]','')
train['tweet'].head()

0    switchfoot httptwitpiccom2y1zl  awww thats a b...
1    is upset that he cant update his facebook by t...
2    kenichan i dived many times for the ball manag...
3       my whole body feels itchy and like its on fire
4    nationwideclass no its not behaving at all im ...
Name: tweet, dtype: object

In [14]:
#Removal of Stop Words- again, use NLTK
from nltk.corpus import stopwords
stop = stopwords.words('english')
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train['tweet'].head()

0    switchfoot httptwitpiccom2y1zl awww thats bumm...
1    upset cant update facebook texting might cry r...
2    kenichan dived many times ball managed save 50...
3                     whole body feels itchy like fire
4             nationwideclass behaving im mad cant see
Name: tweet, dtype: object

In [15]:
#Common word removal
#Let's check the 10 most common words
freq = pd.Series(' '.join(train['tweet']).split()).value_counts()[:10]
freq

im       125283
get       56091
day       54024
go        53447
like      52246
dont      51959
work      50206
cant      49308
good      48389
today     45181
dtype: int64

In [16]:
#Remove these words-they will not be useful in classification
freq = list(freq.index)
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['tweet'].head()

0    switchfoot httptwitpiccom2y1zl awww thats bumm...
1    upset update facebook texting might cry result...
2    kenichan dived many times ball managed save 50...
3                          whole body feels itchy fire
4                     nationwideclass behaving mad see
Name: tweet, dtype: object

In [17]:
#Rare words removal
#Let's check for the 10 least used words
freq = pd.Series(' '.join(train['tweet']).split()).value_counts()[-10:]
freq

shemaiahc              1
httptwitpiccom5oqp5    1
crashhhhhh             1
someonewhered          1
lesliewaltzes          1
twitterbugsback        1
suya                   1
smmguide               1
wooont                 1
apoloyais              1
dtype: int64

In [18]:
#Remove them
freq = list(freq.index)
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['tweet'].head()

0    switchfoot httptwitpiccom2y1zl awww thats bumm...
1    upset update facebook texting might cry result...
2    kenichan dived many times ball managed save 50...
3                          whole body feels itchy fire
4                     nationwideclass behaving mad see
Name: tweet, dtype: object

In [19]:
#Spelling correction
#Since it will take quite some time to run this code, I only did it for the first 5 tweets
from textblob import TextBlob
train['tweet'][:5].apply(lambda x: str(TextBlob(x).correct()))

0    switchfoot httptwitpiccom2y1zl www that summer...
1    upset update facebook testing might cry result...
2    kenichan dived many times ball managed save 50...
3                          whole body feels itchy fire
4                     nationwideclass behaving mad see
Name: tweet, dtype: object

In [20]:
#Tokenization - dividing the text into a sequence of words/sentences
TextBlob(str(train['tweet'])).words #first transform the tweets into a blob and then converted them into a series of words.

WordList(['0', 'switchfoot', 'httptwitpiccom2y1zl', 'awww', 'thats', 'bumm', '1', 'upset', 'update', 'facebook', 'texting', 'might', 'cry', 'result', '2', 'kenichan', 'dived', 'many', 'times', 'ball', 'managed', 'save', '50', '3', 'whole', 'body', 'feels', 'itchy', 'fire', '4', 'nationwideclass', 'behaving', 'mad', 'see', '5', 'kwesidei', 'whole', 'crew', '6', 'need', 'hug', '7', 'loltrish', 'hey', 'long', 'time', 'see', 'yes', 'rains', 'bit', 'bit', 'l', '8', 'tatiana_k', 'nope', 'didnt', '9', 'twittera', 'que', 'muera', '10', 'spring', 'break', 'plain', 'city', 'snowing', '11', 'repierced', 'ears', '12', 'caregiving', 'couldnt', 'bear', 'watch', 'thought', 'ua', 'loss', '13', 'octolinz16', 'counts', 'idk', 'either', 'never', 'talk', 'anymore', '14', 'smarrison', 'wouldve', 'first', 'didnt', 'gun', 'really', 'thoug', '15', 'iamjazzyfizzle', 'wish', 'got', 'watch', 'miss', 'iamlilnicki', '16', 'hollis', 'death', 'scene', 'hurt', 'severely', 'watch', 'film', 'wr', '17', 'file', 'taxes',

In [21]:
#Stemming -removal of suffices, like “ing”, “ly”, “s”, etc.
from nltk.stem import PorterStemmer
st = PorterStemmer()
train['tweet'][:5].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0    switchfoot httptwitpiccom2y1zl awww that bumme...
1    upset updat facebook text might cri result sch...
2    kenichan dive mani time ball manag save 50 res...
3                           whole bodi feel itchi fire
4                        nationwideclass behav mad see
Name: tweet, dtype: object

In [22]:
#Lemmatization- converts the word into its root word, rather than just stripping the suffices (more effective than stemming)
#Make use of vocabulary and does a morphological analysis to obtain the root word
from textblob import Word
train['tweet'] = train['tweet'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
train['tweet'].head()

0    switchfoot httptwitpiccom2y1zl awww thats bumm...
1    upset update facebook texting might cry result...
2    kenichan dived many time ball managed save 50 ...
3                           whole body feel itchy fire
4                     nationwideclass behaving mad see
Name: tweet, dtype: object

In [23]:
#Advance Text Processing

#N-grams - combination of multiple words used together (capture the language structure; what letter or word is likely to follow the given one)
TextBlob(str(train['tweet'][0])).ngrams(2) #bigram - 2 words

[WordList(['switchfoot', 'httptwitpiccom2y1zl']),
 WordList(['httptwitpiccom2y1zl', 'awww']),
 WordList(['awww', 'thats']),
 WordList(['thats', 'bummer']),
 WordList(['bummer', 'shoulda']),
 WordList(['shoulda', 'got']),
 WordList(['got', 'david']),
 WordList(['david', 'carr']),
 WordList(['carr', 'third'])]

In [24]:
#Term frequency (the ratio of the count of a word present in a sentence, to the length of the sentence)
#TF = (Number of times term T appears in the particular row) / (number of terms in that row)
tf1 = (train['tweet'][1:2]).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
tf1.columns = ['words','tf']
tf1

Unnamed: 0,words,tf
0,school,1
1,update,1
2,cry,1
3,might,1
4,also,1
5,result,1
6,upset,1
7,facebook,1
8,blah,1
9,texting,1


In [25]:
#Inverse Document Frequency -a word is not of much use to us if it’s appearing in all the documents
#The IDF of each word is the log of the ratio of the total number of rows to the number of rows in which that word is present.
#IDF = log(N/n), where, N is the total number of rows and n is the number of rows in which the word was present.
#The higher the value of IDF, the more unique is the word.
for i,word in enumerate(tf1['words']):
  tf1.loc[i, 'idf'] = np.log(train.shape[0]/(len(train[train['tweet'].str.contains(word)])))

tf1

Unnamed: 0,words,tf,idf
0,school,1,4.204654
1,update,1,5.246086
2,cry,1,4.968136
3,might,1,5.080774
4,also,1,5.072674
5,result,1,6.795624
6,upset,1,5.905766
7,facebook,1,5.8371
8,blah,1,6.520164
9,texting,1,7.674679


In [26]:
#Term Frequency – Inverse Document Frequency (TF-IDF) -the multiplication of the TF and IDF
tf1['tfidf'] = tf1['tf'] * tf1['idf']
tf1

Unnamed: 0,words,tf,idf,tfidf
0,school,1,4.204654,4.204654
1,update,1,5.246086,5.246086
2,cry,1,4.968136,4.968136
3,might,1,5.080774,5.080774
4,also,1,5.072674,5.072674
5,result,1,6.795624,6.795624
6,upset,1,5.905766,5.905766
7,facebook,1,5.8371,5.8371
8,blah,1,6.520164,6.520164
9,texting,1,7.674679,7.674679


In [27]:
# the TF-IDF has penalized words like ‘facebook’, ‘texting’ because they are commonly occurring words. 
#However, it has given a high weight to words that will be very useful in determining the sentiment of the tweet.
#We don’t have to calculate TF and IDF every time beforehand and then multiply it to obtain TF-IDF. 
#Instead, sklearn has a separate function to directly obtain it:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))
train_vect = tfidf.fit_transform(train['tweet'])

train_vect

<1048576x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 3871365 stored elements in Compressed Sparse Row format>

In [28]:
#Bag of Words -(BoW) refers to the representation of text which describes the presence of words within the text data
#Two similar text fields will contain similar kind of words, and will therefore have a similar bag of words.
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
train_bow = bow.fit_transform(train['tweet'])
train_bow

<1048576x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 4468660 stored elements in Compressed Sparse Row format>

In [29]:
#Sentiment Analysis
#First few tweets
train['tweet'][:5].apply(lambda x: TextBlob(x).sentiment)
#A tuple representing polarity and subjectivity of each tweet. 

0      (0.2, 0.45)
1       (0.0, 0.0)
2       (0.5, 0.5)
3       (0.2, 0.4)
4    (-0.625, 1.0)
Name: tweet, dtype: object

In [None]:
#Extract polarity as it indicates the sentiment as value nearer to 1 means a positive sentiment and values nearer to -1
train['sentiment'] = train['tweet'].apply(lambda x: TextBlob(x).sentiment[0] )
train[['tweet','sentiment']].head()

In [None]:
#Word Embeddings -  the representation of text in the form of vectors
#Similar words will have a minimum distance between their vectors.
#Word2Vec models require a lot of text-we can train it on our training data or we can use the pre-trained word vectors developed by Google, Wiki, etc.
#Download the pretrained model from (https://nlp.stanford.edu/projects/glove/)
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

In [None]:
#load the above word2vec file as a model
from gensim.models import KeyedVectors # load the Stanford GloVe model
filename = 'glove.6B.100d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)

In [None]:
#Example
#Let’s say our tweet contains a text saying ‘go away’.
#Obtain it’s word vector using the above model
model['go']

In [None]:
model['away']

In [None]:
#Take the average to represent the string ‘go away’ in the form of vectors
(model['go'] + model['away'])/2

In [None]:
#We have converted the string into a vector that can now be used as a feature in any modelling technique : )