In [1]:
import pandas as pd
train = pd.read_csv('train_E6oV3lV.csv')

In [2]:
#1. BASIC FEATURE EXTRACTION
#(1.1)
#extracting number of words in each tweet
#negative sentiments generally have less number of words
train['word_count'] = train['tweet'].apply(lambda x: len(str(x).split(" ")))
train[['tweet','word_count']].head()

Unnamed: 0,tweet,word_count
0,@user when a father is dysfunctional and is s...,21
1,@user @user thanks for #lyft credit i can't us...,22
2,bihday your majesty,5
3,#model i love u take with u all the time in ...,17
4,factsguide: society now #motivation,8


In [3]:
#(1.2)
#number of characters in each tweet
#using length of the tweet
train['char_count'] = train['tweet'].str.len() ## this also includes spaces
train[['tweet','char_count']].head()

Unnamed: 0,tweet,char_count
0,@user when a father is dysfunctional and is s...,102
1,@user @user thanks for #lyft credit i can't us...,122
2,bihday your majesty,21
3,#model i love u take with u all the time in ...,86
4,factsguide: society now #motivation,39


In [4]:
#(1.3)
#average length of the tweet
#sum of length of all the words/the total length of the tweet
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))

train['avg_word'] = train['tweet'].apply(lambda x: avg_word(x))
train[['tweet','avg_word']].head()

Unnamed: 0,tweet,avg_word
0,@user when a father is dysfunctional and is s...,4.555556
1,@user @user thanks for #lyft credit i can't us...,5.315789
2,bihday your majesty,5.666667
3,#model i love u take with u all the time in ...,4.928571
4,factsguide: society now #motivation,8.0


In [5]:
#(1.4)
#number of stopwords
#sometimes calculating the number of stopwords can also give us some extra information which we might have been losing before
from nltk.corpus import stopwords
stop = stopwords.words('english')

train['stopwords'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x in stop]))
train[['tweet','stopwords']].head()

Unnamed: 0,tweet,stopwords
0,@user when a father is dysfunctional and is s...,10
1,@user @user thanks for #lyft credit i can't us...,5
2,bihday your majesty,1
3,#model i love u take with u all the time in ...,5
4,factsguide: society now #motivation,1


In [6]:
#(1.5)
#calculating the number of hashtags or mentions present in it
#helps in extracting extra information from our text data
#using ‘starts with’ function because hashtags (or mentions) always appear at the beginning of a word
train['hastags'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
train[['tweet','hastags']].head()

Unnamed: 0,tweet,hastags
0,@user when a father is dysfunctional and is s...,1
1,@user @user thanks for #lyft credit i can't us...,3
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,1
4,factsguide: society now #motivation,1


In [7]:
#(1.6)
#calculating number of numerics
train['numerics'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
train[['tweet','numerics']].head()

Unnamed: 0,tweet,numerics
0,@user when a father is dysfunctional and is s...,0
1,@user @user thanks for #lyft credit i can't us...,0
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,0
4,factsguide: society now #motivation,0


In [8]:
#(1.7)
#number of uppercase words
#Anger or rage is often expressed by writing in UPPERCASE words 
#which makes this a necessary operation to identify those words
train['upper'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
train[['tweet','upper']].head()

Unnamed: 0,tweet,upper
0,@user when a father is dysfunctional and is s...,0
1,@user @user thanks for #lyft credit i can't us...,0
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,0
4,factsguide: society now #motivation,0


In [9]:
#2. BASIC PRE-PROCESSING
#(2.1) Lowercase - transforming data to lowercase
#helps avoid having multiple copies of the same word(careful with Apple and apple)
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train['tweet'].head()

0    @user when a father is dysfunctional and is so...
1    @user @user thanks for #lyft credit i can't us...
2                                  bihday your majesty
3    #model i love u take with u all the time in ur...
4                  factsguide: society now #motivation
Name: tweet, dtype: object

In [10]:
#(2.2) Remove punctuations (since it doesnt add any extra information)
train['tweet'] = train['tweet'].str.replace('[^\w\s]','')
train['tweet'].head()

0    user when a father is dysfunctional and is so ...
1    user user thanks for lyft credit i cant use ca...
2                                  bihday your majesty
3    model i love u take with u all the time in urð...
4                    factsguide society now motivation
Name: tweet, dtype: object

In [11]:
#(2.3) remove stopwords
from nltk.corpus import stopwords
stop = stopwords.words('english')
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train['tweet'].head()

0    user father dysfunctional selfish drags kids d...
1    user user thanks lyft credit cant use cause do...
2                                       bihday majesty
3                model love u take u time urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [12]:
#(2.4) Common Word Removal
#let’s check the 10 most frequently occurring words in our text data
freq = pd.Series(' '.join(train['tweet']).split()).value_counts()[:10]

In [13]:
freq

user     17473
love      2647
ð         2511
day       2199
â         1797
happy     1663
amp       1582
im        1139
u         1136
time      1110
dtype: int64

In [14]:
#take call to remove
freq = list(freq.index)
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['tweet'].head()

0    father dysfunctional selfish drags kids dysfun...
1    thanks lyft credit cant use cause dont offer w...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [15]:
#(2.5) Rare word Removal
#Because they’re so rare, the association between them and other words is dominated by noise
freq = pd.Series(' '.join(train['tweet']).split()).value_counts()[-10:]

In [16]:
freq

onðð                    1
spencer                 1
maytheforcebewithyou    1
learningâ               1
pfyp                    1
thehuntingpay           1
getityourself           1
robben                  1
basquetball             1
160                     1
dtype: int64

In [17]:
#take call to remove rare words
freq = list(freq.index)
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['tweet'].head()

0    father dysfunctional selfish drags kids dysfun...
1    thanks lyft credit cant use cause dont offer w...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [19]:
#(2.6) Spelling Correction
#to reduce multiple copies of words
#eg. “Analytics” and “analytcs” will be treated as different words even if they are used in the same sense
#achieved using textblob library
from textblob import TextBlob
train['tweet'][:5].apply(lambda x: str(TextBlob(x).correct()))
#it is time-consuming
#Therefore, just for the purposes of learning, applying it on only the first 5 rows
#Not always accurate
#Words are often used in their abbreviated form
#eg.‘your’ is used as ‘ur’ but using this tramsforms it to 'or' instead

0    father dysfunctional selfish drags kiss dysfun...
1    thanks left credit can use cause dont offer wh...
2                                       midday majesty
3                               model take or ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [20]:
#(2.7) tokenization
#Conversion into words or sentences
#textblob library to first transform our tweets into a blob and then converted them into a series of words
TextBlob(train['tweet'][1]).words

WordList(['thanks', 'lyft', 'credit', 'cant', 'use', 'cause', 'dont', 'offer', 'wheelchair', 'vans', 'pdx', 'disapointed', 'getthanked'])

In [21]:
#(2.8) Stemming
#Conversion into root words
from nltk.stem import PorterStemmer
st = PorterStemmer()
train['tweet'][:5].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0        father dysfunct selfish drag kid dysfunct run
1    thank lyft credit cant use caus dont offer whe...
2                                       bihday majesti
3                              model take urð ðððð ððð
4                              factsguid societi motiv
Name: tweet, dtype: object

In [22]:
#(2.9) Lemmatization
# converts the word into its root word, rather than just stripping the suffices
# makes use of the vocabulary and does a morphological analysis to obtain the root word
from textblob import Word
train['tweet'] = train['tweet'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
train['tweet'].head()

0    father dysfunctional selfish drag kid dysfunct...
1    thanks lyft credit cant use cause dont offer w...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [23]:
#3 ADVANCED TEXT PROCESSING - extracting features
#(3.1) N-Grams
#Extracting bigrams
TextBlob(train['tweet'][0]).ngrams(2)

[WordList(['father', 'dysfunctional']),
 WordList(['dysfunctional', 'selfish']),
 WordList(['selfish', 'drag']),
 WordList(['drag', 'kid']),
 WordList(['kid', 'dysfunction']),
 WordList(['dysfunction', 'run'])]

In [24]:
#(3.2) Term Frequency
#TF = (Number of times term T appears in the particular row) / (number of terms in that row)
tf1 = (train['tweet'][1:2]).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
tf1.columns = ['words','tf']

In [25]:
tf1

Unnamed: 0,words,tf
0,thanks,1
1,pdx,1
2,credit,1
3,wheelchair,1
4,cant,1
5,dont,1
6,use,1
7,van,1
8,lyft,1
9,cause,1


In [30]:
#(3.3) IDF Inverse Document Frequency
#A word is not of much use to us if it’s appearing in all the documents
#Therefore, IDF of each word is log of (total number of rows)/(number of rows in which that word is present)
#IDF = log(N/n), 
#where, N is the total number of rows, n is the number of rows in which the word was present
import numpy as np
for i,word in enumerate(tf1['words']):
    tf1.loc[i, 'idf'] = np.log(train.shape[0]/(len(train[train['tweet'].str.contains(word)])))

In [31]:
tf1
#higher value of idf indicates how unique the word is

Unnamed: 0,words,tf,idf
0,thanks,1,4.597751
1,pdx,1,8.762865
2,credit,1,7.327781
3,wheelchair,1,9.273691
4,cant,1,3.538194
5,dont,1,3.745585
6,use,1,3.552287
7,van,1,5.236505
8,lyft,1,8.762865
9,cause,1,5.690172


In [32]:
#(3.4)Term Frequency – Inverse Document Frequency = (TF)*(IDF)
#penalises commonly occuring words
#gives high weight to words like 'disappointed' because they help in determining the sentiment of the tweet
#instead of using this: 
#tf1['tfidf'] = tf1['tf'] * tf1['idf']
#tf1, sklearn package can directly be used
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))
train_vect = tfidf.fit_transform(train['tweet'])

In [33]:
train_vect
#pre-processing steps like lower-casing and removal of stopwords can also be performed before this

<31962x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 114036 stored elements in Compressed Sparse Row format>

In [34]:
#(3.5)Bag of Words
#refers to representation of text which describes the presence of words within the text data
#two similar text fields will contain similar kind of words, and will therefore have a similar bag of word
#from the text alone we can learn something about the meaning of the document
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
train_bow = bow.fit_transform(train['tweet'])

In [35]:
train_bow

<31962x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 128386 stored elements in Compressed Sparse Row format>

In [36]:
#(3.6.i) Checking the sentiment of first few tweets
train['tweet'][:5].apply(lambda x: TextBlob(x).sentiment)
#Here, we only extract polarity
#indicated: value nearer to 1 = positive sentiment and values nearer to -1 = negative sentiment

0    (-0.3, 0.5354166666666667)
1                    (0.2, 0.2)
2                    (0.0, 0.0)
3                    (0.0, 0.0)
4                    (0.0, 0.0)
Name: tweet, dtype: object

In [37]:
#using above idea as a feature for building a machine learning model
train['sentiment'] = train['tweet'].apply(lambda x: TextBlob(x).sentiment[0] )
train[['tweet','sentiment']].head()

Unnamed: 0,tweet,sentiment
0,father dysfunctional selfish drag kid dysfunct...,-0.3
1,thanks lyft credit cant use cause dont offer w...,0.2
2,bihday majesty,0.0
3,model take urð ðððð ððð,0.0
4,factsguide society motivation,0.0


In [38]:
#(3.7) Word Embeddings
#representation of text in the form of vectors
#similar words will have a minimum distance between their vectors
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'glove.840B.300d.txt'
word2vec_output_file = 'glove.840B.300d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

(2196017, 300)

In [39]:
#loading the above file as a model
from gensim.models import KeyedVectors # load the Stanford GloVe model
filename = 'glove.840B.300d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)

In [40]:
model['go']

array([  1.38929993e-01,  -1.90559998e-02,  -3.38910013e-01,
         1.21509999e-01,   3.65229994e-01,  -1.73910007e-01,
        -2.67350003e-02,  -5.03350012e-02,   2.47429997e-01,
         2.45309997e+00,  -4.21130002e-01,   2.36320004e-01,
         2.05129996e-01,  -1.09369997e-02,  -1.14799999e-01,
        -3.76479998e-02,  -1.34399995e-01,   8.61240029e-01,
        -3.58029991e-01,   9.25249979e-02,   2.80750006e-01,
         1.36490002e-01,   2.08189994e-01,   6.02059998e-02,
        -1.82290003e-01,   1.01719998e-01,  -1.31999999e-01,
        -3.15979987e-01,   2.22409993e-01,  -1.90760002e-01,
        -1.08840000e-02,   1.69880003e-01,   8.03450029e-03,
         1.33369997e-01,   1.77239999e-01,  -1.91620007e-01,
         3.36809993e-01,   3.01860005e-01,   6.16540015e-02,
         7.69060012e-03,  -5.44059992e-01,   5.01420014e-02,
        -4.31150012e-02,  -2.62410015e-01,   4.74620014e-02,
         3.36699992e-01,  -2.86489993e-01,  -2.74140000e-01,
         2.67760009e-02,

In [41]:
model['away']

array([  3.53040010e-01,   1.03629999e-01,  -2.32190005e-02,
         1.69939995e-01,   2.22729996e-01,  -2.68660009e-01,
        -4.60290015e-01,  -1.61709994e-01,  -2.63289995e-02,
         2.92149997e+00,   4.22010012e-02,   3.83540004e-01,
        -9.18510035e-02,  -2.02150002e-01,  -1.59960002e-01,
        -2.67069995e-01,  -2.30960008e-02,   7.17239976e-01,
        -1.53679997e-01,   1.96669996e-01,   9.34500024e-02,
         2.24309996e-01,  -4.59459983e-02,   2.05160007e-01,
         4.52539995e-02,  -6.43279999e-02,  -6.82540014e-02,
        -2.39140004e-01,  -9.19890031e-02,  -3.15789990e-02,
        -4.46229987e-02,   2.67910004e-01,  -7.01160014e-01,
        -1.12760000e-01,   2.36560002e-01,  -6.27140030e-02,
        -8.37529972e-02,   2.97930002e-01,   7.55629987e-02,
         1.89549997e-01,   9.14330035e-03,  -1.26230001e-01,
        -2.27799997e-01,  -6.04370013e-02,   1.51380002e-01,
         5.64199984e-02,  -3.10369998e-01,  -2.93850005e-01,
        -5.61200023e-01,