#  Importing important library

In [1]:
import nltk
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Collecting DataSet

In [2]:
df = pd.read_csv('train.csv', error_bad_lines=False, encoding='ISO-8859-1')
df.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...


In [3]:
df.shape

(99989, 3)

# Preprocessing Steps


# 1. Counting the number of word

In [4]:
df['word_count']=df['SentimentText'].apply(lambda x: len(str(x).split(" ")))
df[['SentimentText', 'word_count']].head()

Unnamed: 0,SentimentText,word_count
0,is so sad for my APL frie...,28
1,I missed the New Moon trail...,25
2,omg its already 7:30 :O,19
3,.. Omgaga. Im sooo im gunna CRy. I'...,36
4,i think mi bf is cheating on me!!! ...,24


# 2.Counting Average Word Lentgh

In [5]:
def avg_word(sentence):
    words = sentence.split()
    return(sum(len(word) for word in words)/len(words))


df['avg_word']=df['SentimentText'].apply(lambda x:avg_word(x))
df[['SentimentText', 'avg_word']].head(10)

Unnamed: 0,SentimentText,avg_word
0,is so sad for my APL frie...,4.857143
1,I missed the New Moon trail...,4.5
2,omg its already 7:30 :O,3.8
3,.. Omgaga. Im sooo im gunna CRy. I'...,3.88
4,i think mi bf is cheating on me!!! ...,3.333333
5,or i just worry too much?,3.333333
6,Juuuuuuuuuuuuuuuuussssst Chillin!!,16.5
7,Sunny Again Work Tomorrow :-| ...,4.857143
8,handed in my uniform today . i miss you ...,3.8
9,hmmmm.... i wonder how she my number @-),4.125


# 3.Counting Number of StopWords

In [6]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
df['stopwords']=df['SentimentText'].apply(lambda x:len([x for x in x.split() if x in stop]))
df[['SentimentText', 'stopwords']].head(10)

Unnamed: 0,SentimentText,stopwords
0,is so sad for my APL frie...,4
1,I missed the New Moon trail...,1
2,omg its already 7:30 :O,1
3,.. Omgaga. Im sooo im gunna CRy. I'...,7
4,i think mi bf is cheating on me!!! ...,3
5,or i just worry too much?,4
6,Juuuuuuuuuuuuuuuuussssst Chillin!!,0
7,Sunny Again Work Tomorrow :-| ...,0
8,handed in my uniform today . i miss you ...,4
9,hmmmm.... i wonder how she my number @-),4


# 4. Removing of stop words

In [7]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
df['SentimentText'] =df['SentimentText'].apply(lambda x:" ".join(x for x in x.split() if x not in stop))
df['SentimentText'].head(10)

0                          sad APL friend.............
1                         I missed New Moon trailer...
2                                  omg already 7:30 :O
3    .. Omgaga. Im sooo im gunna CRy. I've dentist ...
4                       think mi bf cheating me!!! T_T
5                                          worry much?
6                   Juuuuuuuuuuuuuuuuussssst Chillin!!
7             Sunny Again Work Tomorrow :-| TV Tonight
8                  handed uniform today . miss already
9                          hmmmm.... wonder number @-)
Name: SentimentText, dtype: object

# 5. Counting Number of Special Character

In [8]:
df['hastags']=df['SentimentText'].apply(lambda x:len([x for x in x.split() if x.startswith('#')]))
df[['SentimentText', 'hastags']].head(10)

Unnamed: 0,SentimentText,hastags
0,sad APL friend.............,0
1,I missed New Moon trailer...,0
2,omg already 7:30 :O,0
3,.. Omgaga. Im sooo im gunna CRy. I've dentist ...,0
4,think mi bf cheating me!!! T_T,0
5,worry much?,0
6,Juuuuuuuuuuuuuuuuussssst Chillin!!,0
7,Sunny Again Work Tomorrow :-| TV Tonight,0
8,handed uniform today . miss already,0
9,hmmmm.... wonder number @-),0


# 6. Convert in to LowerCase

In [9]:
df['SentimentText'] = df['SentimentText'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df['SentimentText'].head()

0                          sad apl friend.............
1                         i missed new moon trailer...
2                                  omg already 7:30 :o
3    .. omgaga. im sooo im gunna cry. i've dentist ...
4                       think mi bf cheating me!!! t_t
Name: SentimentText, dtype: object

# 7. Removing Punctuation

In [10]:
df['SentimentText']=df['SentimentText'].str.replace('[^\w\s]','')
df['SentimentText'].head(10)

0                                       sad apl friend
1                            i missed new moon trailer
2                                    omg already 730 o
3     omgaga im sooo im gunna cry ive dentist since...
4                          think mi bf cheating me t_t
5                                           worry much
6                     juuuuuuuuuuuuuuuuussssst chillin
7                sunny again work tomorrow  tv tonight
8                   handed uniform today  miss already
9                                 hmmmm wonder number 
Name: SentimentText, dtype: object

# 8. Removing Common word

In [11]:
freq =pd.Series(' '.join(df['SentimentText']).split()).value_counts()[:10]
freq

i       33315
im      11010
good     5790
you      5524
like     5473
get      5272
lol      5043
u        5021
know     4529
it       4443
dtype: int64

In [12]:
freq =list(freq.index)
df['SentimentText'] =df['SentimentText'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
df['SentimentText'].head(10)

0                                       sad apl friend
1                              missed new moon trailer
2                                    omg already 730 o
3    omgaga sooo gunna cry ive dentist since 11 sup...
4                          think mi bf cheating me t_t
5                                           worry much
6                     juuuuuuuuuuuuuuuuussssst chillin
7                 sunny again work tomorrow tv tonight
8                    handed uniform today miss already
9                                  hmmmm wonder number
Name: SentimentText, dtype: object

In [13]:
freq = pd.Series(' '.join(df['SentimentText']).split()).value_counts()[-10:]
freq

wtff                 1
yhh                  1
tracked              1
httpchilpit528243    1
scaglione            1
bulavalava           1
bettinagon           1
yeahhhhh             1
quotzebraquot        1
ajdpadbury           1
dtype: int64

# 9. Spelling Correction

In [14]:
from textblob import TextBlob
df['SentimentText'][:10].apply(lambda x:str(TextBlob(x).correct()))

0                                       sad all friend
1                              missed new moon trailer
2                                    org already 730 o
3    omaha soon unna cry give dentist since 11 supp...
4                          think mi of cheating me tut
5                                           worry much
6                       juuuuuuuuuuuuuuuuussssst chill
7                 sunny again work tomorrow to tonight
8                    handed uniform today miss already
9                                  homme wonder number
Name: SentimentText, dtype: object

# 9. Tokenization of words

In [15]:
TextBlob(df['SentimentText'][1]).words # here tokenize the 1 index value


WordList(['missed', 'new', 'moon', 'trailer'])

# 10.Stemming of words

In [16]:
from nltk.stem import PorterStemmer
st = PorterStemmer()
df['SentimentText'][:10].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0                                       sad apl friend
1                                miss new moon trailer
2                                    omg alreadi 730 o
3    omgaga sooo gunna cri ive dentist sinc 11 supo...
4                             think mi bf cheat me t_t
5                                           worri much
6                     juuuuuuuuuuuuuuuuussssst chillin
7                 sunni again work tomorrow tv tonight
8                      hand uniform today miss alreadi
9                                  hmmmm wonder number
Name: SentimentText, dtype: object

# 11. Lemmatization of words

In [17]:
from textblob import Word
df['SentimentText'] =df['SentimentText'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
df['SentimentText'].head(10)

0                                       sad apl friend
1                              missed new moon trailer
2                                    omg already 730 o
3    omgaga sooo gunna cry ive dentist since 11 sup...
4                          think mi bf cheating me t_t
5                                           worry much
6                     juuuuuuuuuuuuuuuuussssst chillin
7                 sunny again work tomorrow tv tonight
8                    handed uniform today miss already
9                                  hmmmm wonder number
Name: SentimentText, dtype: object

# 12. Extracting bigrams

In [18]:
TextBlob(df['SentimentText'][0]).ngrams(2)

[WordList(['sad', 'apl']), WordList(['apl', 'friend'])]

# Term Frequency – Inverse Document Frequency (TF-IDF)

In [19]:
tf1= (df['SentimentText'][1:2]).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
tf1.columns =['words','tf']
tf1

Unnamed: 0,words,tf
0,new,1
1,trailer,1
2,moon,1
3,missed,1


In [20]:
for i, word in enumerate(tf1['words']):
    tf1.loc[i, 'idf']=np.log(df.shape[0]/(len(df[df['SentimentText'].str.contains(word)])))
tf1

Unnamed: 0,words,tf,idf
0,new,1,3.598563
1,trailer,1,7.278709
2,moon,1,6.29246
3,missed,1,4.950371


In [21]:
tf1['tfidf'] = tf1['tf']*tf1['idf']
tf1

Unnamed: 0,words,tf,idf,tfidf
0,new,1,3.598563,3.598563
1,trailer,1,7.278709,7.278709
2,moon,1,6.29246,6.29246
3,missed,1,4.950371,4.950371


# Feature Extraction

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word', 
                        stop_words ='english',ngram_range=(1,1)) 
train_vect = tfidf.fit_transform(df['SentimentText'])
train_vect

<99989x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 343433 stored elements in Compressed Sparse Row format>

# Sentimental Analysis

In [23]:
df['SentimentText'][:5].apply(lambda x:TextBlob(x).sentiment)

0                                   (-0.5, 1.0)
1    (0.13636363636363635, 0.45454545454545453)
2                                    (0.0, 0.0)
3                                    (0.0, 0.0)
4                                    (0.0, 0.0)
Name: SentimentText, dtype: object

In [24]:
df['sentiment']=df['SentimentText'].apply(lambda x:TextBlob(x).sentiment[0])
df[['SentimentText','sentiment']].head(10)

Unnamed: 0,SentimentText,sentiment
0,sad apl friend,-0.5
1,missed new moon trailer,0.136364
2,omg already 730 o,0.0
3,omgaga sooo gunna cry ive dentist since 11 sup...,0.0
4,think mi bf cheating me t_t,0.0
5,worry much,0.2
6,juuuuuuuuuuuuuuuuussssst chillin,0.0
7,sunny again work tomorrow tv tonight,0.0
8,handed uniform today miss already,0.0
9,hmmmm wonder number,0.0
