In [1]:
import pandas as pd
from textblob import TextBlob
from nltk.tokenize.toktok import ToktokTokenizer
import re

In [2]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Shared NLTK Toktok tokenizer instance; remove_stopwords() uses it to split text into tokens.
tokenizer= ToktokTokenizer()

In [4]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 12.8/12.8 MB 5.8 MB/s eta 0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [5]:
import spacy
# The visible cells only read token lemmas from this pipeline, so the
# named-entity recognizer ('ner') is switched off to save work per document.
# The previous value 'near' is not a component of this pipeline, so it did not
# disable anything.
nlp= spacy.load('en_core_web_sm', disable=['ner'])

In [6]:
TextBlob("She is a good girl").sentiment

Sentiment(polarity=0.7, subjectivity=0.6000000000000001)

In [7]:
TextBlob("She is not a good girl").sentiment

Sentiment(polarity=-0.35, subjectivity=0.6000000000000001)

In [8]:
import os

In [9]:
# Directory that holds train.csv. It defaults to the original location; set the
# SENTIMENT_DATA_DIR environment variable to run the notebook on another
# machine without editing this cell.
data_dir = os.environ.get('SENTIMENT_DATA_DIR', 'C:\\Users\\User\\Documents\\')
os.chdir(data_dir)

In [10]:
# Load the labelled text data (columns `text` and `label`); the file name is
# relative, so it is resolved against the current working directory.
train= pd.read_csv('train.csv')

In [11]:
train.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [12]:
# Draw a balanced subset of 5000 rows per class. The fixed random_state makes
# the draw repeatable, so a Restart & Run All reproduces the same rows.
label_0= train[train['label']==0].sample(n=5000, random_state=42)
label_1= train[train['label']==1].sample(n=5000, random_state=42)

In [13]:
from sklearn.utils import shuffle
# Stack the two class samples, then shuffle the rows so the classes are
# interleaved. The seed keeps the resulting row order reproducible.
train= pd.concat([label_1,label_0])
train= shuffle(train, random_state=42)

In [14]:
train.head()

Unnamed: 0,text,label
33070,I'm no Jane Austen purist but why make a film ...,0
18193,"I liked this movie for the most part, but have...",1
29579,"In a time of bad, if not plain awful, comedies...",1
3664,I have never seen a show as good as Full House...,1
30881,Fast paced and funny satire about that origina...,1


In [15]:
train.isnull().sum()

text     0
label    0
dtype: int64

In [16]:
import numpy as np
# Treat empty or whitespace-only cells as missing, then discard every row that
# has at least one missing value.
train = (
    train
    .replace(r'^\s*$', np.nan, regex=True)
    .dropna(axis=0, how='any')
)

In [17]:
# Replace literal "\t" / "\n" / "\r" text and real tab / newline / carriage-return
# characters with a single space. Substituting the empty string would fuse the
# words on either side of a line break ("end\nStart" -> "endStart").
train.replace(to_replace= [r"\\t|\\n|\\r", "\t|\n|\r"], value=[" "," "], regex= True, inplace=True)
print('escape seq removed')

escape seq removed


In [18]:
# Decompose accented characters first (NFKD) so that a letter such as "é" keeps
# its base letter "e" when the non-ASCII code points are dropped, instead of
# the whole character vanishing from the word.
train['text']= (
    train['text']
    .str.normalize('NFKD')
    .str.encode('ascii','ignore')
    .str.decode('ascii')
)
print('non-ascii removed')

non-ascii removed


In [19]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [20]:
def remove_punctuation(text):
    """Return ``text`` with every character listed in ``string.punctuation`` deleted."""
    # The third argument of maketrans lists the characters to drop.
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)
# Strip punctuation from every entry of the text column.
train['text'] = train['text'].map(remove_punctuation)

In [21]:
train

Unnamed: 0,text,label
33070,Im no Jane Austen purist but why make a film l...,0
18193,I liked this movie for the most part but have ...,1
29579,In a time of bad if not plain awful comedies K...,1
3664,I have never seen a show as good as Full House...,1
30881,Fast paced and funny satire about that origina...,1
...,...,...
28057,Jake Speed is a film that lacks one thing a c...,1
32625,This is a great movie Too bad it is not availa...,1
32013,I feel that this movie is different from so ma...,1
7913,Despite an overall pleasing plot and expensive...,0


In [22]:
import nltk
from nltk.corpus import stopwords

In [23]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [24]:
# Start from NLTK's English stop words but keep the negations 'no' and 'not'
# in the text by taking them out of the removal list.
stopword_list = nltk.corpus.stopwords.words('english')
for negation in ('no', 'not'):
    stopword_list.remove(negation)

In [25]:
def remove_stopwords(text, is_lower_case=False):
    """Drop stop words from ``text`` and return the remaining tokens joined by single spaces.

    text: string to filter; it is split with the module-level ``tokenizer``.
    is_lower_case: when true, tokens are compared against ``stopword_list``
        exactly as they appear; otherwise each token is lower-cased for the
        comparison only, so surviving tokens keep their original casing.
    """
    def lookup_key(token):
        # Normalise case for the membership test unless the caller states the
        # text is already lower case.
        return token if is_lower_case else token.lower()

    kept = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        if lookup_key(token) not in stopword_list:
            kept.append(token)
    return ' '.join(kept)

In [26]:
# Filter stop words out of every entry; is_lower_case keeps its default (False).
train['text'] = train['text'].map(remove_stopwords)

In [27]:
train

Unnamed: 0,text,label
33070,Im no Jane Austen purist make film like nothin...,0
18193,liked movie part say anyone else besides Bill ...,1
29579,time bad not plain awful comedies King Queens ...,1
3664,never seen show good Full House Full House put...,1
30881,Fast paced funny satire original reality TV so...,1
...,...,...
28057,Jake Speed film lacks one thing charismatic le...,1
32625,great movie bad not available home video,1
32013,feel movie different many others shows family ...,1
7913,Despite overall pleasing plot expensive produc...,0


In [28]:
def remove_special_char(text):
    """Delete every character that is not an ASCII letter, a digit or whitespace.

    text: input string.
    Returns the filtered string.
    """
    # The class must read A-Z: the range A-z also spans the ASCII characters
    # that sit between 'Z' and 'a' ([ \ ] ^ _ `), which would then survive.
    # The raw string keeps \s from being parsed as a string escape.
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

In [29]:
# Remove special characters from every entry of the text column.
train['text'] = train['text'].map(remove_special_char)

In [30]:
def remove_html(text):
    """Replace each ``<...>`` tag-like span in ``text`` with a single space.

    NOTE(review): this notebook strips punctuation (including '<' and '>')
    before this function is applied, so it probably finds nothing to replace
    at that point; consider running it before the punctuation step.
    """
    # Non-greedy, so neighbouring tags are replaced one at a time rather than
    # as one long span.
    return re.sub('<.*?>', ' ', text)

In [31]:
# Blank out tag-like spans in every entry of the text column.
train['text'] = train['text'].map(remove_html)

In [32]:
def remove_URL(text):
    """Replace every URL in ``text`` with a single space.

    Matches ``http://`` and ``https://`` links as well as bare ``www.``
    addresses; each match runs up to the next whitespace character.
    """
    # 'https?' makes the trailing "s" optional. The earlier 'http?' made the
    # "p" optional instead, so https links were never matched.
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(' ', text)

In [33]:
# Blank out URLs in every entry of the text column.
train['text'] = train['text'].map(remove_URL)

In [34]:
def remove_number(text):
    """Return ``text`` without any character for which ``str.isdigit`` is true."""
    kept_chars = []
    for ch in text:
        if ch.isdigit():
            continue
        kept_chars.append(ch)
    return ''.join(kept_chars)

In [35]:
# Delete digit characters from every entry of the text column.
train['text'] = train['text'].map(remove_number)

In [36]:
# Matches a word containing a digit: any run of non-digits followed by one
# digit. Used with match(), so the search is anchored at the start of the word.
# Compiled once here instead of on every call to cleanse().
_HAS_DIGIT = re.compile(r'\D*\d')


def cleanse(word):
    """Return '' when ``word`` contains a digit, otherwise return ``word`` unchanged."""
    if _HAS_DIGIT.match(word):
        return ''
    return word


def remove_alphanumeric(strings):
    """Drop every whitespace-separated word of ``strings`` that contains a digit.

    strings: input text.
    Returns the surviving words joined by single spaces.

    Dropped words are filtered out before joining. Previously each dropped
    word left an empty string in the list being joined, which produced
    doubled, leading or trailing spaces in the result.
    """
    kept_words = filter(None, (cleanse(word) for word in strings.split()))
    return ' '.join(kept_words)

In [37]:
# Drop digit-bearing words from every entry of the text column.
train['text'] = train['text'].map(remove_alphanumeric)

In [38]:
train

Unnamed: 0,text,label
33070,Im no Jane Austen purist make film like nothin...,0
18193,liked movie part say anyone else besides Bill ...,1
29579,time bad not plain awful comedies King Queens ...,1
3664,never seen show good Full House Full House put...,1
30881,Fast paced funny satire original reality TV so...,1
...,...,...
28057,Jake Speed film lacks one thing charismatic le...,1
32625,great movie bad not available home video,1
32013,feel movie different many others shows family ...,1
7913,Despite overall pleasing plot expensive produc...,0


In [40]:
def lemmatize_text(text):
    """Run ``text`` through the spaCy pipeline ``nlp`` and return its lemmas joined by spaces.

    A token whose lemma is the placeholder '-PRON-' contributes its original
    text instead of the placeholder.
    """
    lemmas = []
    for token in nlp(text):
        if token.lemma_ == '-PRON-':
            lemmas.append(token.text)
        else:
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)

In [41]:
# Lemmatize every entry of the text column.
train['text'] = train['text'].map(lemmatize_text)

In [42]:
# Score each cleaned text with TextBlob; every cell of the new column holds the
# (polarity, subjectivity) result for that row.
train['sentiment'] = train['text'].apply(
    lambda review: TextBlob(review).sentiment
)

In [43]:
train

Unnamed: 0,text,label,sentiment
33070,I m no Jane Austen purist make film like nothi...,0,"(0.010427350427350435, 0.45017094017094017)"
18193,like movie part say anyone else besides Bill M...,1,"(0.1472222222222222, 0.4337962962962963)"
29579,time bad not plain awful comedy King Queens br...,1,"(0.19160401002506267, 0.5600250626566415)"
3664,never see show good Full House Full House put ...,1,"(0.3142461704961706, 0.5584895367790104)"
30881,fast pace funny satire original reality tv soa...,1,"(0.22164502164502162, 0.6398809523809523)"
...,...,...,...
28057,Jake speed film lack one thing charismatic lea...,1,"(0.10292574656981439, 0.5697134786117839)"
32625,great movie bad not available home video,1,"(-0.03333333333333327, 0.6055555555555555)"
32013,feel movie different many other show family gi...,1,"(0.13106060606060604, 0.5143939393939394)"
7913,despite overall pleasing plot expensive produc...,0,"(0.0048701298701298865, 0.3293650793650793)"


In [44]:
# Pull the per-row sentiment results out of the frame as a plain Python list.
sentiment_series = list(train['sentiment'])

In [45]:
# Split each (polarity, subjectivity) pair into two named columns, reusing the
# index of `train` so the rows line up when the frames are combined.
columns = ['polarity', 'subjectivity']
df1 = pd.DataFrame(
    data=sentiment_series,
    index=train.index,
    columns=columns,
)

In [46]:
df1

Unnamed: 0,polarity,subjectivity
33070,0.010427,0.450171
18193,0.147222,0.433796
29579,0.191604,0.560025
3664,0.314246,0.558490
30881,0.221645,0.639881
...,...,...
28057,0.102926,0.569713
32625,-0.033333,0.605556
32013,0.131061,0.514394
7913,0.004870,0.329365


In [47]:
# Place the polarity / subjectivity columns next to the text and label columns.
result = pd.concat([train, df1], axis='columns')

In [48]:
# The tuple-valued column is redundant once it has been split into two columns.
result = result.drop(columns=['sentiment'])

In [49]:
# Label a text "Positive" when its polarity reaches the cut-off, otherwise "Negative".
# NOTE(review): with a cut-off of 0.3, most of the label==1 rows in the
# displayed sample come out "Negative"; confirm the threshold is intended.
polarity_cutoff = 0.3
at_or_above = result['polarity'] >= polarity_cutoff
below = result['polarity'] < polarity_cutoff
result.loc[at_or_above, 'Sentiment'] = "Positive"
result.loc[below, 'Sentiment'] = "Negative"

In [50]:
result

Unnamed: 0,text,label,polarity,subjectivity,Sentiment
33070,I m no Jane Austen purist make film like nothi...,0,0.010427,0.450171,Negative
18193,like movie part say anyone else besides Bill M...,1,0.147222,0.433796,Negative
29579,time bad not plain awful comedy King Queens br...,1,0.191604,0.560025,Negative
3664,never see show good Full House Full House put ...,1,0.314246,0.558490,Positive
30881,fast pace funny satire original reality tv soa...,1,0.221645,0.639881,Negative
...,...,...,...,...,...
28057,Jake speed film lack one thing charismatic lea...,1,0.102926,0.569713,Negative
32625,great movie bad not available home video,1,-0.033333,0.605556,Negative
32013,feel movie different many other show family gi...,1,0.131061,0.514394,Negative
7913,despite overall pleasing plot expensive produc...,0,0.004870,0.329365,Negative
