In [38]:
import pandas as pd
from textblob import TextBlob
from nltk.tokenize.toktok import ToktokTokenizer
import re
tokenizer=ToktokTokenizer()
import spacy
nlp=spacy.load('en_core_web_sm',disable=['ner'])

# **Polarity and Subjectivity**
TextBlob returns polarity and subjectivity of a sentence. Polarity lies between [-1,1], -1 defines a negative sentiment and 1 defines a positive sentiment. Negation words reverse the polarity. TextBlob has semantic labels that help with fine-grained analysis. For example — emoticons, exclamation mark, emojis, etc. Subjectivity lies between [0,1]. Subjectivity quantifies the amount of personal opinion and factual information contained in the text. The higher subjectivity means that the text contains personal opinion rather than factual information.

In [39]:
TextBlob("he is very good boy").sentiment

Sentiment(polarity=0.9099999999999999, subjectivity=0.7800000000000001)

In [40]:
TextBlob("he is not a  good boy").sentiment

Sentiment(polarity=-0.35, subjectivity=0.6000000000000001)

In [41]:
train=pd.read_csv('/content/Train.csv')
train

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1
...,...,...
39995,"""Western Union"" is something of a forgotten cl...",1
39996,This movie is an incredible piece of work. It ...,1
39997,My wife and I watched this movie because we pl...,0
39998,"When I first watched Flatliners, I was amazed....",1


In [42]:
#Lowering data from 40000 to 10000
label_0=train[train['label']==0].sample(n=5000)
label_1=train[train['label']==1].sample(n=5000)

In [43]:
train=pd.concat([label_1,label_0])
from sklearn.utils import shuffle
train=shuffle(train)

In [44]:
train

Unnamed: 0,text,label
15498,I will just start with some quotes from other ...,0
4142,I've seen this programme a few times and the m...,0
11789,"I saw this movie with my mother, and I loved i...",1
19037,I taped The Morrison Murders on Lifetime Movie...,1
27522,Yakitate! Ja-pan (translated as Fresh Baked! J...,1
...,...,...
5738,"Even after nearly 20 years apart, the original...",1
23214,"""I'm a cartoon!"" ""You're an illustration!"" wha...",0
16886,In addition to all the negative reviews: I was...,0
20045,Wait... wait... wait... wait... wait... wait.....,0


Here data has 2 labels ie 0 and 1 .
Here 1 stands for POSITIVE and 0 stands for NEGATIVE

# **Data Preprocessing**

In [45]:
train.isnull().sum()

text     0
label    0
dtype: int64

In [46]:
import numpy as np
train.replace(r'^\s*$',np.nan,regex=True,inplace=True)#for ignoring blank space
train.dropna(axis=0,how='any',inplace=True)

In [47]:
train.replace(to_replace=[r"\\t|\\n|\\r","\t|\n|\r"], value=["",""],regex=True,inplace=True)
print('escape seq removed')

escape seq removed


In [48]:
train

Unnamed: 0,text,label
15498,I will just start with some quotes from other ...,0
4142,I've seen this programme a few times and the m...,0
11789,"I saw this movie with my mother, and I loved i...",1
19037,I taped The Morrison Murders on Lifetime Movie...,1
27522,Yakitate! Ja-pan (translated as Fresh Baked! J...,1
...,...,...
5738,"Even after nearly 20 years apart, the original...",1
23214,"""I'm a cartoon!"" ""You're an illustration!"" wha...",0
16886,In addition to all the negative reviews: I was...,0
20045,Wait... wait... wait... wait... wait... wait.....,0


In [49]:
train['text']=train['text'].str.encode('ascii','ignore').str.decode('ascii')
print('non-ascii data removed')

non-ascii data removed


In [50]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [51]:
def remove_punctuation(text):
  for punctuation in string.punctuation:
    text=text.replace(punctuation,'')
  return text

train['text']=train['text'].apply(remove_punctuation)

In [52]:
train

Unnamed: 0,text,label
15498,I will just start with some quotes from other ...,0
4142,Ive seen this programme a few times and the mo...,0
11789,I saw this movie with my mother and I loved it...,1
19037,I taped The Morrison Murders on Lifetime Movie...,1
27522,Yakitate Japan translated as Fresh Baked Japan...,1
...,...,...
5738,Even after nearly 20 years apart the original ...,1
23214,Im a cartoon Youre an illustration what does t...,0
16886,In addition to all the negative reviews I was ...,0
20045,Wait wait wait wait wait wait WHAT This movie ...,0


In [53]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [54]:
stopword_list=nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

In [55]:
def custom_remove_stopwords(text,is_lower_case=False):
  tokens=tokenizer.tokenize(text)
  tokens=[token.strip() for token in tokens]
  if is_lower_case:
    filtered_tokens=[token for token in tokens if token not in stopword_list]
  else:
    filtered_tokens=[token for token in tokens if token.lower() not in stopword_list]
  filtered_text=' '.join(filtered_tokens)
  return filtered_text


In [56]:
train['text']=train['text'].apply(custom_remove_stopwords)

In [57]:
train

Unnamed: 0,text,label
15498,start quotes reviewers describes bestbr br eas...,0
4142,Ive seen programme times see less like Jamie L...,0
11789,saw movie mother loved sweet story Not mention...,1
19037,taped Morrison Murders Lifetime Movie network ...,1
27522,Yakitate Japan translated Fresh Baked Japanese...,1
...,...,...
5738,Even nearly 20 years apart original members Bl...,1
23214,Im cartoon Youre illustration suppose mean plo...,0
16886,addition negative reviews amazed see drop hat ...,0
20045,Wait wait wait wait wait wait movie terrible a...,0


In [58]:
import re
def remove_special_characters(text):
  text=re.sub('[^a-zA-z0-9\s]','',text)
  return text

In [59]:
train['text']=train['text'].apply(remove_special_characters)

In [60]:
train

Unnamed: 0,text,label
15498,start quotes reviewers describes bestbr br eas...,0
4142,Ive seen programme times see less like Jamie L...,0
11789,saw movie mother loved sweet story Not mention...,1
19037,taped Morrison Murders Lifetime Movie network ...,1
27522,Yakitate Japan translated Fresh Baked Japanese...,1
...,...,...
5738,Even nearly 20 years apart original members Bl...,1
23214,Im cartoon Youre illustration suppose mean plo...,0
16886,addition negative reviews amazed see drop hat ...,0
20045,Wait wait wait wait wait wait movie terrible a...,0


In [61]:
def remove_html(text):
  html_pattern=re.compile('<.*?>')
  return html_pattern.sub(r' ',text)

In [62]:
train['text']=train['text'].apply(remove_html)

In [63]:
train

Unnamed: 0,text,label
15498,start quotes reviewers describes bestbr br eas...,0
4142,Ive seen programme times see less like Jamie L...,0
11789,saw movie mother loved sweet story Not mention...,1
19037,taped Morrison Murders Lifetime Movie network ...,1
27522,Yakitate Japan translated Fresh Baked Japanese...,1
...,...,...
5738,Even nearly 20 years apart original members Bl...,1
23214,Im cartoon Youre illustration suppose mean plo...,0
16886,addition negative reviews amazed see drop hat ...,0
20045,Wait wait wait wait wait wait movie terrible a...,0


In [64]:
def remove_URL(text):
  url=re.compile(r'https?://\S+|www\.\S+')
  return url.sub(r' ',text)

train['text']=train['text'].apply(remove_URL)

In [66]:
def remove_numbers(text):
  text=''.join([i for i in text if not i.isdigit()])
  return text

train['text']=train['text'].apply(remove_numbers)

In [67]:
def cleanse(word):
  rx=re.compile(r'\D*\d')
  if rx.match(word):
    return ''
  return word

def remove_alphanumeric(strings):
  nstrings=["".join(filter(None,(cleanse(word) for word in string.split()))) for string in strings.split()]
  str1=' '.join(nstrings)
  return str1

train['text']=train['text'].apply(remove_alphanumeric)

In [68]:
train

Unnamed: 0,text,label
15498,start quotes reviewers describes bestbr br eas...,0
4142,Ive seen programme times see less like Jamie L...,0
11789,saw movie mother loved sweet story Not mention...,1
19037,taped Morrison Murders Lifetime Movie network ...,1
27522,Yakitate Japan translated Fresh Baked Japanese...,1
...,...,...
5738,Even nearly years apart original members Black...,1
23214,Im cartoon Youre illustration suppose mean plo...,0
16886,addition negative reviews amazed see drop hat ...,0
20045,Wait wait wait wait wait wait movie terrible a...,0


In [69]:
def lemmatize_text(text):
  text=nlp(text)
  text=' '.join([word.lemma_ if word.lemma_ !='-PRON-' else word.text for word in text])
  return text

In [70]:
train['text']=train['text'].apply(lemmatize_text)

In [71]:
train['sentiment']=train['text'].apply(lambda tweet : TextBlob(tweet).sentiment)

In [74]:
train

Unnamed: 0,text,label,sentiment
15498,start quote reviewer describe bestbr br easily...,0,"(-0.006122448979591838, 0.5185374149659865)"
4142,I ve see programme time see less like Jamie Ly...,0,"(0.08595238095238099, 0.5234126984126983)"
11789,see movie mother love sweet story not mention ...,1,"(0.2666666666666667, 0.7888888888888889)"
19037,tape Morrison Murders Lifetime Movie network w...,1,"(0.4388888888888889, 0.5666666666666667)"
27522,Yakitate Japan translate Fresh Baked japanese ...,1,"(0.1761309523809524, 0.4240277777777777)"
...,...,...,...
5738,even nearly year apart original member Black S...,1,"(0.3416666666666667, 0.6125)"
23214,I m cartoon you re illustration suppose mean p...,0,"(-0.061607142857142846, 0.5895833333333333)"
16886,addition negative review amazed see drop hat s...,0,"(-0.03262987012987014, 0.4446428571428571)"
20045,wait wait wait wait wait wait movie terrible a...,0,"(-0.3571969696969697, 0.4443181818181818)"


In [72]:
sentiment_series=train['sentiment'].tolist()

In [77]:
columns=['polarity','subjectivity']
df1=pd.DataFrame(sentiment_series,columns=columns,index=train.index)
df1

Unnamed: 0,polarity,subjectivity
15498,-0.006122,0.518537
4142,0.085952,0.523413
11789,0.266667,0.788889
19037,0.438889,0.566667
27522,0.176131,0.424028
...,...,...
5738,0.341667,0.612500
23214,-0.061607,0.589583
16886,-0.032630,0.444643
20045,-0.357197,0.444318


In [76]:
result=pd.concat([train,df1],axis=1)

In [78]:
result.drop(['sentiment'],axis=1,inplace=True)

In [79]:
result.loc[result['polarity']>=0.3,'Sentiment']="Positive"
result.loc[result['polarity']<0.3,'Sentiment']="Negative"

In [80]:
result

Unnamed: 0,text,label,polarity,subjectivity,Sentiment
15498,start quote reviewer describe bestbr br easily...,0,-0.006122,0.518537,Negative
4142,I ve see programme time see less like Jamie Ly...,0,0.085952,0.523413,Negative
11789,see movie mother love sweet story not mention ...,1,0.266667,0.788889,Negative
19037,tape Morrison Murders Lifetime Movie network w...,1,0.438889,0.566667,Positive
27522,Yakitate Japan translate Fresh Baked japanese ...,1,0.176131,0.424028,Negative
...,...,...,...,...,...
5738,even nearly year apart original member Black S...,1,0.341667,0.612500,Positive
23214,I m cartoon you re illustration suppose mean p...,0,-0.061607,0.589583,Negative
16886,addition negative review amazed see drop hat s...,0,-0.032630,0.444643,Negative
20045,wait wait wait wait wait wait movie terrible a...,0,-0.357197,0.444318,Negative


In [81]:
result.loc[result['label']==1,'Sentiment_label']=1
result.loc[result['label']==0,'Sentiment_label']=0

In [82]:
result

Unnamed: 0,text,label,polarity,subjectivity,Sentiment,Sentiment_label
15498,start quote reviewer describe bestbr br easily...,0,-0.006122,0.518537,Negative,0.0
4142,I ve see programme time see less like Jamie Ly...,0,0.085952,0.523413,Negative,0.0
11789,see movie mother love sweet story not mention ...,1,0.266667,0.788889,Negative,1.0
19037,tape Morrison Murders Lifetime Movie network w...,1,0.438889,0.566667,Positive,1.0
27522,Yakitate Japan translate Fresh Baked japanese ...,1,0.176131,0.424028,Negative,1.0
...,...,...,...,...,...,...
5738,even nearly year apart original member Black S...,1,0.341667,0.612500,Positive,1.0
23214,I m cartoon you re illustration suppose mean p...,0,-0.061607,0.589583,Negative,0.0
16886,addition negative review amazed see drop hat s...,0,-0.032630,0.444643,Negative,0.0
20045,wait wait wait wait wait wait movie terrible a...,0,-0.357197,0.444318,Negative,0.0
