#basic sentiment analysis using TextBlob Library

In [None]:
#import the TextBlob class
from textblob import TextBlob
sentence="but you are Late Flight again!! Again and Again! Where are the crew?"


In [None]:
#create an object of the TextBlob class and view the blob object
blob=TextBlob(sentence)
blob


TextBlob("but you are Late Flight again!! Again and Again! Where are the crew?")

In [None]:
#sentiment property of the TextBlob class, which returns a tuple
blob.sentiment


Sentiment(polarity=-0.5859375, subjectivity=0.6)

#TextBlob

In [None]:
#tweet sentiment analysis using the TextBlob Library
#import all necessary libraries
import pandas as pd
from textblob import TextBlob
import re


In [None]:
#displaying the text in the notebook, we want to increase the display width for the DataFrame
pd.set_option('display.max_colwidth',240)


In [None]:
#read only the 'text' column of the Tweets dataset with read_csv() and expose it under the name 'Tweet'
TWEET_FILE='Tweets.csv'
tweets=pd.read_csv(TWEET_FILE,usecols=['text']).rename(columns={'text':'Tweet'})


In [None]:
#view the first 10 records of the DataFrame
tweets.head(10)


Unnamed: 0,Tweet
0,@VirginAmerica What @dhepburn said.
1,@VirginAmerica plus you've added commercials to the experience... tacky.
2,@VirginAmerica I didn't today... Must mean I need to take another trip!
3,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse"
4,@VirginAmerica and it's a really big bad thing about it
5,@VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing.\nit's really the only bad thing about flying VA
6,"@VirginAmerica yes, nearly every time I fly VX this “ear worm” won’t go away :)"
7,"@VirginAmerica Really missed a prime opportunity for Men Without Hats parody, there. https://t.co/mWpG7grEZP"
8,"@virginamerica Well, I didn't…but NOW I DO! :-D"
9,"@VirginAmerica it was amazing, and arrived an hour early. You're too good to me."


In [None]:
#tweets contain twitter handles, which start with the @ symbol. The string column included in the DataFrame has an extract() function, Which uses a regex to get parts of a string.
#this code declares a new column called At and sets the value to what the extract function returns. The extract function uses a regex, ^(@\S+), to return strings that start with @. 
tweets['At']=tweets.Tweet.str.extract(r'^(@\S+)')


In [None]:
#view the 10 records along with twitter handles
tweets.head(10)


Unnamed: 0,Tweet,At
0,@VirginAmerica What @dhepburn said.,@VirginAmerica
1,@VirginAmerica plus you've added commercials to the experience... tacky.,@VirginAmerica
2,@VirginAmerica I didn't today... Must mean I need to take another trip!,@VirginAmerica
3,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse",@VirginAmerica
4,@VirginAmerica and it's a really big bad thing about it,@VirginAmerica
5,@VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing.\nit's really the only bad thing about flying VA,@VirginAmerica
6,"@VirginAmerica yes, nearly every time I fly VX this “ear worm” won’t go away :)",@VirginAmerica
7,"@VirginAmerica Really missed a prime opportunity for Men Without Hats parody, there. https://t.co/mWpG7grEZP",@VirginAmerica
8,"@virginamerica Well, I didn't…but NOW I DO! :-D",@virginamerica
9,"@VirginAmerica it was amazing, and arrived an hour early. You're too good to me.",@VirginAmerica


In [None]:
#remove the Twitter handles since they are irrelevant for sentiment analysis. remove_handles() accepts a single tweet (a string) and uses re.sub() to delete every handle in it; it is applied to each row of the DataFrame with apply().
def remove_handles(tweet):
  """Return `tweet` with every Twitter handle deleted.

  A handle is an '@' followed by one or more non-whitespace characters, so
  punctuation glued to the handle (e.g. '@name:') is removed with it.
  The whitespace around a removed handle is left untouched.
  """
  handle_pattern=re.compile(r'@\S+')
  return handle_pattern.sub('',tweet)



In [None]:
#remove the handles
tweets['Tweet']=tweets.Tweet.apply(remove_handles)
tweets.head(10)


Unnamed: 0,Tweet,At
0,What said.,@VirginAmerica
1,plus you've added commercials to the experience... tacky.,@VirginAmerica
2,I didn't today... Must mean I need to take another trip!,@VirginAmerica
3,"it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse",@VirginAmerica
4,and it's a really big bad thing about it,@VirginAmerica
5,seriously would pay $30 a flight for seats that didn't have this playing.\nit's really the only bad thing about flying VA,@VirginAmerica
6,"yes, nearly every time I fly VX this “ear worm” won’t go away :)",@VirginAmerica
7,"Really missed a prime opportunity for Men Without Hats parody, there. https://t.co/mWpG7grEZP",@VirginAmerica
8,"Well, I didn't…but NOW I DO! :-D",@virginamerica
9,"it was amazing, and arrived an hour early. You're too good to me.",@VirginAmerica


In [None]:
#get_sentiment() function, which accepts a DataFrame and a column as parameters. Using this function, we create two new columns Polarity and Subjectivity which will show the sentiment scores of each tweet.
def get_sentiment(dataframe,column):
  """Score every text in ``dataframe[column]`` with TextBlob.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Frame that holds the text to score.
  column : str
      Name of the column in ``dataframe`` containing the text.

  Returns
  -------
  pandas.DataFrame
      Two columns, 'Polarity' and 'Subjectivity', with one row per input row.
      The result carries the same index as ``dataframe`` so that it lines up
      row-for-row when joined back with pd.concat(..., axis=1), even when the
      input frame was filtered or otherwise has a non-default index.
  """
  text_column=dataframe[column]
  #build one TextBlob per text and read its sentiment once, then unpack both scores
  sentiments=[TextBlob(text).sentiment for text in text_column]
  sentiment_values=[{'Polarity':s.polarity,'Subjectivity':s.subjectivity} for s in sentiments]
  #reuse the caller's index (a fresh 0..n-1 index would misalign on concat) and
  #name the columns explicitly so an empty input still produces both columns
  return pd.DataFrame(sentiment_values,index=text_column.index,columns=['Polarity','Subjectivity'])

sentiment_frame=get_sentiment(tweets,'Tweet')

sentiment_frame.head(4)


Unnamed: 0,Polarity,Subjectivity
0,0.0,0.0
1,0.0,0.0
2,-0.390625,0.6875
3,0.00625,0.35


In [None]:
#join the sentiment scores onto the original tweet DataFrame column-wise with the concat() function.
#score columns left over from an earlier run of this cell are dropped first, so re-running the cell
#cannot create duplicate Polarity/Subjectivity columns (which would break tweets.Polarity below)
tweets=pd.concat([tweets.drop(columns=sentiment_frame.columns,errors='ignore'),sentiment_frame],axis=1)



In [None]:
#display 10 rows
tweets.head(10)


Unnamed: 0,Tweet,At,Polarity,Subjectivity
0,@VirginAmerica What @dhepburn said.,@VirginAmerica,0.0,0.0
1,@VirginAmerica plus you've added commercials to the experience... tacky.,@VirginAmerica,0.0,0.0
2,@VirginAmerica I didn't today... Must mean I need to take another trip!,@VirginAmerica,-0.390625,0.6875
3,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse",@VirginAmerica,0.00625,0.35
4,@VirginAmerica and it's a really big bad thing about it,@VirginAmerica,-0.35,0.383333
5,@VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing.\nit's really the only bad thing about flying VA,@VirginAmerica,-0.208333,0.633333
6,"@VirginAmerica yes, nearly every time I fly VX this “ear worm” won’t go away :)",@VirginAmerica,0.466667,0.766667
7,"@VirginAmerica Really missed a prime opportunity for Men Without Hats parody, there. https://t.co/mWpG7grEZP",@VirginAmerica,0.2,0.2
8,"@virginamerica Well, I didn't…but NOW I DO! :-D",@virginamerica,1.0,1.0
9,"@VirginAmerica it was amazing, and arrived an hour early. You're too good to me.",@VirginAmerica,0.466667,0.6


In [None]:
#classify tweets by polarity: greater than 0.5 is positive, less than or equal to -0.5 is negative, and strictly between -0.1 and 0.1 is neutral
positive_tweets=tweets[tweets.Polarity>0.5]
negative_tweets=tweets[tweets.Polarity<=-0.5]
neutral_tweets=tweets[(tweets.Polarity>-0.1)&(tweets.Polarity<0.1)]

positive_tweets



Unnamed: 0,Tweet,At,Polarity,Subjectivity
8,"@virginamerica Well, I didn't…but NOW I DO! :-D",@virginamerica,1.000000,1.000000
19,@VirginAmerica you know what would be amazingly awesome? BOS-FLL PLEASE!!!!!!! I want to fly with only you.,@VirginAmerica,0.600000,0.966667
22,@VirginAmerica I love the hipster innovation. You are a feel good brand.,@VirginAmerica,0.600000,0.600000
34,@VirginAmerica this is great news! America could start flights to Hawaii by end of year http://t.co/r8p2Zy3fe4 via @Pacificbiznews,@VirginAmerica,1.000000,0.750000
35,Nice RT @VirginAmerica: Vibe with the moodlight from takeoff to touchdown. #MoodlitMonday #ScienceBehindTheExperience http://t.co/Y7O0uNxTQP,,0.600000,1.000000
...,...,...,...,...
14566,"@AmericanAir i was also told by agents my issues ""aren't their prob"" K fine. I get it. But have some compassion 4 others dealing w/this!!!",@AmericanAir,0.813802,0.500000
14577,@AmericanAir I have never on all my trips on any airline ever nat'l or int'l ever experienced anything like this!,@AmericanAir,1.000000,0.900000
14587,@AmericanAir I’ll play it by ear. I know that you are doing your best. Buy some chewey oatmeal cookies for your customer care folks.,@AmericanAir,1.000000,0.300000
14625,@AmericanAir Flight 236 was great. Fantastic cabin crew. A+ landing. #thankyou #JFK http://t.co/dRW08djHAI,@AmericanAir,0.600000,0.825000


In [None]:
negative_tweets

Unnamed: 0,Tweet,At,Polarity,Subjectivity
33,"@VirginAmerica awaiting my return phone call, just would prefer to use your online self-service option :(",@VirginAmerica,-0.750000,1.000000
84,@VirginAmerica it was a disappointing experience which will be shared with every business traveler I meet. #neverflyvirgin,@VirginAmerica,-0.600000,0.700000
87,@VirginAmerica Random Q: what's the distribution of elevate avatars? I bet that kitty has a disproportionate share http://t.co/APtZpuROp4,@VirginAmerica,-0.500000,0.500000
97,@VirginAmerica - Let 2 scanned in passengers leave the plane than told someone to remove their bag from 1st class bin? #uncomfortable,@VirginAmerica,-0.500000,1.000000
99,@VirginAmerica is anyone doing anything there today? Website is useless and no one is answering the phone.,@VirginAmerica,-0.500000,0.200000
...,...,...,...,...
14483,@AmericanAir The bad weather wasn't a surprise! You should have double/triple staff on hand to handle calls. Way to treat your customers.,@AmericanAir,-0.875000,0.666667
14507,"@AmericanAir i dont believe it, it has been impossible for your agents to get an update from the delivery company since yesterday at 11 am",@AmericanAir,-0.666667,1.000000
14535,@AmericanAir Flight Cancelled Flighted and rebooked but agent made a mistake and booked wrong date! Been trying to get through via phone for hours!!,@AmericanAir,-0.976562,0.900000
14568,@AmericanAir i was spoken 2 like I'm an idiot and that is not OK!! I don't need to deal w/ that esp after the travel experience I've had,@AmericanAir,-0.595313,0.650000


In [None]:
neutral_tweets


Unnamed: 0,Tweet,At,Polarity,Subjectivity
0,@VirginAmerica What @dhepburn said.,@VirginAmerica,0.00000,0.00
1,@VirginAmerica plus you've added commercials to the experience... tacky.,@VirginAmerica,0.00000,0.00
3,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse",@VirginAmerica,0.00625,0.35
10,@VirginAmerica did you know that suicide is the second leading cause of death among teens 10-24,@VirginAmerica,0.00000,0.00
15,@VirginAmerica SFO-PDX schedule is still MIA.,@VirginAmerica,0.00000,0.00
...,...,...,...,...
14626,@AmericanAir Flight 953 NYC-Buenos Aires has been delay since yesterday at 10PM. Is going to take off at 3.30PM now? Give us answers!,@AmericanAir,0.00000,0.00
14628,"Thank you. “@AmericanAir: @jlhalldc Customer Relations will review your concerns and contact you back directly, John.”",,0.05000,0.20
14633,"@AmericanAir my flight was Cancelled Flightled, leaving tomorrow morning. Auto rebooked for a Tuesday night flight but need to arrive Monday.",@AmericanAir,0.00000,0.00
14635,@AmericanAir thank you we got on a different flight to Chicago.,@AmericanAir,0.00000,0.60


#Understanding Data for Sentiment Analysis

In [None]:
#Loading Data for Sentiment Analysis
#We will load data that could be used to train a sentiment analysis model.
# We will be using three datasets namely Amazon, Yelp and IMDB


In [None]:
#import necessary libraries
#Import necessary libraries and also sets the display width to 200 characters 
#so that more of the review text is displayed on the screen.

import pandas as pd
pd.set_option('display.max_colwidth',200)



In [None]:
#load the three different datasets
DATA_DIR='/content/'
IMDB_DATA_FILE=DATA_DIR+'imdb_labelled.txt'
YELP_DATA_FILE=DATA_DIR+'yelp_labelled.txt'
AMAZON_DATA_FILE=DATA_DIR+'amazon_cells_labelled.txt'
COLUMN_NAMES=['Review','Sentiment']


In [None]:
#load the IMDB reviews
#quoting=csv.QUOTE_NONE treats double quotes inside a review as ordinary text; with the default
#quote handling an unbalanced quote makes the parser merge several lines into one review,
#which silently drops rows from the dataset
import csv
imdb_reviews=pd.read_table(IMDB_DATA_FILE,names=COLUMN_NAMES,quoting=csv.QUOTE_NONE)
imdb_reviews.head(10)

#negative reviews have sentiment scores of 0 and positive reviews have the score of 1


Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie about a distressed, drifting young man.",0
1,"Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.",0
2,"Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.",0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.,1
5,"The rest of the movie lacks art, charm, meaning... If it's about emptiness, it works I guess because it's empty.",0
6,Wasted two hours.,0
7,"Saw the movie today and thought it was a good effort, good messages for kids.",1
8,A bit predictable.,0
9,Loved the casting of Jimmy Buffet as the science teacher.,1


In [None]:
#negative reviews have sentiment scores of 0 and positive reviews have the score of 1

In [None]:
#check the total records of the IMDB review file by using value_counts() function
imdb_reviews.Sentiment.value_counts()


1    386
0    362
Name: Sentiment, dtype: int64

In [None]:
#count the positive and negative sentiments
#relabel by value (1 -> Positive, 0 -> Negative) rather than by position: value_counts() orders rows
#by frequency, so positional labels would be swapped whenever negatives outnumber positives
imdb_counts=imdb_reviews.Sentiment.value_counts().to_frame()
imdb_counts=imdb_counts.rename(index={1:'Positive',0:'Negative'}).rename_axis(None)
imdb_counts


Unnamed: 0,Sentiment
Positive,386
Negative,362


In [None]:
#load the amazon reviews
#quoting=csv.QUOTE_NONE keeps double quotes inside a review as plain text so that an
#unbalanced quote cannot make the parser merge several lines into a single row
import csv
amazon_reviews=pd.read_table(AMAZON_DATA_FILE,names=COLUMN_NAMES,quoting=csv.QUOTE_NONE)
amazon_reviews.head(10)



Unnamed: 0,Review,Sentiment
0,So there is no way for me to plug it in here in the US unless I go by a converter.,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!,0
4,The mic is great.,1
5,I have to jiggle the plug to get it to line up right to get decent volume.,0
6,"If you have several dozen or several hundred contacts, then imagine the fun of sending each of them one by one.",0
7,If you are Razr owner...you must have this!,1
8,"Needless to say, I wasted my money.",0
9,What a waste of money and time!.,0


In [None]:
#load the yelp reviews
#quoting=csv.QUOTE_NONE keeps double quotes inside a review as plain text so that an
#unbalanced quote cannot make the parser merge several lines into a single row
import csv
yelp_reviews=pd.read_table(YELP_DATA_FILE,names=COLUMN_NAMES,quoting=csv.QUOTE_NONE)
yelp_reviews.head(10)


Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.,1
4,The selection on the menu was great and so were the prices.,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could tell they had been made up ahead of time being kept under a warmer.,0
8,The fries were great too.,1
9,A great touch.,1


#Training Sentiment Models

In [None]:
#import necessary libraries
import pandas as pd
pd.set_option('display.max_colwidth',200)



In [None]:
#load all three datasets (tab-separated files: review text, then a 0/1 sentiment label)
import csv
DATA_DIR='/content/'
IMDB_DATA_FILE=DATA_DIR+'imdb_labelled.txt'
YELP_DATA_FILE=DATA_DIR+'yelp_labelled.txt'
AMAZON_DATA_FILE=DATA_DIR+'amazon_cells_labelled.txt'
COLUMN_NAMES=['Review','Sentiment']
#quoting=csv.QUOTE_NONE treats double quotes inside a review as ordinary text; with the default
#quote handling an unbalanced quote makes the parser merge several lines into one review,
#which silently removes training rows
imdb_reviews=pd.read_table(IMDB_DATA_FILE,names=COLUMN_NAMES,quoting=csv.QUOTE_NONE)
amazon_reviews=pd.read_table(AMAZON_DATA_FILE,names=COLUMN_NAMES,quoting=csv.QUOTE_NONE)
yelp_reviews=pd.read_table(YELP_DATA_FILE,names=COLUMN_NAMES,quoting=csv.QUOTE_NONE)


In [None]:
#concatenate the different datasets into one dataset using the concat() function
#ignore_index=True gives the combined frame a fresh 0..n-1 index; without it every file's row
#numbers are kept, so the same label would refer to up to three different reviews
review_data=pd.concat([amazon_reviews,imdb_reviews,yelp_reviews],ignore_index=True)



In [None]:
#we combined the data from three separate files, sample() function returns a random selection from the dataset. This will allow to see the reviews from different files.
review_data.sample(10)



Unnamed: 0,Review,Sentiment
792,Great Phone.,1
566,This movie is also revealing.,1
286,I wouldn't recommend buying this product.,0
650,"As for the killer, don't expect anything original or even remotely frightening.",0
837,I have never had such bland food which surprised me considering the article we read focused so much on their spices and flavor.,0
272,Unreliable - I'm giving up.,0
525,REALLY UGLY.,0
95,Will order from them again!,1
105,"This is a bad film, with bad writing, and good actors....an ugly cartoon crafted by Paul Haggis for people who can't handle anything but the bold strokes in storytelling....a picture painted with ...",0
538,That was done in the second movie.,0


In [None]:
#do the pre-processing using clean() function
import re
def clean(text):
  """Normalize one review for vectorization.

  The text is lower-cased, every run of non-word characters is collapsed into a
  single space, and the contraction fragments left behind by that step
  ('hadn t', 'wasn t', 'didn t') are expanded to 'had not', 'was not', 'did not'.
  Leading or trailing spaces produced by the collapse are kept.
  """
  normalized=re.sub(r'[\W]+',' ',text.lower())
  expansions=(('hadn t','had not'),('wasn t','was not'),('didn t','did not'))
  for fragment,expanded in expansions:
    normalized=normalized.replace(fragment,expanded)
  return normalized


In [None]:
#once the function is defined, run clean() over every review; assign() builds the
#model-ready frame as a new object, so review_data itself is left unmodified
review_model_data=review_data.assign(Review=review_data.Review.apply(clean))

review_model_data.sample(10)



Unnamed: 0,Review,Sentiment
194,not impressed,0
348,5 stars for the brick oven bread app,1
525,similarly the delivery man did not say a word of apology when our food was 45 minutes late,0
218,worst phone ever,0
942,very slow at seating even with reservation,0
501,this battery is an excellent bargain,1
529,razr battery good buy,1
812,this place is overpriced not consistent with their boba and it really is overpriced,0
640,end of days is one of the worst big budget action movies i ve ever seen,0
827,for that price i can think of a few place i would have much rather gone,0


In [None]:
'''Develop the model: we use TfidfVectorizer to convert each review into a TF-IDF vector.
It captures the relationship between the words in a review's text and their presence in the entire dataset.
Logistic regression is a machine learning algorithm that is used here to train the sentiment classification model.'''


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


In [None]:
#combine TfidfVectorizer and LogisticRegression in a Pipeline object,
#which is a way to run a series of algorithms in a single pipeline

tfidf=TfidfVectorizer(strip_accents=None,preprocessor=None,lowercase=False)
log_reg=LogisticRegression(random_state=0,solver='lbfgs')
log_tfidf=Pipeline([('vect',tfidf),('clf',log_reg)])


In [None]:
#split the train and test sets, split 70% for training and 30% for testing by using train_test_split() function
X_train,X_test,y_train,y_test=train_test_split(review_model_data.Review,review_model_data.Sentiment,test_size=0.3,random_state=42)


In [None]:
#fit the training data to the training pipeline by using fit() function
log_tfidf.fit(X_train.values,y_train.values)



Pipeline(steps=[('vect', TfidfVectorizer(lowercase=False)),
                ('clf', LogisticRegression(random_state=0))])

In [None]:
#check our model's accuracy on the held-out test set by using the score() function
test_accuracy=log_tfidf.score(X_test.values,y_test.values)
#'.2%' renders the fraction as a percentage with two decimals; the earlier '0%' spec was only a
#zero-padding flag, so the default six decimals were printed
'the model has a test accuracy of {:.2%}'.format(test_accuracy)


'the model has a test accuracy of 81.090909%'

In [None]:
#predict the sentiment of sentences by using the predict() function;
#it returns 1 for a positive sentence and 0 for a negative sentence.
#the sentences go through clean() first: the pipeline was fitted on cleaned, lower-cased reviews and
#the vectorizer does not lower-case on its own (lowercase=False), so raw text would miss the vocabulary
test_sentences=['I loved this place','I hated this place']
log_tfidf.predict([clean(sentence) for sentence in test_sentences])



array([1, 0])