## Sentiment Analysis Project for Twitter
_____

# Twitter Spark Streaming
Set up the credentials for a twitter app at https://apps.twitter.com/
    

In [1]:
import findspark

In [2]:
findspark.init()

In [3]:
# May cause deprecation warnings, safe to ignore, they aren't errors
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc

In [4]:
# Can only run this once. restart your kernel for any errors.
sc = SparkContext()

In [5]:
ssc = StreamingContext(sc, 10 )
sqlContext = SQLContext(sc)

In [6]:
#creating a sockets for the communication with spark
socket_stream = ssc.socketTextStream("192.168.56.1", 8081)

In [7]:
lines = socket_stream.window( 20 ) # the window seconds we have to wait

In [8]:
# cleaning the data coming a with a line and searching for a specific word
( lines.flatMap( lambda text: text.split( "\n" )).filter( lambda word: '*' in word ).foreachRDD( lambda rdd: rdd.toDF() ) ) #Lower cases the word
#words.saveAsTextFiles('tempFile.txt')

### This part is what is going to make a connection with the server 


In [9]:
# vaderSentiment is sentiment analysis function to analyze the tweets
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [10]:
import time
from IPython import display
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import tweepy
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener
import socket
import json
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
%matplotlib inline 

In [11]:
#start the streaming
ssc.start()

#### Implement the next cell to stop the streaming but make sure to check the server code to see if you got the number of tweets that you need for you analysis. Note: If you want to implement the output using Spark SQL, you will need to implement the next cell

In [12]:
#stop the streaming
ssc.stop()

In [13]:
# writting a file with all the twiting all the data form twitter
tweets_data = []
# name of file
OUTPUT_FILE = "tempFile.txt"
# readind file. and saving it to tweets list
with open(OUTPUT_FILE, "r") as tweets_file:
    for line in tweets_file:
        tweet = json.loads(line)
        tweets_data.append(tweet)

In [14]:
#creating dataframe
dfTwitter = pd.DataFrame(tweets_data, columns=['user', 'created_at', 'text', 'source'])
dfTwitter.head()

Unnamed: 0,user,created_at,text,source
0,"{'id': 1331361706660999173, 'id_str': '1331361...",Fri Dec 04 01:49:54 +0000 2020,We take pride in our episodes because they sho...,"<a href=""http://twitter.com/download/android"" ..."
1,"{'id': 1171212556226244608, 'id_str': '1171212...",Fri Dec 04 01:49:55 +0000 2020,lee telling me ppl who are openly horny are we...,"<a href=""https://mobile.twitter.com"" rel=""nofo..."
2,"{'id': 1079799676163878912, 'id_str': '1079799...",Fri Dec 04 01:49:56 +0000 2020,Wow!!! @realDonaldTrump is such a LOSER!!! Put...,"<a href=""http://twitter.com/download/iphone"" r..."
3,"{'id': 1304602182071640065, 'id_str': '1304602...",Fri Dec 04 01:49:59 +0000 2020,RT @murray_nyc: @realDonaldTrump @dougducey De...,"<a href=""https://mobile.twitter.com"" rel=""nofo..."
4,"{'id': 1321204725623230464, 'id_str': '1321204...",Fri Dec 04 01:50:00 +0000 2020,RT @TCGCardSearch: Dark Charizard Holo 1st Edi...,


In [15]:
#extract hashtags

hashtags = []
for i in dfTwitter['text']:
    #print(i)
    hashtags.append(re.findall(r"#(\w+)", i))
    
dfTwitter['hashtags'] = hashtags

In [16]:
dfTwitter['hashtags'].describe()

count     5001
unique     339
top         []
freq      4162
Name: hashtags, dtype: object

In [17]:
def remove_punct(text):
    text  = "".join([char for char in text if char not in string.punctuation])
    return text

In [18]:
dfTwitter['text'] = dfTwitter['text'].apply(lambda x: remove_punct(x))
dfTwitter.head(10)

Unnamed: 0,user,created_at,text,source,hashtags
0,"{'id': 1331361706660999173, 'id_str': '1331361...",Fri Dec 04 01:49:54 +0000 2020,We take pride in our episodes because they sho...,"<a href=""http://twitter.com/download/android"" ...",[]
1,"{'id': 1171212556226244608, 'id_str': '1171212...",Fri Dec 04 01:49:55 +0000 2020,lee telling me ppl who are openly horny are we...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",[]
2,"{'id': 1079799676163878912, 'id_str': '1079799...",Fri Dec 04 01:49:56 +0000 2020,Wow realDonaldTrump is such a LOSER Putin’s Ru...,"<a href=""http://twitter.com/download/iphone"" r...",[]
3,"{'id': 1304602182071640065, 'id_str': '1304602...",Fri Dec 04 01:49:59 +0000 2020,RT murraynyc realDonaldTrump dougducey Dear Do...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",[]
4,"{'id': 1321204725623230464, 'id_str': '1321204...",Fri Dec 04 01:50:00 +0000 2020,RT TCGCardSearch Dark Charizard Holo 1st Editi...,,"[MINT, Pokemon, eBay, UnitedStates, Auction, T..."
5,"{'id': 787783268, 'id_str': '787783268', 'name...",Fri Dec 04 01:50:01 +0000 2020,♌️ As a scientist I respect Dr Hinshaw but fin...,"<a href=""http://twitter.com/#!/download/ipad"" ...",[]
6,"{'id': 173413459, 'id_str': '173413459', 'name...",Fri Dec 04 01:50:02 +0000 2020,MusicLov3rz WW Magazine April is here Featur...,"<a href=""https://www.hootsuite.com"" rel=""nofol...","[MusicLov3rz, WW, GeraldHarris, ExclusiveInter..."
7,"{'id': 2246381850, 'id_str': '2246381850', 'na...",Fri Dec 04 01:50:02 +0000 2020,LetsTalkToTheLord with Host Apostle JohnERo...,"<a href=""https://www.hootsuite.com"" rel=""nofol...","[LetsTalkToTheLord, Apostle, JohnERoss, Kellie..."
8,"{'id': 2248977601, 'id_str': '2248977601', 'na...",Fri Dec 04 01:50:02 +0000 2020,KevinRiley KevinRileymusic returns to MusicLo...,"<a href=""https://www.hootsuite.com"" rel=""nofol...","[KevinRiley, MusicLov3rz, FrontCover, Exclusiv..."
9,"{'id': 342188187, 'id_str': '342188187', 'name...",Fri Dec 04 01:50:03 +0000 2020,Hear our 25 minute podcast with host LifeCoac...,"<a href=""https://www.hootsuite.com"" rel=""nofol...","[podcast, LifeCoach]"


In [19]:
#Tag Retweets
retweets = []
for i in dfTwitter['text']:
    if 'RT' in i:
        retweets.append(1)
    else:
        retweets.append(0)
dfTwitter['Retweet'] = retweets

In [24]:
#cleaning the tweets
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)        
    return input_txt
def remove_newLine(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, ' ', input_txt)        
    return input_txt
def remove_mulSpaces(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, ' ', input_txt)    
    return input_txt
def remove_icons(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)    
    return input_txt
def remove_specialCha(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '.', input_txt)    
    return input_txt
def clean_tweets(tweets):
    tweets = np.vectorize(remove_icons)(tweets, "[^\x00-\x7F]+")
    #tweets = np.vectorize(remove_specialCha)(tweets, "\.+")
    #remove twitter Return handles (RT @xxx:)
    tweets = np.vectorize(remove_pattern)(tweets, "RT @[\w]*:") 
    
    #remove twitter handles (@xxx)
    tweets = np.vectorize(remove_pattern)(tweets, "@[\w]*")
    
    #remove URL links (httpxxx)
    tweets = np.vectorize(remove_pattern)(tweets, "https?://[A-Za-z0-9./]*")
    
    #remove special characters, numbers, punctuations (except for #)
    tweets = np.core.defchararray.replace(tweets, "[^a-zA-Z]", " ")
    tweets = np.core.defchararray.replace(tweets, "*", "")
    tweets = np.vectorize(remove_newLine)(tweets, "\n")
    tweets = np.vectorize(remove_mulSpaces)(tweets, " +")
    
    return tweets


In [25]:
dfTwitter['text'] = clean_tweets(dfTwitter['text'])
dfTwitter['text'].head()

0    We take pride in our episodes because they sho...
1    lee telling me ppl who are openly horny are we...
2    Wow realDonaldTrump is such a LOSER Putins Rus...
3    RT murraynyc realDonaldTrump dougducey Dear Do...
4    RT TCGCardSearch Dark Charizard Holo 1st Editi...
Name: text, dtype: object

In [26]:
#remove stopwords and convert create consistent format

stop_words = set(stopwords.words('english'))
stop_words.add('I')
nltk.download('punkt')
processed_text = []
wordlist = []
for i in dfTwitter['text']:
    filtered_sentence = []
    word_tokens = word_tokenize(i.lower())  

    filtered_sentence = [w for w in word_tokens if not w in stop_words] 
    wordlist.append(filtered_sentence)
    processed_text.append(' '.join([str(elem) for elem in filtered_sentence]))
dfTwitter['clean_text'] = processed_text
dfTwitter['words'] = wordlist

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\15856\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [27]:
analyzer = SentimentIntensityAnalyzer()

In [28]:
dfTwitter.head()

Unnamed: 0,user,created_at,text,source,hashtags,Retweet,clean_text,words
0,"{'id': 1331361706660999173, 'id_str': '1331361...",Fri Dec 04 01:49:54 +0000 2020,We take pride in our episodes because they sho...,"<a href=""http://twitter.com/download/android"" ...",[],0,take pride episodes show real us dont edit epi...,"[take, pride, episodes, show, real, us, dont, ..."
1,"{'id': 1171212556226244608, 'id_str': '1171212...",Fri Dec 04 01:49:55 +0000 2020,lee telling me ppl who are openly horny are we...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",[],0,lee telling ppl openly horny weird like dont p...,"[lee, telling, ppl, openly, horny, weird, like..."
2,"{'id': 1079799676163878912, 'id_str': '1079799...",Fri Dec 04 01:49:56 +0000 2020,Wow realDonaldTrump is such a LOSER Putins Rus...,"<a href=""http://twitter.com/download/iphone"" r...",[],0,wow realdonaldtrump loser putins russian agent...,"[wow, realdonaldtrump, loser, putins, russian,..."
3,"{'id': 1304602182071640065, 'id_str': '1304602...",Fri Dec 04 01:49:59 +0000 2020,RT murraynyc realDonaldTrump dougducey Dear Do...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",[],1,rt murraynyc realdonaldtrump dougducey dear do...,"[rt, murraynyc, realdonaldtrump, dougducey, de..."
4,"{'id': 1321204725623230464, 'id_str': '1321204...",Fri Dec 04 01:50:00 +0000 2020,RT TCGCardSearch Dark Charizard Holo 1st Editi...,,"[MINT, Pokemon, eBay, UnitedStates, Auction, T...",1,rt tcgcardsearch dark charizard holo 1st editi...,"[rt, tcgcardsearch, dark, charizard, holo, 1st..."


In [29]:
# This function is where I perform the sentiment analysis on each tweets form the Spark 
num = 0
myList = []
for sentence in dfTwitter['clean_text']:
    #print( num, " " , analyzer.polarity_scores(sentence))
    myList.append(analyzer.polarity_scores(sentence))
    num +=1

In [30]:
len(myList)

5001

In [31]:
dfTwitter["Result"] =  myList

In [32]:
dfTwitter.dtypes

user          object
created_at    object
text          object
source        object
hashtags      object
Retweet        int64
clean_text    object
words         object
Result        object
dtype: object

In [33]:
mydict = dict(dfTwitter["Result"])

In [34]:
dfTwitter["Analysis Result"] = 0

In [35]:
# decide sentiment as positive, negative and neutral 
for i in mydict:
    if mydict[i]['compound'] >= 0.05 : 
#         print (dfTwitter["text"].iloc[i])
#         print("----Tweet is positive, according to the score", mydict[i]['pos'])
#         print("***", mydict[i], "***\n")
        dfTwitter["Analysis Result"].iloc[i]= "positive"
  
    elif mydict[i]['compound'] <= - 0.05 : 
#         print (dfTwitter["text"].iloc[i])
#         print("----Tweet is negative, according to the score", mydict[i]['neg'])
#         print("***", mydict[i], "***\n")
        dfTwitter["Analysis Result"].iloc[i]= "negative"
  
    else : 
#         print (dfTwitter["text"].iloc[i])
#         print("----Tweet is neutral, according to the score", mydict[i]['neu'])
#         print("***", mydict[i], "***\n")
        dfTwitter["Analysis Result"].iloc[i]= "neutral"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [36]:
dfTwitter = dfTwitter.explode('words')
dfTwitter = dfTwitter.explode('hashtags')

In [37]:
dfTwitter.reset_index(inplace=True)

In [38]:
dfTwitter.rename({'index':'TweetID'}, axis = 1)

Unnamed: 0,TweetID,user,created_at,text,source,hashtags,Retweet,clean_text,words,Result,Analysis Result
0,0,"{'id': 1331361706660999173, 'id_str': '1331361...",Fri Dec 04 01:49:54 +0000 2020,We take pride in our episodes because they sho...,"<a href=""http://twitter.com/download/android"" ...",,0,take pride episodes show real us dont edit epi...,take,"{'neg': 0.0, 'neu': 0.806, 'pos': 0.194, 'comp...",positive
1,0,"{'id': 1331361706660999173, 'id_str': '1331361...",Fri Dec 04 01:49:54 +0000 2020,We take pride in our episodes because they sho...,"<a href=""http://twitter.com/download/android"" ...",,0,take pride episodes show real us dont edit epi...,pride,"{'neg': 0.0, 'neu': 0.806, 'pos': 0.194, 'comp...",positive
2,0,"{'id': 1331361706660999173, 'id_str': '1331361...",Fri Dec 04 01:49:54 +0000 2020,We take pride in our episodes because they sho...,"<a href=""http://twitter.com/download/android"" ...",,0,take pride episodes show real us dont edit epi...,episodes,"{'neg': 0.0, 'neu': 0.806, 'pos': 0.194, 'comp...",positive
3,0,"{'id': 1331361706660999173, 'id_str': '1331361...",Fri Dec 04 01:49:54 +0000 2020,We take pride in our episodes because they sho...,"<a href=""http://twitter.com/download/android"" ...",,0,take pride episodes show real us dont edit epi...,show,"{'neg': 0.0, 'neu': 0.806, 'pos': 0.194, 'comp...",positive
4,0,"{'id': 1331361706660999173, 'id_str': '1331361...",Fri Dec 04 01:49:54 +0000 2020,We take pride in our episodes because they sho...,"<a href=""http://twitter.com/download/android"" ...",,0,take pride episodes show real us dont edit epi...,real,"{'neg': 0.0, 'neu': 0.806, 'pos': 0.194, 'comp...",positive
...,...,...,...,...,...,...,...,...,...,...,...
61281,5000,"{'id': 1211895835971051522, 'id_str': '1211895...",Fri Dec 04 03:28:43 +0000 2020,u bout dumb as shit sorry,"<a href=""http://twitter.com/download/iphone"" r...",,0,u bout dumb shit sorry,u,"{'neg': 0.804, 'neu': 0.196, 'pos': 0.0, 'comp...",negative
61282,5000,"{'id': 1211895835971051522, 'id_str': '1211895...",Fri Dec 04 03:28:43 +0000 2020,u bout dumb as shit sorry,"<a href=""http://twitter.com/download/iphone"" r...",,0,u bout dumb shit sorry,bout,"{'neg': 0.804, 'neu': 0.196, 'pos': 0.0, 'comp...",negative
61283,5000,"{'id': 1211895835971051522, 'id_str': '1211895...",Fri Dec 04 03:28:43 +0000 2020,u bout dumb as shit sorry,"<a href=""http://twitter.com/download/iphone"" r...",,0,u bout dumb shit sorry,dumb,"{'neg': 0.804, 'neu': 0.196, 'pos': 0.0, 'comp...",negative
61284,5000,"{'id': 1211895835971051522, 'id_str': '1211895...",Fri Dec 04 03:28:43 +0000 2020,u bout dumb as shit sorry,"<a href=""http://twitter.com/download/iphone"" r...",,0,u bout dumb shit sorry,shit,"{'neg': 0.804, 'neu': 0.196, 'pos': 0.0, 'comp...",negative


In [39]:
import json
from pymongo import MongoClient

In [40]:
# how to connect to the mongodb instance on the computer with port and addr
conn = MongoClient('localhost', 27017)

In [41]:
# looking for databases on my mongodb
cursor = conn.list_databases()
for db in cursor:
    print(db)

{'name': 'Classproject', 'sizeOnDisk': 47337472.0, 'empty': False}
{'name': 'admin', 'sizeOnDisk': 184320.0, 'empty': False}
{'name': 'config', 'sizeOnDisk': 36864.0, 'empty': False}
{'name': 'local', 'sizeOnDisk': 81920.0, 'empty': False}


In [42]:
# connecting to the database 
db = conn.ClassprojectFinal

In [43]:
# looking for the collecction on this database
#print(db.list_collection_names(include_system_collections=False) )

In [44]:
# showing all the columns names
dfTwitter.columns

Index(['index', 'user', 'created_at', 'text', 'source', 'hashtags', 'Retweet',
       'clean_text', 'words', 'Result', 'Analysis Result'],
      dtype='object')

In [45]:
# Inserting data into the mongodb database
data_dict= dfTwitter.to_dict("records")
db.Tweets.insert_many(data_dict)

<pymongo.results.InsertManyResult at 0x1c1a7d615c0>

In [46]:
# presenting the data from mongodb using the result from neutral evaluation. 
#for x in db.Tweets.find({},{"_id":0,"text": 1, "Analysis Result":"neutral"}):
#    print(x)

In [47]:
from bson.son import SON

In [48]:
pipeline = [
    {"$unwind": "$Analysis Result"},
    {"$group": {"_id": "$Analysis Result", "count": {"$sum": 1}}},
    {"$sort": SON([("count", -1), ("_id", -1)])}
]

In [49]:
import pprint
pprint.pprint(list(db.Tweets.aggregate(pipeline)))

[{'_id': 'neutral', 'count': 32227},
 {'_id': 'positive', 'count': 22279},
 {'_id': 'negative', 'count': 6780}]


In [50]:
#for x in db.Tweets.find({},{"_id":0,"text": 1, "Analysis Result":"negative"}):
#    print(x)