## Sentiment Analysis Project for Twitter
_____

# Twitter Spark Streaming
Set up the credentials for a twitter app at https://apps.twitter.com/
    

In [1]:
import findspark

In [2]:
findspark.init()

In [3]:
# May cause deprecation warnings, safe to ignore, they aren't errors
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc

In [4]:
# Can only run this once. restart your kernel for any errors.
sc = SparkContext()

In [5]:
ssc = StreamingContext(sc, 10 )
sqlContext = SQLContext(sc)

In [6]:
#creating a sockets for the communication with spark
socket_stream = ssc.socketTextStream("127.0.0.1", 9090)

In [7]:
lines = socket_stream.window( 20 ) # the window seconds we have to wait

In [8]:
# cleaning the data coming a with a line and searching for a specific word
( lines.flatMap( lambda text: text.split( "\n" )).filter( lambda word: '*' in word ).foreachRDD( lambda rdd: rdd.toDF() ) ) #Lower cases the word
#words.saveAsTextFiles('tempFile.txt')

### This part is what is going to make a connection with the server 


In [9]:
# vaderSentiment is sentiment analysis function to analyze the tweets
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [10]:
import time
from IPython import display
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import tweepy
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener
import socket
import json
import string
import re
%matplotlib inline 

In [12]:
#start the streaming
ssc.start()

In [13]:
#stop the streaming
ssc.stop()

In [14]:
# writting a file with all the twiting all the data form twitter
tweets_data = []
# name of file
OUTPUT_FILE = "tempFile.txt"
# readind file. and saving it to tweets list
with open(OUTPUT_FILE, "r") as tweets_file:
    for line in tweets_file:
        tweet = json.loads(line)
        tweets_data.append(tweet)

In [15]:
#creating dataframe
dfTwitter = pd.DataFrame(tweets_data, columns=['created_at','langu', 'text', 'source'])
dfTwitter.head()

Unnamed: 0,created_at,langu,text,source
0,Wed Nov 04 06:33:31 +0000 2020,,RT @DavidCornDC: If I have this right....\n* P...,"<a href=""http://twitter.com/download/android"" ..."
1,Wed Nov 04 06:33:32 +0000 2020,,@singtography * i stan alex,"<a href=""https://mobile.twitter.com"" rel=""nofo..."
2,Wed Nov 04 06:33:32 +0000 2020,,RT @anj3llyfish: this is misleading; of the ba...,"<a href=""http://twitter.com/download/iphone"" r..."
3,Wed Nov 04 06:33:35 +0000 2020,,RT @Ninja_Kane: *Cough * America is the joke o...,"<a href=""http://twitter.com/download/iphone"" r..."
4,Wed Nov 04 06:33:37 +0000 2020,,which automatically makes me the best.\n\n( a ...,"<a href=""http://twitter.com/download/iphone"" r..."


In [16]:
def remove_punct(text):
    text  = "".join([char for char in text if char not in string.punctuation])
    #text = re.sub('[0-9]+', '', text)
    return text

In [17]:
dfTwitter['text'] = dfTwitter['text'].apply(lambda x: remove_punct(x))
dfTwitter.head(10)

Unnamed: 0,created_at,langu,text,source
0,Wed Nov 04 06:33:31 +0000 2020,,RT DavidCornDC If I have this right\n Philly n...,"<a href=""http://twitter.com/download/android"" ..."
1,Wed Nov 04 06:33:32 +0000 2020,,singtography i stan alex,"<a href=""https://mobile.twitter.com"" rel=""nofo..."
2,Wed Nov 04 06:33:32 +0000 2020,,RT anj3llyfish this is misleading of the ballo...,"<a href=""http://twitter.com/download/iphone"" r..."
3,Wed Nov 04 06:33:35 +0000 2020,,RT NinjaKane Cough America is the joke of the...,"<a href=""http://twitter.com/download/iphone"" r..."
4,Wed Nov 04 06:33:37 +0000 2020,,which automatically makes me the best\n\n a hi...,"<a href=""http://twitter.com/download/iphone"" r..."
5,Wed Nov 04 06:33:39 +0000 2020,,RT NateSilver538 So far Dems Senate hopes\n\n ...,"<a href=""http://twitter.com/download/iphone"" r..."
6,Wed Nov 04 06:33:42 +0000 2020,,epuppie Teleports to dm httpstcoQFZuX2hhWS,"<a href=""http://twitter.com/download/iphone"" r..."
7,Wed Nov 04 06:33:43 +0000 2020,,RT LifeNewsHQ BREAKING \n\nJoe Biden claims he...,"<a href=""https://mobile.twitter.com"" rel=""nofo..."
8,Wed Nov 04 06:33:43 +0000 2020,,attention to all domsswitches,"<a href=""http://twitter.com/download/iphone"" r..."
9,Wed Nov 04 06:33:43 +0000 2020,,RT overkilss rt dm forbtsbotonly,"<a href=""http://twitter.com/download/android"" ..."


In [18]:
#if we want to check all the tweets
for i in dfTwitter['text']:
    print(i)

RT DavidCornDC If I have this right
 Philly not fully reporting until after 930 am And 22m mailin ballots to count in PA
 Mich…
singtography  i stan alex
RT anj3llyfish this is misleading of the ballots that were rejected 23 of those were rejected due to missing signatures nonetheless…
RT NinjaKane Cough  America is the joke of the world cough
which automatically makes me the best

 a hint of a smile 
RT NateSilver538 So far Dems Senate hopes

 At least one GA runoff possibly two
 Bullock has a shot
 Gideon an underdog IMO but s…
epuppie  Teleports to dm  httpstcoQFZuX2hhWS
RT LifeNewsHQ BREAKING 

Joe Biden claims hes on track to win but Trump is leading in the key battleground states left

 5642 lead…
attention to all domsswitches
RT overkilss rt  dm forbtsbotonly
RT VerronHaynes Correction  359 justicehaynes6 coach just call me  Big Congratulations salute you and your Teamate beat a great tea…
RT NateSilver538 What we know so far

 Trump looks good in Florida
 Beyond that not much I

In [19]:
analyzer = SentimentIntensityAnalyzer()

In [20]:
dfTwitter.shape

(501, 4)

In [21]:
# This function is where I perform the sentiment analysis on each tweets form the Spark 
num = 0
myList = []
for sentence in dfTwitter['text']:
    print( num, " " , analyzer.polarity_scores(sentence))
    myList.append(analyzer.polarity_scores(sentence))
    num +=1

0   {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
1   {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
2   {'neg': 0.404, 'neu': 0.596, 'pos': 0.0, 'compound': -0.8885}
3   {'neg': 0.0, 'neu': 0.82, 'pos': 0.18, 'compound': 0.296}
4   {'neg': 0.0, 'neu': 0.573, 'pos': 0.427, 'compound': 0.7717}
5   {'neg': 0.0, 'neu': 0.924, 'pos': 0.076, 'compound': 0.2263}
6   {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
7   {'neg': 0.132, 'neu': 0.779, 'pos': 0.089, 'compound': -0.2846}
8   {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
9   {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
10   {'neg': 0.0, 'neu': 0.692, 'pos': 0.308, 'compound': 0.8402}
11   {'neg': 0.0, 'neu': 0.879, 'pos': 0.121, 'compound': 0.4404}
12   {'neg': 0.0, 'neu': 0.882, 'pos': 0.118, 'compound': 0.4466}
13   {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
14   {'neg': 0.246, 'neu': 0.754, 'pos': 0.0, 'compound': -0.705}
15   {'neg': 0.158, 'neu': 0.753, 'pos': 0.089, 'compound': -

In [22]:
len(myList)

501

In [23]:
dfTwitter["Result"] =  myList

In [24]:
dfTwitter.dtypes

created_at     object
langu         float64
text           object
source         object
Result         object
dtype: object

In [25]:
mydict = dict(dfTwitter["Result"])

In [26]:
dfTwitter["Analysis Result"] = 0

In [27]:
for x in mydict:
    if (mydict[x]['neg'] > mydict[x]['neu'] and mydict[x]['neg'] >mydict[x]['pos'] and mydict[x]['neg'] >mydict[x]['compound']):
        print (dfTwitter["text"].iloc[x])
        print("----Tweet is negative, according to the score", mydict[x]['neg'])
        print("***", mydict[x], "***\n")
        dfTwitter["Analysis Result"].iloc[x]= "negative"
    elif (mydict[x]['neu'] > mydict[x]['neg'] and mydict[x]['neu'] >mydict[x]['pos'] and mydict[x]['neu'] >mydict[x]['compound']):
        print (dfTwitter["text"].iloc[x])
        print("----Tweet is neutral, according to the score", mydict[x]['neu'])
        print("***", mydict[x], "***\n")
        dfTwitter["Analysis Result"].iloc[x]= "neutral"
    elif (mydict[x]['pos'] > mydict[x]['neg'] and mydict[x]['pos'] >mydict[x]['neu'] and mydict[x]['pos'] >mydict[x]['compound']):
        print (dfTwitter["text"].iloc[x])
        print("----Tweet is positive, according to the score", mydict[x]['pos'])
        print("***", mydict[x], "***\n")
        dfTwitter["Analysis Result"].iloc[x]= "positive"
    elif (mydict[x]['compound'] > mydict[x]['neg'] and mydict[x]['compound'] >mydict[x]['neu'] and mydict[x]['compound'] >mydict[x]['pos']):
        print (dfTwitter["text"].iloc[x])
        print("----Tweet is Compound, according to the score", mydict[x]['compound'])
        print("***", mydict[x], "***\n")
        dfTwitter["Analysis Result"].iloc[x]= "Compound"
    else :
        print("****something went wrong****")

RT DavidCornDC If I have this right
 Philly not fully reporting until after 930 am And 22m mailin ballots to count in PA
 Mich…
----Tweet is neutral, according to the score 1.0
*** {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0} ***

singtography  i stan alex
----Tweet is neutral, according to the score 1.0
*** {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0} ***

RT anj3llyfish this is misleading of the ballots that were rejected 23 of those were rejected due to missing signatures nonetheless…
----Tweet is neutral, according to the score 0.596
*** {'neg': 0.404, 'neu': 0.596, 'pos': 0.0, 'compound': -0.8885} ***

RT NinjaKane Cough  America is the joke of the world cough
----Tweet is neutral, according to the score 0.82
*** {'neg': 0.0, 'neu': 0.82, 'pos': 0.18, 'compound': 0.296} ***

which automatically makes me the best

 a hint of a smile 
----Tweet is Compound, according to the score 0.7717
*** {'neg': 0.0, 'neu': 0.573, 'pos': 0.427, 'compound': 0.7717} ***

RT NateSi

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


----Tweet is neutral, according to the score 1.0
*** {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0} ***

whenthelakerson  at all
----Tweet is neutral, according to the score 1.0
*** {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0} ***

    

He facepalms after saying that

I dontknow
----Tweet is neutral, according to the score 1.0
*** {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0} ***

RT RetroToySearch Batman Gotham City Chronicles  Grundboxen Inklusive Streachgoals  Kickstarter eBay Germany Auction RetroToys R…
----Tweet is neutral, according to the score 1.0
*** {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0} ***

Always calls emergency meeting 
----Tweet is neutral, according to the score 0.536
*** {'neg': 0.464, 'neu': 0.536, 'pos': 0.0, 'compound': -0.3818} ***

IamSectLeader  he was shocked by the force but  wow it was nice 
ahhh wow Aling you got strong 

 he… httpstcobbN0WhU0G7
----Tweet is Compound, according to the score 0.9633
*** {'neg': 0.047, 'n

In [28]:
dfTwitter.head()

Unnamed: 0,created_at,langu,text,source,Result,Analysis Result
0,Wed Nov 04 06:33:31 +0000 2020,,RT DavidCornDC If I have this right\n Philly n...,"<a href=""http://twitter.com/download/android"" ...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neutral
1,Wed Nov 04 06:33:32 +0000 2020,,singtography i stan alex,"<a href=""https://mobile.twitter.com"" rel=""nofo...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neutral
2,Wed Nov 04 06:33:32 +0000 2020,,RT anj3llyfish this is misleading of the ballo...,"<a href=""http://twitter.com/download/iphone"" r...","{'neg': 0.404, 'neu': 0.596, 'pos': 0.0, 'comp...",neutral
3,Wed Nov 04 06:33:35 +0000 2020,,RT NinjaKane Cough America is the joke of the...,"<a href=""http://twitter.com/download/iphone"" r...","{'neg': 0.0, 'neu': 0.82, 'pos': 0.18, 'compou...",neutral
4,Wed Nov 04 06:33:37 +0000 2020,,which automatically makes me the best\n\n a hi...,"<a href=""http://twitter.com/download/iphone"" r...","{'neg': 0.0, 'neu': 0.573, 'pos': 0.427, 'comp...",Compound


In [29]:
import json
from pymongo import MongoClient

In [30]:
# how to connect to the mongodb instance on the computer with port and addr
conn = MongoClient('localhost', 27017)

In [31]:
# looking for databases on my mongodb
cursor = conn.list_databases()
for db in cursor:
    print(db)

{'name': 'CarsDB', 'sizeOnDisk': 81920.0, 'empty': False}
{'name': 'Classproject', 'sizeOnDisk': 8192.0, 'empty': False}
{'name': 'Testing_DB', 'sizeOnDisk': 8192.0, 'empty': False}
{'name': 'admin', 'sizeOnDisk': 40960.0, 'empty': False}
{'name': 'config', 'sizeOnDisk': 110592.0, 'empty': False}
{'name': 'local', 'sizeOnDisk': 73728.0, 'empty': False}


In [32]:
# connecting to the database 
db = conn.Classproject

In [33]:
# looking for the collecction on this database
print(db.list_collection_names(include_system_collections=False) )

['Tweets']


In [34]:
# showing all the columns names
dfTwitter.columns

Index(['created_at', 'langu', 'text', 'source', 'Result', 'Analysis Result'], dtype='object')

In [35]:
# Inserting data into the mongodb database
data_dict= dfTwitter.to_dict("records")
db.Tweets.insert_many(data_dict)

<pymongo.results.InsertManyResult at 0x1d86292e480>

In [36]:
# presenting the data from mongodb using the result from neutral evaluation. 
for x in db.Tweets.find({},{"_id":0,"text": 1, "Analysis Result":"neutral"}):
    print(x)

{'text': 'RT DavidCornDC If I have this right\n Philly not fully reporting until after 930 am And 22m mailin ballots to count in PA\n Mich…', 'Analysis Result': 'neutral'}
{'text': 'singtography  i stan alex', 'Analysis Result': 'neutral'}
{'text': 'RT anj3llyfish this is misleading of the ballots that were rejected 23 of those were rejected due to missing signatures nonetheless…', 'Analysis Result': 'neutral'}
{'text': 'RT NinjaKane Cough  America is the joke of the world cough', 'Analysis Result': 'neutral'}
{'text': 'which automatically makes me the best\n\n a hint of a smile ', 'Analysis Result': 'neutral'}
{'text': 'RT NateSilver538 So far Dems Senate hopes\n\n At least one GA runoff possibly two\n Bullock has a shot\n Gideon an underdog IMO but s…', 'Analysis Result': 'neutral'}
{'text': 'epuppie  Teleports to dm  httpstcoQFZuX2hhWS', 'Analysis Result': 'neutral'}
{'text': 'RT LifeNewsHQ BREAKING \n\nJoe Biden claims hes on track to win but Trump is leading in the key battlegroun

In [37]:
from bson.son import SON

In [38]:
pipeline = [
    {"$unwind": "$Analysis Result"},
    {"$group": {"_id": "$Analysis Result", "count": {"$sum": 1}}},
    {"$sort": SON([("count", -1), ("_id", -1)])}
]

In [39]:
import pprint
pprint.pprint(list(db.Tweets.aggregate(pipeline)))

[{'_id': 'neutral', 'count': 440},
 {'_id': 'Compound', 'count': 47},
 {'_id': 'negative', 'count': 9},
 {'_id': 'positive', 'count': 5}]


In [40]:
for x in db.Tweets.find({},{"_id":0,"text": 1, "Analysis Result":"negative"}):
    print(x)

{'text': 'RT DavidCornDC If I have this right\n Philly not fully reporting until after 930 am And 22m mailin ballots to count in PA\n Mich…', 'Analysis Result': 'negative'}
{'text': 'singtography  i stan alex', 'Analysis Result': 'negative'}
{'text': 'RT anj3llyfish this is misleading of the ballots that were rejected 23 of those were rejected due to missing signatures nonetheless…', 'Analysis Result': 'negative'}
{'text': 'RT NinjaKane Cough  America is the joke of the world cough', 'Analysis Result': 'negative'}
{'text': 'which automatically makes me the best\n\n a hint of a smile ', 'Analysis Result': 'negative'}
{'text': 'RT NateSilver538 So far Dems Senate hopes\n\n At least one GA runoff possibly two\n Bullock has a shot\n Gideon an underdog IMO but s…', 'Analysis Result': 'negative'}
{'text': 'epuppie  Teleports to dm  httpstcoQFZuX2hhWS', 'Analysis Result': 'negative'}
{'text': 'RT LifeNewsHQ BREAKING \n\nJoe Biden claims hes on track to win but Trump is leading in the key batt