## Sentiment Analysis Project for Twitter
_____

# Twitter Spark Streaming
Set up the credentials for a twitter app at https://apps.twitter.com/
    

In [1]:
import findspark

In [2]:
findspark.init()

In [3]:
# May cause deprecation warnings, safe to ignore, they aren't errors
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc

In [4]:
# Can only run this once. restart your kernel for any errors.
sc = SparkContext()

In [5]:
ssc = StreamingContext(sc, 10 )
sqlContext = SQLContext(sc)

In [6]:
#creating a sockets for the communication with spark
socket_stream = ssc.socketTextStream("192.168.56.1", 8080)

In [7]:
lines = socket_stream.window( 20 ) # the window seconds we have to wait

In [8]:
# cleaning the data coming a with a line and searching for a specific word
( lines.flatMap( lambda text: text.split( "\n" )).filter( lambda word: '*' in word ).foreachRDD( lambda rdd: rdd.toDF() ) ) #Lower cases the word
#words.saveAsTextFiles('tempFile.txt')

### This part is what is going to make a connection with the server 


In [9]:
# vaderSentiment is sentiment analysis function to analyze the tweets
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [10]:
import time
from IPython import display
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import tweepy
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener
import socket
import json
import string
import re
%matplotlib inline 

In [11]:
#start the streaming
ssc.start()

In [12]:
#stop the streaming
ssc.stop()

In [13]:
# writting a file with all the twiting all the data form twitter
tweets_data = []
# name of file
OUTPUT_FILE = "tempFile.txt"
# readind file. and saving it to tweets list
with open(OUTPUT_FILE, "r") as tweets_file:
    for line in tweets_file:
        tweet = json.loads(line)
        tweets_data.append(tweet)

In [14]:
#creating dataframe
dfTwitter = pd.DataFrame(tweets_data, columns=['user', 'created_at', 'text', 'source'])
dfTwitter.head()

Unnamed: 0,user,created_at,text,source
0,"{'id': 1210574876156813312, 'id_str': '1210574...",Fri Nov 27 19:23:01 +0000 2020,RT @free_lilfairy: the universe: \n. ★ ...,"<a href=""http://twitter.com/download/iphone"" r..."
1,"{'id': 1231777653859876865, 'id_str': '1231777...",Fri Nov 27 19:23:02 +0000 2020,"When looking at all vote batches, the Michigan...","<a href=""https://mobile.twitter.com"" rel=""nofo..."
2,"{'id': 1086725947074797573, 'id_str': '1086725...",Fri Nov 27 19:23:04 +0000 2020,RT @HOThits1005: 🚨 BLACK FRIDAY DEAL ALERT 🚨\n...,"<a href=""http://twitter.com/download/iphone"" r..."
3,"{'id': 787177201, 'id_str': '787177201', 'name...",Fri Nov 27 19:23:04 +0000 2020,RT @smoleart: Shop update!!! * u * https://t.c...,"<a href=""https://mobile.twitter.com"" rel=""nofo..."
4,"{'id': 368406590, 'id_str': '368406590', 'name...",Fri Nov 27 19:23:04 +0000 2020,RT @KelemenCari: Our next election should look...,"<a href=""http://twitter.com/download/iphone"" r..."


In [15]:
#extract hashtags
import re
hashtags = []
for i in dfTwitter['text']:
    #print(i)
    hashtags.append(re.findall(r"#(\w+)", i))
    
dfTwitter['hashtags'] = hashtags

In [16]:
dfTwitter['hashtags'].describe()

count     5001
unique     312
top         []
freq      4357
Name: hashtags, dtype: object

In [17]:
def remove_punct(text):
    text  = "".join([char for char in text if char not in string.punctuation])
    #text = re.sub('[0-9]+', '', text)
    return text

In [18]:
dfTwitter['text'] = dfTwitter['text'].apply(lambda x: remove_punct(x))
dfTwitter.head(10)

Unnamed: 0,user,created_at,text,source,hashtags
0,"{'id': 1210574876156813312, 'id_str': '1210574...",Fri Nov 27 19:23:01 +0000 2020,RT freelilfairy the universe \n ★ ...,"<a href=""http://twitter.com/download/iphone"" r...",[]
1,"{'id': 1231777653859876865, 'id_str': '1231777...",Fri Nov 27 19:23:02 +0000 2020,When looking at all vote batches the Michigan ...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",[]
2,"{'id': 1086725947074797573, 'id_str': '1086725...",Fri Nov 27 19:23:04 +0000 2020,RT HOThits1005 🚨 BLACK FRIDAY DEAL ALERT 🚨\n\n...,"<a href=""http://twitter.com/download/iphone"" r...",[]
3,"{'id': 787177201, 'id_str': '787177201', 'name...",Fri Nov 27 19:23:04 +0000 2020,RT smoleart Shop update u httpstco2ATBkC5xMS,"<a href=""https://mobile.twitter.com"" rel=""nofo...",[]
4,"{'id': 368406590, 'id_str': '368406590', 'name...",Fri Nov 27 19:23:04 +0000 2020,RT KelemenCari Our next election should look l...,"<a href=""http://twitter.com/download/iphone"" r...",[]
5,"{'id': 948050908906221573, 'id_str': '94805090...",Fri Nov 27 19:23:06 +0000 2020,RT YungbludGains if you \n °\n ...,"<a href=""http://twitter.com/download/android"" ...",[]
6,"{'id': 1293045413918806021, 'id_str': '1293045...",Fri Nov 27 19:23:07 +0000 2020,RT PrincessAmunra 💰Only 12 a month for you to ...,"<a href=""http://twitter.com/download/iphone"" r...",[]
7,"{'id': 1267158701170995200, 'id_str': '1267158...",Fri Nov 27 19:23:08 +0000 2020,You Gotta Hear Gretas Teen Rival httpstcoVtp...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",[]
8,"{'id': 17947360, 'id_str': '17947360', 'name':...",Fri Nov 27 19:23:08 +0000 2020,AuditTheVote and TeamSidney and MAGA and MAGA2...,"<a href=""http://twitter.com/download/iphone"" r...","[AuditTheVote, TeamSidney, MAGA, MAGA2020, Tea..."
9,"{'id': 961533379, 'id_str': '961533379', 'name...",Fri Nov 27 19:23:10 +0000 2020,RT freelilfairy the universe \n ★ ...,"<a href=""http://twitter.com/download/iphone"" r...",[]


In [19]:
#Tag Retweets
retweets = []
for i in dfTwitter['text']:
    if 'RT' in i:
        retweets.append(1)
    else:
        retweets.append(0)
dfTwitter['Retweet'] = retweets

In [20]:
#remove stopwords and convert create consistent format
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
stop_words = set(stopwords.words('english'))
stop_words.add('I')
nltk.download('punkt')
processed_text = []
wordlist = []
for i in dfTwitter['text']:
    filtered_sentence = []
    word_tokens = word_tokenize(i.lower())  

    filtered_sentence = [w for w in word_tokens if not w in stop_words] 
    wordlist.append(filtered_sentence)
    processed_text.append(' '.join([str(elem) for elem in filtered_sentence]))
dfTwitter['clean_text'] = processed_text
dfTwitter['words'] = wordlist

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\15856\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
analyzer = SentimentIntensityAnalyzer()

In [22]:
dfTwitter.head()

Unnamed: 0,user,created_at,text,source,hashtags,Retweet,clean_text,words
0,"{'id': 1210574876156813312, 'id_str': '1210574...",Fri Nov 27 19:23:01 +0000 2020,RT freelilfairy the universe \n ★ ...,"<a href=""http://twitter.com/download/iphone"" r...",[],1,rt freelilfairy universe ★ ✨ 🌕 🌍 • • 🪐 •…,"[rt, freelilfairy, universe, ★, ✨, 🌕, 🌍, •, •,..."
1,"{'id': 1231777653859876865, 'id_str': '1231777...",Fri Nov 27 19:23:02 +0000 2020,When looking at all vote batches the Michigan ...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",[],0,looking vote batches michigan 114 631 update f...,"[looking, vote, batches, michigan, 114, 631, u..."
2,"{'id': 1086725947074797573, 'id_str': '1086725...",Fri Nov 27 19:23:04 +0000 2020,RT HOThits1005 🚨 BLACK FRIDAY DEAL ALERT 🚨\n\n...,"<a href=""http://twitter.com/download/iphone"" r...",[],1,rt hothits1005 🚨 black friday deal alert 🚨 hit...,"[rt, hothits1005, 🚨, black, friday, deal, aler..."
3,"{'id': 787177201, 'id_str': '787177201', 'name...",Fri Nov 27 19:23:04 +0000 2020,RT smoleart Shop update u httpstco2ATBkC5xMS,"<a href=""https://mobile.twitter.com"" rel=""nofo...",[],1,rt smoleart shop update u httpstco2atbkc5xms,"[rt, smoleart, shop, update, u, httpstco2atbkc..."
4,"{'id': 368406590, 'id_str': '368406590', 'name...",Fri Nov 27 19:23:04 +0000 2020,RT KelemenCari Our next election should look l...,"<a href=""http://twitter.com/download/iphone"" r...",[],1,rt kelemencari next election look like early v...,"[rt, kelemencari, next, election, look, like, ..."


In [23]:
# This function is where I perform the sentiment analysis on each tweets form the Spark 
num = 0
myList = []
for sentence in dfTwitter['clean_text']:
    #print( num, " " , analyzer.polarity_scores(sentence))
    myList.append(analyzer.polarity_scores(sentence))
    num +=1

In [24]:
len(myList)

5001

In [25]:
dfTwitter["Result"] =  myList

In [26]:
dfTwitter.dtypes

user          object
created_at    object
text          object
source        object
hashtags      object
Retweet        int64
clean_text    object
words         object
Result        object
dtype: object

In [27]:
mydict = dict(dfTwitter["Result"])

In [28]:
dfTwitter["Analysis Result"] = 0

In [29]:
# decide sentiment as positive, negative and neutral 
for i in mydict:
    if mydict[i]['compound'] >= 0.05 : 
#         print (dfTwitter["text"].iloc[i])
#         print("----Tweet is positive, according to the score", mydict[i]['pos'])
#         print("***", mydict[i], "***\n")
        dfTwitter["Analysis Result"].iloc[i]= "positive"
  
    elif mydict[i]['compound'] <= - 0.05 : 
#         print (dfTwitter["text"].iloc[i])
#         print("----Tweet is negative, according to the score", mydict[i]['neg'])
#         print("***", mydict[i], "***\n")
        dfTwitter["Analysis Result"].iloc[i]= "negative"
  
    else : 
#         print (dfTwitter["text"].iloc[i])
#         print("----Tweet is neutral, according to the score", mydict[i]['neu'])
#         print("***", mydict[i], "***\n")
        dfTwitter["Analysis Result"].iloc[i]= "neutral"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [30]:
dfTwitter = dfTwitter.explode('words')
dfTwitter = dfTwitter.explode('hashtags')

In [31]:
dfTwitter.reset_index(inplace=True)

In [32]:
dfTwitter.rename({'index':'TweetID'}, axis = 1)

Unnamed: 0,TweetID,user,created_at,text,source,hashtags,Retweet,clean_text,words,Result,Analysis Result
0,0,"{'id': 1210574876156813312, 'id_str': '1210574...",Fri Nov 27 19:23:01 +0000 2020,RT freelilfairy the universe \n ★ ...,"<a href=""http://twitter.com/download/iphone"" r...",,1,rt freelilfairy universe ★ ✨ 🌕 🌍 • • 🪐 •…,rt,"{'neg': 0.0, 'neu': 0.85, 'pos': 0.15, 'compou...",positive
1,0,"{'id': 1210574876156813312, 'id_str': '1210574...",Fri Nov 27 19:23:01 +0000 2020,RT freelilfairy the universe \n ★ ...,"<a href=""http://twitter.com/download/iphone"" r...",,1,rt freelilfairy universe ★ ✨ 🌕 🌍 • • 🪐 •…,freelilfairy,"{'neg': 0.0, 'neu': 0.85, 'pos': 0.15, 'compou...",positive
2,0,"{'id': 1210574876156813312, 'id_str': '1210574...",Fri Nov 27 19:23:01 +0000 2020,RT freelilfairy the universe \n ★ ...,"<a href=""http://twitter.com/download/iphone"" r...",,1,rt freelilfairy universe ★ ✨ 🌕 🌍 • • 🪐 •…,universe,"{'neg': 0.0, 'neu': 0.85, 'pos': 0.15, 'compou...",positive
3,0,"{'id': 1210574876156813312, 'id_str': '1210574...",Fri Nov 27 19:23:01 +0000 2020,RT freelilfairy the universe \n ★ ...,"<a href=""http://twitter.com/download/iphone"" r...",,1,rt freelilfairy universe ★ ✨ 🌕 🌍 • • 🪐 •…,★,"{'neg': 0.0, 'neu': 0.85, 'pos': 0.15, 'compou...",positive
4,0,"{'id': 1210574876156813312, 'id_str': '1210574...",Fri Nov 27 19:23:01 +0000 2020,RT freelilfairy the universe \n ★ ...,"<a href=""http://twitter.com/download/iphone"" r...",,1,rt freelilfairy universe ★ ✨ 🌕 🌍 • • 🪐 •…,✨,"{'neg': 0.0, 'neu': 0.85, 'pos': 0.15, 'compou...",positive
...,...,...,...,...,...,...,...,...,...,...,...
73205,5000,"{'id': 2844739192, 'id_str': '2844739192', 'na...",Fri Nov 27 20:46:14 +0000 2020,RT EntheosShines TO BANNED CONTENT CREATORS \n...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,1,rt entheosshines banned content creators chann...,announce,"{'neg': 0.188, 'neu': 0.812, 'pos': 0.0, 'comp...",negative
73206,5000,"{'id': 2844739192, 'id_str': '2844739192', 'na...",Fri Nov 27 20:46:14 +0000 2020,RT EntheosShines TO BANNED CONTENT CREATORS \n...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,1,rt entheosshines banned content creators chann...,anyone,"{'neg': 0.188, 'neu': 0.812, 'pos': 0.0, 'comp...",negative
73207,5000,"{'id': 2844739192, 'id_str': '2844739192', 'na...",Fri Nov 27 20:46:14 +0000 2020,RT EntheosShines TO BANNED CONTENT CREATORS \n...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,1,rt entheosshines banned content creators chann...,may,"{'neg': 0.188, 'neu': 0.812, 'pos': 0.0, 'comp...",negative
73208,5000,"{'id': 2844739192, 'id_str': '2844739192', 'na...",Fri Nov 27 20:46:14 +0000 2020,RT EntheosShines TO BANNED CONTENT CREATORS \n...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,1,rt entheosshines banned content creators chann...,upload,"{'neg': 0.188, 'neu': 0.812, 'pos': 0.0, 'comp...",negative


In [33]:
import json
from pymongo import MongoClient

In [34]:
# how to connect to the mongodb instance on the computer with port and addr
conn = MongoClient('localhost', 27017)

In [35]:
# looking for databases on my mongodb
cursor = conn.list_databases()
for db in cursor:
    print(db)

{'name': 'Classproject', 'sizeOnDisk': 31547392.0, 'empty': False}
{'name': 'admin', 'sizeOnDisk': 184320.0, 'empty': False}
{'name': 'config', 'sizeOnDisk': 36864.0, 'empty': False}
{'name': 'local', 'sizeOnDisk': 81920.0, 'empty': False}


In [36]:
# connecting to the database 
db = conn.Classproject

In [37]:
# looking for the collecction on this database
#print(db.list_collection_names(include_system_collections=False) )

In [38]:
# showing all the columns names
dfTwitter.columns

Index(['index', 'user', 'created_at', 'text', 'source', 'hashtags', 'Retweet',
       'clean_text', 'words', 'Result', 'Analysis Result'],
      dtype='object')

In [39]:
# Inserting data into the mongodb database
data_dict= dfTwitter.to_dict("records")
db.Tweets.insert_many(data_dict)

<pymongo.results.InsertManyResult at 0x1b367580b00>

In [40]:
# presenting the data from mongodb using the result from neutral evaluation. 
#for x in db.Tweets.find({},{"_id":0,"text": 1, "Analysis Result":"neutral"}):
#    print(x)

In [41]:
from bson.son import SON

In [42]:
pipeline = [
    {"$unwind": "$Analysis Result"},
    {"$group": {"_id": "$Analysis Result", "count": {"$sum": 1}}},
    {"$sort": SON([("count", -1), ("_id", -1)])}
]

In [43]:
import pprint
pprint.pprint(list(db.Tweets.aggregate(pipeline)))

[{'_id': 'positive', 'count': 109236},
 {'_id': 'neutral', 'count': 76194},
 {'_id': 'negative', 'count': 34200}]


In [44]:
#for x in db.Tweets.find({},{"_id":0,"text": 1, "Analysis Result":"negative"}):
#    print(x)