-
Notifications
You must be signed in to change notification settings - Fork 0
/
twitter_data_analysis_prep.py
138 lines (110 loc) · 4.25 KB
/
twitter_data_analysis_prep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# -*- coding: utf-8 -*-
"""Twitter Data Analysis Prep.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1U72gkjOj3v4Gl5vXR7c5SK1vL9XrXTO-
"""
# This is a program that will parse tweets fetched from Twitter using Python (Colab)
# Importing the libraries
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tweepy
from textblob import TextBlob
from wordcloud import WordCloud
plt.style.use("fivethirtyeight")
# Twitter API credentials.
# SECURITY: the original file hard-coded live API keys and tokens here.
# Anything committed to source control must be treated as compromised —
# rotate those keys, then supply fresh ones via the environment variables
# below instead of embedding them in code.
consumerKey = os.environ.get('TWITTER_CONSUMER_KEY', '')
consumerSecret = os.environ.get('TWITTER_CONSUMER_SECRET', '')
accessToken = os.environ.get('TWITTER_ACCESS_TOKEN', '')
accessTokenSecret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '')

# Creating the authentication object
authenticate = tweepy.OAuthHandler(consumerKey, consumerSecret)
# Setting the access token and access token secret
authenticate.set_access_token(accessToken, accessTokenSecret)
# Creating the API object while passing the authentication information;
# wait_on_rate_limit makes tweepy sleep through rate-limit windows.
api = tweepy.API(authenticate, wait_on_rate_limit=True)

# Extracting tweets from the twitter user.
# input() already returns a str, so the original str(...) wrapper was redundant.
account = input("Enter the twitter account you would like to see: @")
num = int(input("Enter the number of recent tweets you would like to use: "))
posts = api.user_timeline(screen_name=account, count=num, lang="en", tweet_mode="extended")

print("Show the ", num, " recent tweets \n")
# enumerate replaces the original manual i = 1 / i += 1 counter.
for i, tweet in enumerate(posts[:num], start=1):
    print(str(i) + ') ' + tweet.full_text + '\n')

# Creating a data frame with a column called Tweets so it looks nice
df = pd.DataFrame([tweet.full_text for tweet in posts], columns=['Tweets'])
df.index = df.index + 1  # 1-based index for nicer display
df.head()
# Cleaning the text by removing hashtags, @mentions and URLs.
def CleanTxt(text):
    """Return *text* with @mentions, hashtags, URLs and the retweet marker removed.

    Mentions and hashtags are replaced by a single space; URLs and the
    'RT ' marker are deleted outright, so runs of whitespace may remain.
    """
    # Raw strings fix the invalid escape sequences ('\/', '\S', '\s') that the
    # original non-raw patterns produced warnings for; the patterns themselves
    # are unchanged, so behavior is identical.
    text = re.sub(r"@(\w+)", ' ', text)         # remove @mentions
    text = re.sub(r"#(\w+)", ' ', text)         # remove hashtags
    text = re.sub(r'https?:\/\/\S+', '', text)  # remove URLs
    text = re.sub(r'RT[\s]+', '', text)         # remove the retweet marker
    return text
# Run every tweet through the cleaner, then show the cleaned frame.
df['Tweets'] = df['Tweets'].map(CleanTxt)
df
# Subjectivity: TextBlob's measure of how opinion-based the text is.
def getSubjectivity(text):
    """Return the subjectivity component of TextBlob's sentiment for *text*."""
    blob = TextBlob(text)
    return blob.sentiment.subjectivity
# Polarity: TextBlob's measure of how positive or negative the text is.
def getPolarity(text):
    """Return the polarity component of TextBlob's sentiment for *text*."""
    blob = TextBlob(text)
    return blob.sentiment.polarity
# Score every cleaned tweet, storing the results in two new columns,
# then display the enriched frame.
df['Subjectivity'] = df['Tweets'].map(getSubjectivity)
df['Polarity'] = df['Tweets'].map(getPolarity)
df
# Creating a wordcloud to visualise the most frequent words.
# Fixes: join the tweets with a space — the original ''.join glued the last
# word of one tweet onto the first word of the next, corrupting word counts —
# and use the lowercase "bilinear" interpolation name matplotlib accepts
# (the original "Bilinear" is rejected). The unused `lol = WordCloud()`
# leftover is removed.
allWords = ' '.join(df['Tweets'])  # all tweet text in one string
wordCloud = WordCloud(width=1000, height=1000, random_state=21, max_font_size=200).generate(allWords)
plt.imshow(wordCloud, interpolation="bilinear")  # smooth the rendered image
plt.axis('off')
plt.show()
# Map a polarity score onto a sentiment label.
def getAnalysis(score):
    """Classify *score* as 'Negative' (< 0), 'Neutral' (== 0) or 'Positive' (> 0)."""
    if score > 0:
        return 'Positive'
    return 'Neutral' if score == 0 else 'Negative'
# Label each tweet with its sentiment category.
df['Analysis'] = df['Polarity'].apply(getAnalysis)
df
# Scatter plot of Polarity against Subjectivity.
# Fix: the original loop `for i in range(1, df.shape[0])` skipped the last
# row (the index is 1-based after `df.index = df.index + 1`, so valid labels
# run 1..shape[0]). A single vectorised scatter call plots every row and
# avoids the per-point loop entirely.
plt.figure(figsize=(8, 6))
plt.scatter(df["Polarity"], df["Subjectivity"], color='Blue')
# Labelling axes and title
plt.title('Twitter Sentiment Analysis')
plt.xlabel('Polarity')
plt.ylabel('Subjectivity')
plt.show()
# Finding percentage of positive tweets.
# NOTE: the bare expressions below are notebook display cells (this file was
# generated from a Colab notebook); in a plain script they have no effect.
ptweets = df[df.Analysis == 'Positive']
ptweets = ptweets['Tweets']
ptweets
round((ptweets.shape[0] / df.shape[0]) * 100, 1)
# Finding percentage of negative tweets
ntweets = df[df.Analysis == 'Negative']
ntweets = ntweets['Tweets']
ntweets
round((ntweets.shape[0] / df.shape[0]) * 100, 1)
# Finding percentage of neutral tweets
neutweets = df[df.Analysis == 'Neutral']
neutweets = neutweets['Tweets']
neutweets  # fix: the original displayed `ntweets` (the negative set) here by mistake
round((neutweets.shape[0] / df.shape[0]) * 100, 1)
# Show how many tweets fall into each sentiment category.
counts = df['Analysis'].value_counts()
counts
# Bar chart of the sentiment counts.
plt.title('Twitter Sentiment Analysis')
plt.xlabel('Sentiment')
plt.ylabel('Count')
counts.plot(kind='bar')
plt.show()