-
Notifications
You must be signed in to change notification settings - Fork 0
/
twitter_data_analysis_prep.py
138 lines (110 loc) · 4.25 KB
/
twitter_data_analysis_prep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# -*- coding: utf-8 -*-
"""Twitter Data Analysis Prep.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1U72gkjOj3v4Gl5vXR7c5SK1vL9XrXTO-
"""
# This is a program that will parse tweets fetched from Twitter using Python (Colab)
# Importing the libraries
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tweepy
from textblob import TextBlob
from wordcloud import WordCloud
plt.style.use("fivethirtyeight")
# Twitter API credentials.
# SECURITY: the original file hard-coded live API keys and tokens here.
# Anything committed to source control must be treated as compromised —
# rotate those keys, then supply fresh ones via the environment variables
# below instead of embedding them in code.
consumerKey = os.environ.get('TWITTER_CONSUMER_KEY', '')
consumerSecret = os.environ.get('TWITTER_CONSUMER_SECRET', '')
accessToken = os.environ.get('TWITTER_ACCESS_TOKEN', '')
accessTokenSecret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '')

# Creating the authentication object
authenticate = tweepy.OAuthHandler(consumerKey, consumerSecret)
# Setting the access token and access token secret
authenticate.set_access_token(accessToken, accessTokenSecret)
# Creating the API object while passing the authentication information;
# wait_on_rate_limit makes tweepy sleep through rate-limit windows.
api = tweepy.API(authenticate, wait_on_rate_limit=True)

# Extracting tweets from the twitter user.
# input() already returns a str, so the original str(...) wrapper was redundant.
account = input("Enter the twitter account you would like to see: @")
num = int(input("Enter the number of recent tweets you would like to use: "))
posts = api.user_timeline(screen_name=account, count=num, lang="en", tweet_mode="extended")

print("Show the ", num, " recent tweets \n")
# enumerate replaces the original manual i = 1 / i += 1 counter.
for i, tweet in enumerate(posts[:num], start=1):
    print(str(i) + ') ' + tweet.full_text + '\n')

# Creating a data frame with a column called Tweets so it looks nice
df = pd.DataFrame([tweet.full_text for tweet in posts], columns=['Tweets'])
df.index = df.index + 1  # 1-based index for nicer display
df.head()
# Cleaning the text by removing hashtags, @mentions and URLs.
def CleanTxt(text):
    """Return *text* with @mentions, hashtags, URLs and the retweet marker removed.

    Mentions and hashtags are replaced by a single space; URLs and the
    'RT ' marker are deleted outright, so runs of whitespace may remain.
    """
    # Raw strings fix the invalid escape sequences ('\/', '\S', '\s') that the
    # original non-raw patterns produced warnings for; the patterns themselves
    # are unchanged, so behavior is identical.
    text = re.sub(r"@(\w+)", ' ', text)         # remove @mentions
    text = re.sub(r"#(\w+)", ' ', text)         # remove hashtags
    text = re.sub(r'https?:\/\/\S+', '', text)  # remove URLs
    text = re.sub(r'RT[\s]+', '', text)         # remove the retweet marker
    return text
# Run every tweet through the cleaner, then show the cleaned frame.
df['Tweets'] = df['Tweets'].map(CleanTxt)
df
# Subjectivity: TextBlob's measure of how opinion-based the text is.
def getSubjectivity(text):
    """Return the subjectivity component of TextBlob's sentiment for *text*."""
    blob = TextBlob(text)
    return blob.sentiment.subjectivity
# Polarity: TextBlob's measure of how positive or negative the text is.
def getPolarity(text):
    """Return the polarity component of TextBlob's sentiment for *text*."""
    blob = TextBlob(text)
    return blob.sentiment.polarity
# Score every cleaned tweet, storing the results in two new columns,
# then display the enriched frame.
df['Subjectivity'] = df['Tweets'].map(getSubjectivity)
df['Polarity'] = df['Tweets'].map(getPolarity)
df
# Creating a wordcloud to visualise the most frequent words.
# Fixes: join the tweets with a space — the original ''.join glued the last
# word of one tweet onto the first word of the next, corrupting word counts —
# and use the lowercase "bilinear" interpolation name matplotlib accepts
# (the original "Bilinear" is rejected). The unused `lol = WordCloud()`
# leftover is removed.
allWords = ' '.join(df['Tweets'])  # all tweet text in one string
wordCloud = WordCloud(width=1000, height=1000, random_state=21, max_font_size=200).generate(allWords)
plt.imshow(wordCloud, interpolation="bilinear")  # smooth the rendered image
plt.axis('off')
plt.show()
# Map a polarity score onto a sentiment label.
def getAnalysis(score):
    """Classify *score* as 'Negative' (< 0), 'Neutral' (== 0) or 'Positive' (> 0)."""
    if score > 0:
        return 'Positive'
    return 'Neutral' if score == 0 else 'Negative'
# Label each tweet with its sentiment category.
df['Analysis'] = df['Polarity'].apply(getAnalysis)
df
# Scatter plot of Polarity against Subjectivity.
# Fix: the original loop `for i in range(1, df.shape[0])` skipped the last
# row (the index is 1-based after `df.index = df.index + 1`, so valid labels
# run 1..shape[0]). A single vectorised scatter call plots every row and
# avoids the per-point loop entirely.
plt.figure(figsize=(8, 6))
plt.scatter(df["Polarity"], df["Subjectivity"], color='Blue')
# Labelling axes and title
plt.title('Twitter Sentiment Analysis')
plt.xlabel('Polarity')
plt.ylabel('Subjectivity')
plt.show()
# Finding percentage of positive tweets.
# NOTE: the bare expressions below are notebook display cells (this file was
# generated from a Colab notebook); in a plain script they have no effect.
ptweets = df[df.Analysis == 'Positive']
ptweets = ptweets['Tweets']
ptweets
round((ptweets.shape[0] / df.shape[0]) * 100, 1)
# Finding percentage of negative tweets
ntweets = df[df.Analysis == 'Negative']
ntweets = ntweets['Tweets']
ntweets
round((ntweets.shape[0] / df.shape[0]) * 100, 1)
# Finding percentage of neutral tweets
neutweets = df[df.Analysis == 'Neutral']
neutweets = neutweets['Tweets']
neutweets  # fix: the original displayed `ntweets` (the negative set) here by mistake
round((neutweets.shape[0] / df.shape[0]) * 100, 1)
# Show how many tweets fall into each sentiment category.
counts = df['Analysis'].value_counts()
counts
# Bar chart of the sentiment counts.
plt.title('Twitter Sentiment Analysis')
plt.xlabel('Sentiment')
plt.ylabel('Count')
counts.plot(kind='bar')
plt.show()