# 1. Data Analysis of the Influencers:
# 2. Sentiment Analysis of the Influencers: 


In [None]:
# Import the Dependencies
import tweepy
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import time
from datetime import datetime, timezone
# import seaborn as sns
from os import path, makedirs # fetch path and makedirs function from os file
import csv # fetch csv file
from glob import glob # fetching glob function only from the glob lib

In [None]:
# Import keys from the config file
from config import consumer_key, consumer_secret, access_token, access_token_secret

In [None]:
# # Twitter API Keys
# consumer_key = 'Your Key'
# consumer_secret = 'Your Key'
# access_token = 'Your Key'
# access_token_secret = 'Your Key'

In [None]:
# Import and initialize the VADER sentiment analyzer.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Single shared analyzer instance; the sentiment cell below calls
# analyzer.polarity_scores() on each tweet's text.
analyzer = SentimentIntensityAnalyzer()

In [None]:
# Set up Tweepy API authentication using the four keys imported from config.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# JSONParser is requested so that API results can be indexed like dicts
# (e.g. tweet["text"], user_account["followers_count"]) in the cells below.
api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())

In [None]:
# Load the CSV file containing the details of the Influencers.
# The extraction loop below reads the "Twitter_Handle" and "Genre" columns
# from this table.
influencer_data_load = "RawData/SentimentInfluencerInputData.csv"
influencer_data_read = pd.read_csv(influencer_data_load)

In [None]:
# Walk the influencer table: collect each account's recent tweets and write
# its profile statistics back into the same table.
# NOTE: Data_Influencers_DF is an alias of influencer_data_read, not a copy.
Data_Influencers_DF = influencer_data_read
Not_Found = 0

# SS - accumulator holding one dict per tweet, across all influencers
all_tweet_listing = []

print("-----------Start extraction!!!-----------")

for index, row in Data_Influencers_DF.iterrows():
    target_user = row["Twitter_Handle"]
    Genre = row["Genre"]

    try:
        # SS - request up to 200 timeline tweets for this influencer
        public_tweets = api.user_timeline(target_user, count=200, result_type="recent")

        # SS - record every tweet, with its timestamp reformatted as MM/DD/YYYY
        for tweet in public_tweets:
            tweeted_at = datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S %z %Y')
            Date = tweeted_at.strftime('%m/%d/%Y')
            all_tweet_listing.append({
                "Influencer": target_user,
                "Date": Date,
                "Genre": Genre,
                "Tweet": tweet["text"],
            })

        user_account = api.get_user(target_user)
        user_geo_enabled = user_account["geo_enabled"]

        # Location is kept only for geo-enabled accounts; otherwise 'NA'.
        Data_Influencers_DF.at[index, "Loc"] = (
            user_account["location"] if user_geo_enabled == True else 'NA'
        )
        # Language is flagged 'Eng' only when the profile language is 'en'.
        Data_Influencers_DF.at[index, "Lang"] = (
            'Eng' if user_account["lang"] == 'en' else 'NA'
        )

        # Parse the account creation timestamp once and derive both the
        # formatted creation date and the account age in whole days.
        account_created = datetime.strptime(user_account['created_at'], '%a %b %d %H:%M:%S %z %Y')
        Data_Influencers_DF.at[index, "Created On"] = account_created.strftime('%m/%d/%Y')
        Data_Influencers_DF.at[index, "Age Of Account"] = (
            datetime.now(timezone.utc) - account_created
        ).days

        #  Data_Influencers_DF.at[index, "Real Name"] = user_real_name
        # Copy the plain counters from the profile: (table column, API field).
        for column, field in (
            ("Tweets", "statuses_count"),
            ("Followers", "followers_count"),
            ("Following", "friends_count"),
            ("Favorites Count", "favourites_count"),
        ):
            Data_Influencers_DF.at[index, column] = user_account[field]

    except tweepy.TweepError as e:
        # Count the failure and move on to the next influencer.
        Not_Found += 1
        print(f"exception for {target_user}: {e}")

print("----------- Extraction Complete !!!-----------")
print(Not_Found)

In [None]:
# SS - turn the list of per-tweet dicts into a dataframe (one row per tweet)
tweet_listing_pd = pd.DataFrame(all_tweet_listing)

In [None]:
# SS - summary stats (row count, columns, dtypes, memory) for the influencers'
# tweet listing. PLEASE DON'T REMOVE. Required to quantify digital footprint!
tweet_listing_pd.info()

In [None]:
# SS - the 1000 most frequent words across all influencer tweets.
# All tweet texts are joined into one lower-cased string and split on
# whitespace, so hashtags, mentions and URLs count as words too.
Top_1000 = (
    pd.Series(' '.join(tweet_listing_pd['Tweet']).lower().split())
    .value_counts()
    .head(1000)
)

In [None]:
# Persist the keyword counts; index=True keeps the keyword itself (the
# Series index) as the first CSV column next to its count.
Top_1000.to_csv("RawData/Top_1000_keywords.csv", index=True, header=True)

In [None]:
# SS - write the full tweet listing to CSV for later analysis
# (row index omitted, header row included).
tweet_listing_pd.to_csv("RawData/TweetListings.csv", index=False, header=True)

In [None]:
# #SS - define target tags for social and entertainment

# social_target_tags = ["#FamiliesBelongTogetherMarch","#gun","gun","shooting","gun-control","election","#metoo","metoo","FamiliesBelongTogetherMarch","PrideMonth","#PrideMonth","FamiliesBelongTogether","ChildreninCages","UniteTheFamilies","WeCare"]

# entertainment_target_tags = ["#SocialMediaDay","SocialMediaDay","WorldCup","#WorldCup","#fifa","fifa", "#worldcup2018russia","#PostASongLyricYouLove"]

# #SS - define lists to hold tweets based on tags

# social_tweet_list_dict = []

# entertainment_tweet_list_dict = []

# for index, row in tweet_listing_pd.iterrows():
    
#     tweet_listing_filtered = [tweet_listing_pd[tweet_listing_pd['Tweet'].str.contains(x)] for x in target_tags]
    
#     for tweet_list in tweet_listing_filtered:
#         Date = tweet_list["Date"]
#         Genre = tweet_list["Genre"]
#         Influencer = tweet_list["Influencer"]
#         Tweet = tweet_list["Tweet"]
#         tweet_list_dict.append({
#             "Data" : Data,
#             "Genre":Genre,
#             "Influencer": Influencer,
#             "Tweet" : Tweet
#         })

In [None]:
# Remove rows with missing values.
# dropna() returns a new frame and leaves the original untouched, so the
# result must be assigned back for the cleanup to take effect. Rows can have
# missing values when, for instance, the API lookup for an influencer raised
# and the profile columns were never filled in for that row.
Data_Influencers_DF = Data_Influencers_DF.dropna()
# Display the cleaned table as the cell output.
Data_Influencers_DF

In [None]:
# Per-genre roll-up of the influencer table.
# 'Average Tweets' duplicates 'Tweets' so that the same values can be both
# summed ('Tweets') and averaged ('Average Tweets') in a single agg() call.
Data_Influencers_DF['Average Tweets'] = Data_Influencers_DF['Tweets']
AggregatedGenre = Data_Influencers_DF.groupby(["Genre"]).agg(
    {
        'Genre': 'count',
        'Tweets': 'sum',
        'Followers': 'sum',
        'Average Tweets': 'mean',
        'Age Of Account': 'mean',
    }
)
# Show the genres ordered by total followers, largest first.
AggregatedGenre.sort_values(by=['Followers'], ascending=False)


In [None]:
# Genre-level aggregates ordered by average tweets per influencer, highest first
AggregatedGenre.sort_values(by=['Average Tweets'], ascending=False)

In [None]:
# The ten influencers with the highest tweet counts (most active accounts)
top_ten_twitters = (
    Data_Influencers_DF
    .sort_values(by=['Tweets'], ascending=False)
    .head(10)
)
top_ten_twitters

In [None]:
# The ten influencers with the most followers
top_ten_influencer = (
    Data_Influencers_DF
    .sort_values(by=['Followers'], ascending=False)
    .head(10)
)
top_ten_influencer

In [None]:
# Work-in-progress test cell:
# find the handles of the followers of the most active
# influencer so that we can send the tweet through a bot.
# top_ten_twitters.head(1)['Twitter_Handle'].map(lambda x: x.lstrip('@'))
# import time
# ids = []
# for page in tweepy.Cursor(api.followers_ids, top_ten_twitters.head(1)['Twitter_Handle'].map(lambda x: x.lstrip('@'))).pages():
#     ids.extend(page)
#     time.sleep(60)

# 2. Sentiment Analysis of the Influencers: 

In [None]:
###
# Influencers = influencer_data_read
# s = Influencers.iloc[:,0]


In [None]:
# Extract recent tweets posted by the ten most active influencers
# (top_ten_twitters) and score each tweet with the sentiment analyzer.
print("-----------Start extraction of the tweets posted by the Influencers!!!-----------")
Influencers = top_ten_twitters

# One dict per scored tweet; intended to be turned into a dataframe later.
Sentiment_array = []

# Iterate over the handle column explicitly: looping over the DataFrame
# itself would yield its column labels, not the influencers.
for user in Influencers["Twitter_Handle"]:
    # "Tweets Ago" label: restarts at 100 for every influencer and counts
    # down by one for each tweet returned.
    tweet_count = 100
    print("Extracting tweets from %s"%user)

    # Only the first timeline page is requested; widen the range to pull
    # further pages for each influencer.
    for x in range(1):
        try:
            influencer_tweets = api.user_timeline(user, page=x + 1)
        except tweepy.TweepError as e:
            # Same best-effort policy as the profile extraction loop:
            # report the failure and carry on with the remaining work.
            print(f"exception for {user}: {e}")
            continue

        # Score every tweet of THIS influencer (the loop must live inside
        # the per-user loop, otherwise only the last timeline is processed).
        for tweet in influencer_tweets:
            # Run the analyzer once per tweet and reuse all four scores.
            scores = analyzer.polarity_scores(tweet["text"])

            # Save the tweet and its scores as a dictionary item
            Sentiment_array.append({"Influencers" : user,
                                    "Tweet Text" : tweet["text"],
                                    "Compound" : scores["compound"],
                                    "Positive" : scores["pos"],
                                    "Negative" : scores["neg"],
                                    "Neutral" : scores["neu"],
                                    "Date" : tweet["created_at"],
                                    "Tweets Ago" : tweet_count
                                   })

            # Decrease the tweet counter by 1 for the next (older) tweet
            tweet_count -= 1

print("-----------End of Extraction of Tweets !!!-----------")


In [None]:
# # Create a dataframe from the dictionary items of the Sentiment Array
# Sentiment_DF = pd.DataFrame.from_dict(Sentiment_array)

# # Remove the '@' from the 'influence' column in the data frame
# Sentiment_DF["Influencers"] = Sentiment_DF["Influencers"].map(lambda x: x.lstrip('@'))

# # Rearrange the columns and save into a CSV file
# Sentiment_DF = Sentiment_DF[["Influencers", "Date", "Tweet Text"
#                              , "Compound", "Positive", "Negative"
#                              , "Neutral", "Tweets Ago"
#                             ]]

# # Store output in a .CSV File
# Sentiment_DF.to_csv("influencer_tweets_Analysis.csv")

# # Sentiment_DF

In [None]:
# # Calculate the mean for each Influencers & store into a dataframe
# Influencers_Comp_Mean = Sentiment_DF.groupby("Influencers").mean()["Compound"].to_frame()

# #Reset the index 
# Influencers_Comp_Mean.reset_index(inplace=True)

# Influencers_Comp_Mean

In [None]:
# # Set the values for x_axis & y_axis
# x_axis = Influencers_Comp_Mean.index.values
# y_axis = Influencers_Comp_Mean["Compound"]
# X_Label = ["@Oprah","@KimKardashian","@realDonaldTrump","@justinbieber","@KylieJenner"]

# # Initialize the plots.
# fig,ax = plt.subplots()#  function that returns a tuple containing a figure and axes object(s)

# #Set the plot and assign the values like colors etc
# bars = ax.bar(x_axis,y_axis
#               , align = "edge"
#               , width = 1
#               , linewidth = 1
#               , edgecolor = 'black'
#               , color = ["yellow","lime","red","orange","pink"]
#              )

# # Set the tick(s) of the bar graph
# tick_locations = [value + 0.5 for value in range(len(x_axis))]
# plt.xticks(tick_locations,X_Label,rotation='vertical')

# # If value is positive then put True in the Summary else place False
# Influencers_Comp_Mean["Positive"] = Influencers_Comp_Mean["Compound"] > 0

# # Assign the height based on positive value after allocating True / false value
# height = Influencers_Comp_Mean.Positive.map({True: 0.03 , False: -0.03})

# # # Set the value on labels on the bars
# for bar in bars:
#     ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + height[bars.index(bar)]
#             , round(Influencers_Comp_Mean["Compound"][bars.index(bar)],3)
#             , ha = 'center'
#             , va = 'bottom'
#             )

# # Set the x_axis limits
# ax.set_xlim(0, len(x_axis))

# # Dynamically set the y_axis limits by finding the max & min value of y-axis
# ax.set_ylim(min(y_axis)-0.1, max(y_axis) + 0.1)

# # Set a horizontal line at y = 0
# plt.hlines(0,0,len(x_axis))

# # Title of the graph
# ax.set_title("Sentiments on Twitter of Influencers (%s)" % (time.strftime("%x")), fontsize=16)

# # Setting the y_axis label
# ax.set_ylabel("Polarity on Twitter ", fontsize=14)

# # # Setting the x_axis label
# ax.set_xlabel("The Influencers", fontsize=14)
  
# # Saving the graph
# plt.savefig("The Influencer Twitter Sentiment .png",bbox_inches='tight')
# plt.show()

In [None]:
# # Create an array of the unique Influencers using the unique function of the data frame
# Influencers_array = Sentiment_DF["Influencers"].unique()
# Influencers_array

# # #Plotting the graph for each influencer
# for influencer in Influencers_array:
# # Creating a temporary data frame to store for only one influencer at a time
#         Temp_DF = Sentiment_DF[Sentiment_DF["influencer"] == influencer]
        
#         Sentiment_DF['influencer'] = Sentiment_DF['influencer'].map(lambda x: x.lstrip('@'))
# #Temp_DF
    
#         plt.scatter(Temp_DF["Tweets Ago"],Temp_DF["Compound"]
#                  , marker = "o", linewidth = 0, alpha = 0.8, label = Influencers
#                  , facecolors = Temp_DF.influencer.map({"@Oprah": "blue"
#                                                    , "@KimKardashian" : "lime"
#                                                    , "@realDonaldTrump": 'indigo'
#                                                    , "@justinbieber":"fuchsia"
#                                                    , "@KylieJenner":"gold"
#                                                   })
#                 )

# # # Set the legend 
# plt.legend(bbox_to_anchor = (1,1), title="The Influencers", loc='best')

# # # Set the labels of x_axis, y_axis & title 
# plt.xlabel("Tweets Ago", fontsize=12)
# plt.ylabel("Tweet Polarity", fontsize=12)
# plt.title("Sentiment Analysis of The Influencers Tweets (%s)" % (time.strftime("%x")), fontsize=16)

# # # Set the limits of x_axis and y_axis
# plt.xlim(0, 101)
# plt.ylim(-1,1)

# # # Set the grid
# plt.grid(True)

# filePath = 'Images'
# if not path.exists(filePath):
#     makedirs(filePath)

# # Save the result to a .png file
# plt.savefig("Sentiment Analysis of Influencers Tweets.png",bbox_inches='tight')
# # plt.savefig("Sentiment Analysis of The influencer's Tweets.png",bbox_inches='tight')

# plt.show()

In [None]:
# Version: 2.0
# Date: Sunday 7/1
# Time: 03:40 PM
# Functionalities: 
# A) Data Analysis
#     1. Created dataframe.
#     2. Sorting
# B) Sentiment Analysis


# Pending items:
# Data Cleaning