In [1]:
import os
import re
import tweepy
import pandas as pd
import numpy as np
from tweepy import OAuthHandler
from textblob import TextBlob

In [3]:
# Credentials come from environment variables so they never live in the
# notebook; a missing variable raises KeyError right here.
auth = tweepy.OAuthHandler(
    os.environ['CONSUMER_KEY'],
    os.environ['CONSUMER_SECRET_KEY'],
)
auth.set_access_token(
    os.environ['ACCESS_TOKEN'],
    os.environ['ACCESS_SECRET_TOKEN'],
)

In [4]:
# Build the API client on top of the authenticated handler.
# NOTE(review): wait_on_rate_limit=True presumably makes tweepy pause until the
# rate-limit window resets instead of raising — confirm for the installed version.
api = tweepy.API(auth, wait_on_rate_limit=True)

In [5]:
def get_label(sentiment, threshold = 0):
    """
    Turn a sentiment score into a text label.

    Returns 'Positive' when the score is strictly above ``threshold``;
    otherwise 'Negative' for any non-zero score and 'Neutral' for a
    score of exactly 0.
    """
    if sentiment > threshold:
        return 'Positive'
    if sentiment != 0:
        return 'Negative'
    return 'Neutral'


In [6]:
# Inefficient way of getting the data because of the Twitter rate limit:
# each call returns a single page of at most `count` results.
public_tweets = {
    'kobe24': api.search('Kobe Bryant', count=20),
    'lebron23': api.search('Lebron James', count=20),
}
print([len(results) for results in public_tweets.values()])


[20, 20]


In [7]:
# Correct way of getting as much data as we need: one Cursor-backed item
# iterator per search query, each capped at 10000 tweets.
public_tweets = {
    key: tweepy.Cursor(api.search, q=query, lang='en').items(10000)
    for key, query in [('kobe24', "Kobe Bryant"), ('lebron23', "Lebron James")]
}
print(public_tweets['kobe24'])


<tweepy.cursor.ItemIterator object at 0x000001EC1649F588>


In [8]:
# Empty frame with the four columns used by the rest of the notebook.
df = pd.DataFrame(
    columns=[
        'name',
        'text',
        'analysis',
        'analysis_score',
    ]
)

In [10]:
# Collect rows in a plain list and build the frame once: DataFrame.append
# copied the whole frame for every tweet (quadratic) and was removed in
# pandas 2.0. The loop variable is no longer called `id`, which shadowed the
# builtin, and the unused per-tweet TextBlob object is gone.
records = []
try:
    for label, tweets in public_tweets.items():
        for tweet in tweets:
            records.append({'name': label, 'text': tweet.text})
finally:
    # Runs even when the stream dies mid-way (e.g. a connection reset), so the
    # tweets fetched so far are kept; the exception itself still propagates.
    fetched = pd.DataFrame(records, columns=df.columns)
    # Skip concat while df is still empty to avoid concatenating an all-empty frame.
    df = fetched if df.empty else pd.concat([df, fetched], ignore_index=True)
        

TweepError: Failed to send request: ('Connection aborted.', OSError("(10054, 'WSAECONNRESET')"))

In [None]:
# Quick size check: (rows, columns) of the collected tweets frame.
df.shape

In [None]:
# Show full tweet text instead of truncating wide columns. None means
# "no limit"; the old -1 sentinel was deprecated in pandas 1.0 and is
# rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
df.tail()

In [None]:
from bs4 import BeautifulSoup
import re

In [None]:
# Step 1: let BeautifulSoup strip markup / decode HTML entities.
# Step 2: blank out @mentions, URLs and every character that is not an ASCII
# letter, digit, space or tab, then collapse runs of whitespace.
# The pattern is a raw string because "\w", "\/" and "\S" are invalid escapes
# in a normal literal. The mention class includes "_" so "@user_name" is
# removed whole instead of leaving "name" behind in the text.
tweet_noise = re.compile(r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+://\S+)")
df['text'] = df['text'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())
df['text'] = df['text'].apply(lambda x: ' '.join(tweet_noise.sub(" ", x).split()))

In [None]:
# keep=False discards every tweet whose text occurs more than once
# (all copies, not just the repeats); surviving rows keep their index labels.
df = df.drop_duplicates(subset="text", keep=False)

In [None]:
def sentiment_analysis(tweet):
    """Return the first field of TextBlob's sentiment result (polarity) for one tweet."""
    polarity = TextBlob(tweet).sentiment.polarity
    return polarity


# Score every tweet, then turn each score into a label with get_label.
df['analysis_score'] = df['text'].apply(sentiment_analysis)
df['analysis'] = df['analysis_score'].apply(get_label)

In [None]:
def vectorized_get_label(sentiment, threshold = 0.0):
    """
    Vectorized counterpart of ``get_label``: label many scores at once.

    The original comparison was inverted (scores below the threshold came
    back as 'Positive') and never produced 'Neutral'; this version applies
    the same rules as ``get_label`` element-wise.

    Parameters
    ----------
    sentiment : array-like of numbers (pandas Series, numpy array, list).
    threshold : scores strictly above this value are 'Positive'.

    Returns
    -------
    numpy.ndarray of str with the same shape as ``sentiment``:
    'Positive' above ``threshold``, otherwise 'Negative' for any non-zero
    score and 'Neutral' for a score of exactly 0.
    """
    scores = np.asarray(sentiment, dtype=float)
    # The inner where only decides elements that failed the 'Positive' test.
    return np.where(scores > threshold, 'Positive',
                    np.where(scores != 0, 'Negative', 'Neutral'))


In [None]:
# Recompute the 'analysis' labels from the scores in one vectorized call,
# overwriting the labels assigned earlier via get_label.
df['analysis'] = vectorized_get_label(df['analysis_score'])

In [None]:
# Mean sentiment score per `name` group. numeric_only=True skips the string
# columns (text, analysis), which pandas >= 2.0 refuses to average and
# raises a TypeError for.
df.groupby(['name']).mean(numeric_only=True)