In [2]:
import os
import re
import tweepy
import pandas as pd
import numpy as np
from tweepy import OAuthHandler
from textblob import TextBlob

In [3]:
auth = tweepy.OAuthHandler(os.environ['CONSUMER_KEY'], os.environ['CONSUMER_SECRET_KEY'])
auth.set_access_token(os.environ['ACCESS_TOKEN'], os.environ['ACCESS_SECRET_TOKEN'])

In [4]:
api = tweepy.API(auth, wait_on_rate_limit=True)

In [5]:
def get_label(sentiment, threshold = 0):
    """
    Labelization of the parameters for analysis
    """
    return 'Positive' if sentiment > threshold else 'Negative' if sentiment !=0 else 'Neutral'


In [6]:
# Ineffecient Way of getting the data due to the twitter rate limit
public_tweets = {}
public_tweets['kobe24'] = api.search('Kobe Bryant', count=20)
public_tweets['lebron23'] = api.search('Lebron James', count=20)
print([len(items) for i, items in public_tweets.items()])


[20, 15]


In [7]:
# Correct way of getting as much data as we need
public_tweets = {}
public_tweets['kobe24'] = tweepy.Cursor(api.search,\
                                        q="Kobe Bryant",\
                                        lang='en').items(5000)
public_tweets['lebron23'] = tweepy.Cursor(api.search,\
                                          q="Lebron James",\
                                          lang='en').items(5000)
print(public_tweets['kobe24'])


<tweepy.cursor.ItemIterator object at 0x0000020F2BE945C0>


In [8]:
df = pd.DataFrame(columns=['name', 'text', 'analysis', 'analysis_score'])

In [9]:
for id, tweets in public_tweets.items():
    for tweet in tweets:
        analysis = TextBlob(tweet.text)
        df = df.append({
            'name': id,
            'text': tweet.text,
        }, ignore_index=True)
        

In [10]:
df.shape

(10000, 4)

In [11]:
pd.set_option('display.max_colwidth', -1)
df.tail()

Unnamed: 0,name,text,analysis,analysis_score
9995,lebron23,"RT @billoram: “Shit don’t fit right.” Maybe the uniform felt a little snug after a month off, but LeBron James clicked right back into plac…",,
9996,lebron23,RT @SLAMonline: His greatness has become the definition of what a love for the game should look like. 👑\n\nLeBron James covers SLAM 220: http…,,
9997,lebron23,RT @TheSteinLine: NAMED special roster additions ... there will now be 13 players on each of the squads picked by captains LeBron James and…,,
9998,lebron23,RT @asvpxflyy: Attention basketball world. Lebron James is better than Kobe Bryant and Mj... Tell me how without mentioning Rings,,
9999,lebron23,RT @Suns: 24.8 points | 6.7 assists | 3.6 rebounds\n\n@DevinBook joins James Harden and LeBron James as ONLY players in the league averaging…,,


In [12]:
from bs4 import BeautifulSoup
import re

In [13]:
df['text'] = df['text'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())
df['text'] = df['text'].apply(lambda x: ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", x).split()))

In [14]:
df.drop_duplicates(subset = "text", keep = False, inplace = True)

In [15]:
def sentiment_analysis(tweet):
    analysis = TextBlob(tweet)
    return analysis.sentiment[0]
df['analysis_score'] = df['text'].apply(sentiment_analysis)
df['analysis'] = df['analysis_score'].apply(get_label)

In [16]:
def vectorized_get_label(sentiment, threshold = 0.0):
    """
    Labelization of the parameters for analysis using vectorization
    """
    return np.where((sentiment) < threshold, 'Positive', 'Negative')


In [17]:
df['analysis'] = vectorized_get_label(df['analysis_score'])

In [18]:
df.head()

Unnamed: 0,name,text,analysis,analysis_score
5,kobe24,Kyrie s mentor KOBE BRYANT Who s team would be love to join to emulate his hero THE LOS ANGELES LAKERS,Negative,0.5
7,kobe24,Even Charlotte traded Kobe Bryant,Negative,0.0
14,kobe24,10 years ago today Kobe Bryant of the Los Angeles Lakers scored 61 points in a 126 117 win over the New York Kni,Negative,0.468182
24,kobe24,vs the Lakers signing Kobe Bryant to a Max even after the injury,Negative,0.0
32,kobe24,Kobe Bryant s All Game Winners of his Career 36 via Cheer up,Positive,-0.4


In [19]:
# Calculating the mean
df.groupby(['name']).mean()

Unnamed: 0_level_0,analysis_score
name,Unnamed: 1_level_1
kobe24,0.147418
lebron23,0.105766
