## Ken Jennings Tweets
### 74 Jeopardy Victories


In [19]:
import datetime
import tweepy

# I've put my API keys in a .py file called API_keys.py
from API_keys import api_key, api_key_secret, access_token, access_token_secret


Using Filecache to improve speed
Setting a long timeout allowed for this to be run in multiple segments, picking up where it left off previously using the stored cached values.

In [20]:
cache_dir = "./cache/"
timeout = 7200000

cache = tweepy.FileCache(cache_dir,timeout)

##### Authenticate the Tweepy API

In [21]:

auth = tweepy.OAuthHandler(api_key,api_key_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth,wait_on_rate_limit=True,cache=cache)


#### I'm putting the handles in a list to iterate through below

In [22]:

team_handles = ['kenjennings','James_Holzhauer']


# This will iterate through each Twitter handle that we're collecting from
for screen_name in team_handles:
    
    # Tells Tweepy we want information on the handle we're collecting from
    # The next line specifies which information we want, which in this case is the number of status of users
    user = api.get_user(screen_name) 
    statuses_count = user.statuses_count

    # Let's see roughly how long it will take to grab all the Tweets
    print(f'''
    @{screen_name} has {statuses_count} tweets. 
    That will take roughly {statuses_count/(5000*60):.0f} hours and {statuses_count/(5000):.2f} minutes
    ''')


    @kenjennings has 26926 tweets. 
    That will take roughly 0 hours and 5.39 minutes
    

    @James_Holzhauer has 806 tweets. 
    That will take roughly 0 hours and 0.16 minutes
    


In [23]:
# This creates a dictionary containing a list for each Twitter handle we'll be grabbing tweets from
id_dict = {'kenjennings' : [], 'James_Holzhauer' : []}

# Grabs the time when we start making requests to the API
start_time = datetime.datetime.now()

# .keys() allows us to iterate through each key in the dictionary
for handle in id_dict.keys():
    
    # To grab the tweets, we will be using followers_ids
    for page in tweepy.Cursor(api.user_timeline,
                             
                              # Once the rate limit is hit, we will be notified that we must wait 15 mins (900 secs)
                              wait_on_rate_limit=True, wait_on_rate_limit_notify=True, compression=True,
                              tweet_mode="extended",
                              screen_name=handle).pages():

        # The page variable comes back as a list, so we have to use .extend rather than .append
        id_dict[handle].extend(page)
        

# Let's see how long it took to grab all Tweets
end_time = datetime.datetime.now()
elapsed_time = end_time - start_time
print(elapsed_time)

0:00:01.067339


In [33]:
headers = ['id','full_text']

for account in id_dict.keys():
     # Descriptions with emoji or non-Roman letters can cause trouble. Encoding your .txt file in utf-8 will help
    with open(f'{account}_tweets.txt','w', encoding='utf-8') as out_file:
        out_file.write('\t'.join(headers) + '\n')

        for idx, ids in enumerate(id_dict[account]):

            outline = [ids.id,
                       ids.full_text.replace("\n","")]

            out_file.write('\t'.join([f"{str(item):<15}" for item in outline]) + '\n')
                
        

* screen_name
* name
* id
* location
* followers_count
* friends_count
* description


## Analysis

In [26]:
import nltk
import numpy as np

from string import punctuation
from collections import Counter

from pprint import pprint
from nltk.corpus import stopwords

sw=stopwords.words('english')

sw.append('rt')

In [27]:
 def get_patterns(text):
        text_clean=[w for w in text.lower().split()]
        text_clean=[w for w in text_clean if w not in sw]
        text_clean=[w for w in text_clean if w.isalpha()]
        
        total_tokens=len(text_clean)
        unique_tokens=len(set(text_clean))
        
        text_clean_len=[len(w) for w in text_clean]
        
        lex_diversity=len(set(text_clean))/len(text_clean)
        
        top_20=Counter(text_clean).most_common(20)
        
        results={'tokens':total_tokens,
                'unique tokens':unique_tokens,
                'average token length':lex_diversity,
                'top 20':top_20}
        return results

In [28]:
def get_users(text):
        text = text.lower().replace(":","").replace(",","").replace("!","")
        usernames=[u for u in text.split()]
       
    #I am looking for the usernames coming with @
        usernames=[u for u in usernames if u[0] == '@']
        
        total_tokens=len(usernames)
        unique_tokens=len(set(usernames))

        top_30=Counter(usernames).most_common(30)
        
        results={'user mentions':total_tokens,
                'unique user mentions':unique_tokens,
             
                'top 30':top_30}
        return results

In [29]:
#Go through all the tweets from the dictionary i generated and combine the full text for analysis functions
for account in id_dict.keys():
    full_text = ""

    for idx, ids in enumerate(id_dict[account]):
        full_text = full_text + " " + (ids.full_text.replace("\n",""))

    print(account)
    print(get_patterns(full_text))
    print()
    print(get_users(full_text))
    print('----')

kenjennings
{'tokens': 17742, 'unique tokens': 5764, 'average token length': 0.32487881862247775, 'top 20': [('new', 243), ('like', 231), ('one', 134), ('get', 100), ('good', 93), ('time', 87), ('people', 86), ('know', 85), ('think', 85), ('trivia', 79), ('see', 78), ('going', 74), ('first', 68), ('never', 68), ('would', 67), ('every', 66), ('jeopardy', 63), ('last', 62), ('really', 61), ('love', 59)]}

{'user mentions': 2661, 'unique user mentions': 1105, 'top 30': [('@omnibusproject', 197), ('@johnroderick', 171), ('@kenjennings', 54), ('@jeopardy', 50), ('@mental_floss', 45), ('@gameshownetwork', 44), ('@buzztronics', 41), ('@james_holzhauer', 34), ('@jessethorn', 30), ('@billcorbett', 28), ('@andyrichter', 24), ('@paulandstorm', 22), ('@omnibusproject.', 22), ('@bradrutter', 21), ('@andylevy', 21), ('@desijed', 19), ('@petersagal', 19), ('@muffymarracco', 18), ('@mrtimlong', 16), ('@blainecapatch', 15), ('@1followernodad', 14), ('@louisvirtel', 14), ('@apelad', 13), ('@jamesurbania

###### One question I would like to answer is: What insights can we gain from their usernames? 
We gain insights from the most thirty common usernames. Generally, the two rivals interact frequently with each other. For example, @james_holzhauer showed up in Ken's account for 34 times and vise versa. Both of them interacted with @jeopardy very often: James interacted with @jeopardy 70 times compared with 50 times from Ken. To Ken, @omnibusproject is a very important project co-run by Ken and John Roderick. Therefore, we can see @omnibusproject and @johnroderick are the top two most common user mentions. In comparison, James seems to have more attention on Jeopardy because Jeopardy is the top one user mentions. Also Jeames may be a fan of Las Vegas hockey team because the username @goldenknights was mentioned seven times. 