# Text Mine Twitter - NFL Search Patterns


Ryan Timbrook (RTIMBROO)  
DATE:11/30/2019 <br>
Topic: Search Twitter for tweets on specific NFL Players, Coaches, and Teams


## 1. Objective
_____________________________________________________________________________________________
Capture popular opinion of peoples tweets on certain NFL characters. 
Create a corpus of tweets for sentiment analysis


______________________________________________________________________________________________
### Coding Environment Setup
Import packages

In [1]:
# import packages for analysis and modeling
import pandas as pd #data frame operations
import numpy as np #arrays and math functions
import requests
import os
import io
import pickle
import re
import sys
from os import path
from datetime import date
from datetime import time
from datetime import datetime

In [2]:
# packages for twitter
import tweepy as tw
from tweepy import OAuthHandler
import json
from tweepy import Stream
from tweepy.streaming import StreamListener

# Twython packages for twitter
from twython import Twython

# packages for NLTK
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer

In [3]:
# custome python packages
import rtimbroo_utils as br             # custome python helper functions

In [4]:
# set global properties
notebook_file_name = 'search_twitter_nfl'
report_file_name = 'search_twitter_nfl'
app_name = 'search_twitter_nfl'
log_level = 10 # 10-DEBUG, 20-INFO, 30-WARNING, 40-ERROR, 50-CRITICAL

# setup working directory structure
# set global properties
dataDir = './data'
outputDir = './output'
configDir = './config'
logOutDir = './logs'
imageDir = './images'
modelDir = './models'
corpusDir = './corpus'
# create base output directories if they don't exist
if not os.path.exists(outputDir): os.mkdir(outputDir)
if not os.path.exists(logOutDir): os.mkdir(logOutDir)
if not os.path.exists(imageDir): os.mkdir(imageDir)
if not os.path.exists(modelDir): os.mkdir(modelDir)
if not os.path.exists(dataDir): os.mkdir(dataDir)
if not os.path.exists(configDir): os.mkdir(configDir)
if not os.path.exists(corpusDir): os.mkdir(corpusDir)

In [5]:
# get a logger for troubleshooting / data exploration
logger = br.getFileLogger(logOutDir+'/',app_name,level=log_level)

In [6]:
# get current date
now = datetime.utcnow().isoformat()
collection_date = re.findall('^[0-9]{4}-[0-9]{2}-[0-9]{2}',now)
collection_date = collection_date[0]
collection_date

'2019-12-01'

## 2. OBTAIN the data   
________________________________________________________________________________________________
Import external datasets for evaluation

#### Twitter Search API Limits:
[Sandbox Package](https://developer.twitter.com/en/pricing/search-fullarchive)

* Time frame:	Full history
* Tweets per request:	100
* Counts vs. data:	Data only
* Query length:	128 characters
* Operator availability:	Standard
* Rate limit per minute:	30 requests/min
* Enrichments:	n/a
* Dev environments:	1	
* Monthly Tweet cap:	5k	
* Rate limit per second: 10 requests/sec

### Instantiate Twitter API Object
Using Twython 3.6 Twitter API Wrapper
[Twython 3.6.0 reference documentation](https://twython.readthedocs.io/en/latest/api.html)

In [7]:
# load twitter credentials
with open(f'{configDir}/twitter_credentials.json', 'r') as f:
    tw_cred = json.load(f)

# setup client header arguments to pass along to the API
client_args = {
    'headers':{
        'User-Agent': 'AI_Public_Sentiment_16860838'
    },
    'timeout':300,
    
}
    
# instantiate object
py_tweets = Twython(tw_cred['CONSUMER_KEY'],
                    tw_cred['CONSUMER_SECRET'],
                    tw_cred['ACCESS_TOKEN'],
                    tw_cred['ACCESS_SECRET'],
                   client_args=client_args)

logger.debug(f'{py_tweets.verify_credentials()}')
logger.debug(f'{py_tweets.get_home_timeline()}')

### Configure Search Terms

In [8]:
# set what to search on
nfl_type = 'coach'
search_on = 'bill_obrien'

# setup base twitter search query
search_terms="bill obrien"+" "+"houston texans"

# add filters to search criteria
filtered_search_terms = search_terms + " -filter:retweets"

#search_start_date = '2019-11-23' # limits to the last 7 days

# number of tweets to return
num_tweets = 100 # sandbox rate limit - 100 tweets per request

In [9]:
# search dates (from_date, to_date) - Saturday through Sunday
search_date_ranges = [
    ('2019-10-27','2019-11-3'),
    ('2019-11-3','2019-11-10'),
    ('2019-11-10','2019-11-17'),
    ('2019-11-17','2019-11-24'),
    ('2019-11-24','2019-12-1')
                ]

In [10]:
def convert_str_date(str_date):
    import time
    day_of_week = str_date.split(' ')[0]
    month = str_date.split(' ')[1]
    day_of_month = str_date.split(' ')[2]
    year = str_date.split(' ')[-1]
    time_of_day = str_date.split(' ')[3]


    new_str_date = f'{month} {day_of_month}, {year}'
    ts = time.strptime(new_str_date, '%b %d, %Y')
    new_ds_str = f'{ts.tm_year}-{ts.tm_mon}-{ts.tm_mday}'

    return new_ds_str, time_of_day
    

In [11]:
'''
Function Description: 
'''
def config_query(search_term,since=None,until=None,count=100,lang='en',result_type='mixed'):
    
    # query
    search = {
        'q':search_term,
        'since':since,              # from_date
        'until':until,              # Date format YYYY-MM-DD - returns tweets created before the given date
        'lang':lang,
        'result_type':result_type,    # mixed, recent, popular
        'count':count,                # max is 100, defult is 15 per page

        #'since_id': ,              # returns results with an ID more recent than the specified ID - if the limit of Tweets has occured since the since_id, the since_id will be forced to the oldest ID available
        #'max_id': ,                # returns results with an ID older than or equal to the specified ID
    }
    
    logger.debug(f'config_query: search:\n{search}')
    return search

In [12]:
'''
Function Description: 
'''
def page_search(twitter,query,text_file,raw_file):
    results = twitter.cursor(twitter.search,**query, return_pages=True)
    # search tweets
    tweets_dict = {'id':[],'created_at':[],'date':[],'time':[],'user':[],'text':[],'favorite_count':[]}
    page_cnt = 0
    result_cnt = 0
    with io.open(f'{text_file}', 'a',encoding='utf8') as f:
        with io.open(f'{raw_file}','a',encoding='utf8') as r:
            try: 
                # page is a list of twitter results
                for i, page in enumerate(results):
                    page_cnt +=1
                    logger.info(f'Page: [{i}]')
                    try:
                        for j,result in enumerate(page):
                            #logger.info(f'result type:{type(result)}')
                            #break
                            
                            result_cnt += 1
                            logger.info(f'Result: [{j}]')
                            try:
                                logger.debug(f'{result["id_str"]} | {result["user"]["screen_name"]} | {result["created_at"]} | {result["text"]} | {result["user"]["favourites_count"]}')

                                # dump raw tweet to file as json
                                #raw_tweet = json.load(result)
                                dump = json.dumps(result)
                                r.write(dump)
                                r.write('\n')
                                
                                # dump tweet text to file
                                f.write(f'{result["id_str"]} {result["text"]}')
                                f.write('\n')
                                
                                # add key attributes to tweets dictionary as return results
                                tweets_dict['id'].append(result["id_str"])
                                tweets_dict['created_at'].append(result["created_at"])
                                tweets_dict['date'].append(convert_str_date(result["created_at"])[0])
                                tweets_dict['time'].append(convert_str_date(result["created_at"])[1])        
                                tweets_dict['user'].append(result["user"]["screen_name"])
                                tweets_dict['text'].append(result["text"])
                                tweets_dict['favorite_count'].append(result["user"]["favourites_count"])
                                       
                                #break

                            except BaseException as be:
                                logger.warning(f'**WARNING** Caught BaseException: {be}')
                                             
                    except BaseException as be:
                        logger.warning(f'**WARNING** Caught BaseException: {be}')
                    #break
            except BaseException as be:
                logger.warning(f'**WARNING** Caught BaseException: {be}')
    
    logger.info(f'page_search: processed page_cnt:[{page_cnt}] | total result_cnt: [{result_cnt}]')
    
    return pd.DataFrame.from_dict(tweets_dict)
    

## Execute Twitter Search

[Twython 3.6.0 reference documentation](https://twython.readthedocs.io/en/latest/api.html)

In [13]:
'''
Execute Twitter search by pre-configured date ranges
'''
search_range_results_df = pd.DataFrame()
# execute search by date ranges
for dates in search_date_ranges:
    search_range = f'{dates[0]}_{dates[1]}'
    logger.info(f'search_range: {search_range}')
    
    # output file names based on date range search
    outputPath = f'{dataDir}/{nfl_type}/{search_on}/{search_range}'
    if not os.path.exists(outputPath): os.makedirs(outputPath)
        
    tweet_filename=f'{outputPath}/tweet_text.txt'
    raw_filename=f'{outputPath}/tweet_raw.txt'
        
    if not os.path.exists(f'{tweet_filename}'): open(f'{tweet_filename}', 'a').close()
    if not os.path.exists(f'{raw_filename}'): open(f'{raw_filename}', 'a').close()
    
    
    # configure query by dates
    query = config_query(filtered_search_terms,since=dates[0],until=dates[1],count=100)
    #break
    
    result_df = page_search(py_tweets,query,tweet_filename,raw_filename) 
    
    logger.debug(f'search result df: \n{result_df}')
    # merge dataframes
    search_range_results_df = search_range_results_df.append(result_df, ignore_index=True)
    

search_range: 2019-10-27_2019-11-3
Page: [0]
page_search: processed page_cnt:[1] | total result_cnt: [0]
search_range: 2019-11-3_2019-11-10
Page: [0]
page_search: processed page_cnt:[1] | total result_cnt: [0]
search_range: 2019-11-10_2019-11-17
Page: [0]
page_search: processed page_cnt:[1] | total result_cnt: [0]
search_range: 2019-11-17_2019-11-24
Page: [0]
Result: [0]
Page: [1]
page_search: processed page_cnt:[2] | total result_cnt: [1]
search_range: 2019-11-24_2019-12-1
Page: [0]
Result: [0]
Result: [1]
Result: [2]
Result: [3]
Result: [4]
Result: [5]
Result: [6]
Result: [7]
Result: [8]
Result: [9]
Result: [10]
Result: [11]
Result: [12]
Result: [13]
Result: [14]
Result: [15]
Result: [16]
Result: [17]
--- Logging error ---
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\logging\__init__.py", line 1037, in emit
    stream.write(msg + self.terminator)
  File "C:\ProgramData\Anaconda3\lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(i

In [20]:
search_range_results_df.head()

Unnamed: 0,id,created_at,date,time,user,text,favorite_count
0,1198337897507803142,Sat Nov 23 20:29:53 +0000 2019,2019-11-23,20:29:53,pfrumors,The All-Pro safety was briefly connected to th...,3561.0
1,1200324875115810816,Fri Nov 29 08:05:25 +0000 2019,2019-11-29,08:05:25,TimP103,Bill O'Brien is the most successful ex-Bill Be...,11.0
2,1200086820928622592,Thu Nov 28 16:19:29 +0000 2019,2019-11-28,16:19:29,STERLING9798,"As always, Patriots stand in the way for Bill ...",360.0
3,1200023580542676993,Thu Nov 28 12:08:11 +0000 2019,2019-11-28,12:08:11,monkey_viral,"#As always, Patriots stand in the way for Bill...",569.0
4,1199968219286687745,Thu Nov 28 08:28:12 +0000 2019,2019-11-28,08:28:12,SVOFL,"As always, Patriots stand in the way for Bill ...",34.0


In [21]:
logger.info(f'search_range_results_df shape: {search_range_results_df.shape}')

search_range_results_df shape: (26, 7)


In [22]:
py_tweets.get_lastfunction_header('x-rate-limit-remaining')

'9'

In [23]:
py_tweets.get_home_timeline()
py_tweets.get_lastfunction_header('x-rate-limit-remaining')

'8'

## Save Full DataFrame of search results to csv

In [24]:
outputPath = f'{dataDir}/{nfl_type}/{search_on}'
search_range_results_df.to_csv(f'{outputPath}/search-result_twitter_text_data.csv', index=False)