In [1]:
from __future__ import absolute_import, print_function
from tweepy import OAuthHandler, Stream, StreamListener
import pandas as pd
import numpy as np
import json
import os
import io
import re
from datetime import date
from datetime import time
from datetime import datetime

# custome util package
import rtimbroo_utils as rt

In [2]:
# toggle for working with colab
isColab = False

In [3]:
# set global properties
notebook_file_name = 'stream_mine_tweets'
report_file_name = 'stream_mine_tweets'
app_name = 'stream_mine_tweets'
log_level = 10 # 10-DEBUG, 20-INFO, 30-WARNING, 40-ERROR, 50-CRITICAL

# setup working directory structure
# set global properties
if not isColab:
    dataDir = './data'
    outputDir = './output'
    configDir = './config'
    logOutDir = './logs'
    imageDir = './images'
    modelDir = './models'
    corpusDir = './corpus'
else:
    # working within colab
    dataDir = f'{base_dir}data'
    outputDir = f'{base_dir}output'
    configDir = f'{base_dir}config'
    logOutDir = f'{base_dir}logs'
    imageDir = f'{base_dir}images'
    modelDir = f'{base_dir}models'
    corpusDir = f'{base_dir}corpus'

In [4]:
# create base output directories if they don't exist
if not os.path.exists(outputDir): os.mkdir(outputDir)
if not os.path.exists(logOutDir): os.mkdir(logOutDir)
if not os.path.exists(imageDir): os.mkdir(imageDir)
if not os.path.exists(modelDir): os.mkdir(modelDir)
if not os.path.exists(corpusDir): os.mkdir(corpusDir)

In [5]:
# get current date
now = datetime.utcnow().isoformat()
collection_date = re.findall('^[0-9]{4}-[0-9]{2}-[0-9]{2}',now)
collection_date

['2019-11-21']

In [6]:
# get a logger for troubleshooting / data exploration
logger = rt.getFileLogger(logOutDir+'/',app_name+'_'+collection_date[0],level=log_level)

In [7]:
# get current date
now = datetime.utcnow().isoformat()
collection_date = re.findall('^[0-9]{4}-[0-9]{2}-[0-9]{2}',now)
collection_date

['2019-11-21']

In [8]:
class StdOutListener(StreamListener):
    """ A listener handles tweets that are received from the stream.
    """
    max_tweets = 0
    tweet_count = 0
    tweet_filename = ''
    raw_filename = ''
    logger = None
    
    def __init__(self,logger,max_tweets,tweet_filename,raw_filename):
        self.logger = logger
        self.max_tweets = max_tweets
        self.tweet_filename = tweet_filename
        self.raw_filename = raw_filename
        
    #def on_status(self,status):
       # print(f'on_status: Tweet Count: {self.tweet_count}')
        
    def on_data(self, data):
        self.tweet_count+=1
        logger.info(f'on_data: Tweet Count: {self.tweet_count}')
        logger.debug(f'tweet data dump:\n {data}')
        
        try:
        # write the tweet text to file
            with io.open(f'{self.tweet_filename}','a',encoding='utf8') as f:
                logger.debug(f'Opened File {self.tweet_filename} for appending')
                with io.open(f'{self.raw_filename}','a',encoding='utf8') as r:
                    logger.debug(f'Opened File {self.raw_filename} for appending')
                    
                    # load tweet data as json
                    logger.debug(f'Loading Tweet Data as json')
                    raw_tweet=json.loads(data)
                    
                    # get tweet text
                    tweet_text=raw_tweet['text']
                    logger.debug(f'Tweet Text:\n{tweet_text}\n')
                    logger.info(f'Writing Tweet Text to file...')
                    f.write('\n')
                    f.write(tweet_text)
                    
                    logger.debug(f'Dump Raw Tweet Text as json:\n{json.dumps(raw_tweet, sort_keys=True, indent=4)}')
                    logger.info(f'Dumping Raw Tweet to file...')
                    r.write('\n')
                    json.dump(raw_tweet,r)
        
        except BaseException as be:
            logger.warning(f'**WARNING** Caught Exception writting tweet to file: {be}')
            pass
        
        if self.tweet_count >= self.max_tweets:
            logger.warning(f'Max Tweets Reached: {self.max_tweets}')
            return False
        
        return True

    def on_error(self, status):
        logger.error("ERROR")
        if(status==420):
            logger.error(f'**ERROR** Rate Limited Reached | status:{status}')

In [9]:
# load twitter credentials
with open(f'{configDir}/twitter_credentials.json', 'r') as f:
    tw_cred = json.load(f)

# setup base twitter search query
#search_terms = 'artificial+intelligence OR machine+learning'
#search_start_date = '2019-10-13'
# add filters to search criteria
#filtered_search_terms = search_terms + " -filter:retweets"
# number of tweets to return
#num_tweets = 10000

# query
#base_tw_query = {
#    'q':filtered_search_terms,
#    'since':search_start_date,
#    'count':num_tweets,
#    'lang':'en',  
#}

In [10]:
player = 'deshaun_watson'
#tweets_date_start='2019-11-04'
max_tweets=1000
corpusPath = f'{corpusDir}/{player}/{collection_date}'
if not os.path.exists(corpusPath): os.makedirs(corpusPath)
if not os.path.exists(f'{corpusPath}/{player}_tweet_text.txt'): open(f'{corpusPath}/{player}_tweet_text.txt', 'a').close()
if not os.path.exists(f'{corpusPath}/{player}_tweet_raw.txt'): open(f'{corpusPath}/{player}_tweet_raw.txt', 'a').close()


tweet_filename=f'{corpusPath}/{player}_tweet_text.txt'
raw_filename=f'{corpusPath}/{player}_tweet_raw.txt'
#search_filter='deshaun watson'

try:

    l = StdOutListener(logger,max_tweets,tweet_filename,raw_filename)
    auth = OAuthHandler(tw_cred['CONSUMER_KEY'], tw_cred['CONSUMER_SECRET'])
    auth.set_access_token(tw_cred['ACCESS_TOKEN'], tw_cred['ACCESS_SECRET'])

    stream = Stream(auth, l)
    stream.filter(track=['deshaun watson'], is_async=True)
    
except OSError as ose:
    logger.error(f'**ERROR** caught exception: {ose}')
    # try connecting again
    l = StdOutListener(logger,max_tweets,tweet_filename,raw_filename)
    auth = OAuthHandler(tw_cred['CONSUMER_KEY'], tw_cred['CONSUMER_SECRET'])
    auth.set_access_token(tw_cred['ACCESS_TOKEN'], tw_cred['ACCESS_SECRET'])

    stream = Stream(auth, l)
    stream.filter(track=['deshaun watson'], is_async=True)
    
    
    
    
    
    

on_data: Tweet Count: 1
Writing Tweet Text to file...
Dumping Raw Tweet to file...
on_data: Tweet Count: 2
Writing Tweet Text to file...
Dumping Raw Tweet to file...
on_data: Tweet Count: 3
Writing Tweet Text to file...
Dumping Raw Tweet to file...
on_data: Tweet Count: 4
Writing Tweet Text to file...
Dumping Raw Tweet to file...
on_data: Tweet Count: 5
Writing Tweet Text to file...
Dumping Raw Tweet to file...
on_data: Tweet Count: 6
Writing Tweet Text to file...
Dumping Raw Tweet to file...
on_data: Tweet Count: 7
--- Logging error ---
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\logging\__init__.py", line 1037, in emit
    stream.write(msg + self.terminator)
  File "C:\ProgramData\Anaconda3\lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 65-68: character maps to <undefined>
Call stack:
  File "C:\ProgramData\Anaconda3\l

on_data: Tweet Count: 33
--- Logging error ---
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\logging\__init__.py", line 1037, in emit
    stream.write(msg + self.terminator)
  File "C:\ProgramData\Anaconda3\lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 104-105: character maps to <undefined>
Call stack:
  File "C:\ProgramData\Anaconda3\lib\threading.py", line 885, in _bootstrap
    self._bootstrap_inner()
  File "C:\ProgramData\Anaconda3\lib\threading.py", line 917, in _bootstrap_inner
    self.run()
  File "C:\ProgramData\Anaconda3\lib\threading.py", line 865, in run
    self._target(*self._args, **self._kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tweepy\streaming.py", line 289, in _run
    self._read_loop(resp)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tweepy\streaming.py", line 351, in _read_loop

Dumping Raw Tweet to file...
on_data: Tweet Count: 45
--- Logging error ---
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\logging\__init__.py", line 1037, in emit
    stream.write(msg + self.terminator)
  File "C:\ProgramData\Anaconda3\lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 73-74: character maps to <undefined>
Call stack:
  File "C:\ProgramData\Anaconda3\lib\threading.py", line 885, in _bootstrap
    self._bootstrap_inner()
  File "C:\ProgramData\Anaconda3\lib\threading.py", line 917, in _bootstrap_inner
    self.run()
  File "C:\ProgramData\Anaconda3\lib\threading.py", line 865, in run
    self._target(*self._args, **self._kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tweepy\streaming.py", line 289, in _run
    self._read_loop(resp)
  File "C:\ProgramData\Anaconda3\lib\site-packages\tweepy\streaming.p

Dumping Raw Tweet to file...
on_data: Tweet Count: 94
Writing Tweet Text to file...
Dumping Raw Tweet to file...
on_data: Tweet Count: 95
Writing Tweet Text to file...
Dumping Raw Tweet to file...
on_data: Tweet Count: 96
Writing Tweet Text to file...
Dumping Raw Tweet to file...
on_data: Tweet Count: 97
Writing Tweet Text to file...
Dumping Raw Tweet to file...
on_data: Tweet Count: 98
Writing Tweet Text to file...
Dumping Raw Tweet to file...
on_data: Tweet Count: 99
Writing Tweet Text to file...
Dumping Raw Tweet to file...
on_data: Tweet Count: 100
Writing Tweet Text to file...
Dumping Raw Tweet to file...
on_data: Tweet Count: 101
--- Logging error ---
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\logging\__init__.py", line 1037, in emit
    stream.write(msg + self.terminator)
  File "C:\ProgramData\Anaconda3\lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' cod