# Connect and harvest data from Twitter

* Logging, using the Python `logging` library, to record any errors or warnings in case of program failure.
* Data persistence, using MongoDB through the `IO_Mongo` class as well as JSON files through the `IO_json` class.
* API rate-limit and error management, so that calls to Twitter are more resilient and we do not get barred for tapping into the firehose.

In [19]:
import os, io, json
import logging
import twitter
import urlparse
import pandas as pd
from pymongo import MongoClient as MCli

# Load the Twitter API keys from a CSV file into a DataFrame.
# TwitterAPI.__init__ reads the flattened values by position:
# consumer_key, consumer_secret, access_token, access_secret.
# NOTE(review): the recorded column labels show a leading space in
# ' consumer_secret'; presumably the CSV has a space after a comma, in which
# case the secret value may carry the same space -- confirm, and consider
# `skipinitialspace=True`.
twitter_tokens = pd.read_csv("../twitter_tokens.csv")
# Display the column labels (notebook cell output).
twitter_tokens.keys()

Index([u'consumer_key', u' consumer_secret', u'access_token',
       u'access_secret'],
      dtype='object')

### Create JSON I/O

In [None]:
class IO_json(object):
    """Persist JSON-serialisable data in ``<filepath>/<filename>.json``.

    Every call to :meth:`save` appends one serialised JSON document to the
    file; :meth:`load` returns the raw text of the whole file.
    """

    def __init__(self, filepath, filename):
        """Remember where the JSON file lives.

        Args:
            filepath: Directory that holds the file.
            filename: File name without the ``.json`` extension.
        """
        self.filepath = filepath
        self.filename = filename
        self.file_io = os.path.join(filepath, filename)

    def _json_path(self):
        """Return the path of the backing ``.json`` file."""
        return '{0}/{1}.json'.format(self.filepath, self.filename)

    def save(self, data):
        """Serialise ``data`` to JSON and append it to the file.

        The file (and its directory, if missing) is created on first use.

        Args:
            data: Any object accepted by ``json.dumps``.

        NOTE(review): successive documents are written back to back with no
        separator, so a file holding several saves is not a single valid JSON
        document.
        """
        # `unicode` only exists on Python 2; on Python 3 `str` is already text.
        try:
            text_type = unicode  # noqa: F821 (Python 2)
        except NameError:
            text_type = str

        path = self._json_path()
        folder = os.path.dirname(path)
        if folder and not os.path.isdir(folder):
            os.makedirs(folder)

        # Mode 'a' appends to an existing file and creates a missing one, so
        # no separate "file exists" branch is needed.
        with io.open(path, 'a', encoding='utf-8') as f:
            f.write(text_type(json.dumps(data, ensure_ascii=False)))

    def load(self):
        """Return the full content of the JSON file as text.

        Raises:
            IOError: If the file does not exist (``FileNotFoundError`` on
                Python 3).
        """
        with io.open(self._json_path(), encoding='utf-8') as f:
            return f.read()
            
            

### Test the `IO_json` class

In [68]:
# Sample JSON document (as text) used to exercise IO_json
sample_json = '''{
"Country":
    {"0":"Algeria","1":"Angola","2":"Benin","3":"Botswana","4":"Burkina"},
"Region":
 {"0":"AFRICA","1":"AFRICA","2":"AFRICA","3":"AFRICA","4":"AFRICA"}
}'''

# Append the document to json_data/json_test_file.json (the file is created
# if it does not exist yet). The text is parsed first: IO_json.save() runs
# json.dumps() on its argument, so passing the raw string would store a
# quoted, escaped JSON string instead of a JSON object.
jsn = IO_json('json_data', 'json_test_file')
jsn.save(json.loads(sample_json))

# Read the raw file content back
# jsn.load()

### Create MongoDB Connections

In [69]:
class IO_Mongo(object):
    """Save to and load from a single MongoDB collection.

    By default the client connects to the server on localhost, port 27017.
    """

    # Default MongoClient keyword arguments; callers can override any of them
    # through the **conn arguments of __init__.
    conn = {'host': 'localhost', 'port': 27017}

    def __init__(self, db='twtr_db', coll='twtr_coll', **conn):
        """Connect to the MongoDB server and select database and collection.

        Args:
            db: Name of the database to use.
            coll: Name of the collection inside ``db``.
            **conn: Keyword arguments for ``MongoClient``; they take
                precedence over the class-level defaults in ``conn``.
        """
        settings = dict(self.conn, **conn)
        self.client = MCli(**settings)
        self.db = self.client[db]
        self.coll = self.db[coll]

    def save(self, data):
        """Insert ``data`` into the collection.

        ``Collection.insert`` is deprecated since PyMongo 3.0 and removed in
        4.0, so the insert is routed to ``insert_one`` / ``insert_many``.

        Args:
            data: One document (dict) or a list/tuple of documents.

        Returns:
            The inserted ``_id`` for a single document, or the list of
            inserted ``_id`` values for several documents.
        """
        if isinstance(data, (list, tuple)):
            return self.coll.insert_many(list(data)).inserted_ids
        return self.coll.insert_one(data).inserted_id

    def load(self, return_cursor=False, criteria=None, projection=None):
        """Retrieve the records matching ``criteria``.

        Args:
            return_cursor: If true, return the cursor itself (useful for
                large result sets) instead of a list.
            criteria: Query filter; ``None`` selects every record.
            projection: Optional field selection passed to ``find``.

        Returns:
            The cursor when ``return_cursor`` is true, otherwise a list with
            all matching records.
        """
        if criteria is None:
            criteria = {}

        if projection is None:
            cursor = self.coll.find(criteria)
        else:
            cursor = self.coll.find(criteria, projection)

        return cursor if return_cursor else list(cursor)
        
        

### Initialize by instantiating the `Twitter API` with our credentials

In [78]:

class TwitterAPI(object):
    """
        TwitterAPI class allows the connection to Twitter via OAuth
        once you have registered with Twitter and received the
        necessary credentials.

        Collected tweets are persisted both to a JSON file and to MongoDB,
        and activity is logged to a file.
    """

    def __init__(self):
        """Read the credentials, authenticate and set up logging/persistence."""
        # The credentials come from the module-level `twitter_tokens`
        # DataFrame and are read by position from its flattened values.
        tokens = twitter_tokens.values.flatten()
        consumer_key, consumer_secret, access_token, access_secret = tokens[:4]

        self.consumer_key = consumer_key
        self.consumer_secret = consumer_secret
        self.access_token = access_token
        self.access_secret = access_secret

        # Number of retries
        self.retries = 3

        # Authenticate credentials with Twitter using OAuth
        self.auth = twitter.oauth.OAuth(access_token, access_secret,
                                        consumer_key, consumer_secret)

        # Create the registered Twitter API client
        self.api = twitter.Twitter(auth=self.auth)

        # Logger initialization.
        # Available calls: logger.debug(msg), logger.info(msg),
        # logger.warn(msg), logger.critical(msg)
        appName = 'twt150530'
        self.logger = logging.getLogger(appName)

        # logging.getLogger returns the same logger object for the same name,
        # so the file handler is attached only once; otherwise every new
        # TwitterAPI instance would duplicate each log line.
        if not self.logger.handlers:
            logPath = './log_data'
            fileName = appName
            fileHandler = logging.FileHandler("{0}/{1}.log".format(logPath, fileName))
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            fileHandler.setFormatter(formatter)
            self.logger.addHandler(fileHandler)
        self.logger.setLevel(logging.DEBUG)

        # Initialize the JSON file persistence
        jsonF_path = './json_data'
        jsonF_name = 'twt15053001'
        self.jsonSaver = IO_json(jsonF_path, jsonF_name)

        # Initialize the MongoDB database and collection for persistence
        self.mongoSaver = IO_Mongo(db='twtr01_db', coll='twtr01_coll')

    def searchTwitter(self, q, max_res=10, **kwargs):
        """Search Twitter for ``q`` and persist every page of results.

        Args:
            q: Search query (e.g. "ApacheSpark").
            max_res: Stop fetching once at least this many tweets have been
                collected (capped at 1000). Whole pages are kept, so the
                result may hold slightly more tweets than ``max_res``.
            **kwargs: Extra parameters for the search call; ``count``
                defaults to 10 when not given.

        Returns:
            The list of collected statuses. At most 10 follow-up pages are
            requested after the first one.
        """
        # `urlparse` was merged into `urllib.parse` in Python 3.
        try:
            from urllib.parse import parse_qsl
        except ImportError:
            from urlparse import parse_qsl

        kwargs.setdefault('count', 10)
        search_results = self.api.search.tweets(q=q, **kwargs)
        # Copy the list so that extending it does not alter the API response.
        statuses = list(search_results['statuses'])
        # The first page must be persisted as well, like the following ones.
        self.saveTweets(statuses)
        max_results = min(1000, max_res)

        for _ in range(10):
            # Check before fetching so no extra page is requested once the
            # limit has been reached.
            if len(statuses) >= max_results:
                self.logger.info('info in searchTwitter - got {} tweets - max: {}'.format(len(statuses), max_results))
                break

            try:
                next_results = search_results['search_metadata']['next_results']
            except KeyError as e:
                # No 'next_results' entry: there is no further page to fetch.
                self.logger.error("Error in searchTwitter: {}".format(e))
                break

            # 'next_results' is a query string starting with '?'; turn it
            # into the keyword arguments of the next call.
            kwargs = dict(parse_qsl(next_results[1:]))

            search_results = self.api.search.tweets(**kwargs)
            page = search_results['statuses']
            statuses.extend(page)
            self.saveTweets(page)

        return statuses

    def saveTweets(self, statuses):
        """Save the collected tweets in the JSON file and in MongoDB.

        Args:
            statuses: List of statuses as returned by the search call.
        """
        # Saving to JSON file
        self.jsonSaver.save(statuses)

        # Saving to MongoDB, one document per status
        for s in statuses:
            self.mongoSaver.save(s)

    def parseTweets(self, statuses):
        """Extract the key fields of each tweet, one row per URL it contains.

        Args:
            statuses: List of status dicts.

        Returns:
            A list of tuples ``(id, created_at, user id, user name, text,
            expanded_url)``. A status without any URL in
            ``entities['urls']`` produces no row.
        """
        tweetx = [(status['id'],
                   status['created_at'],
                   status['user']['id'],
                   status['user']['name'],
                   # 'text' holds the tweet text itself, not a nested mapping.
                   status['text'],
                   url['expanded_url'])
                  for status in statuses
                  for url in status['entities']['urls']]
        return tweetx

    def getTweets(self, q, max_res=10):
        """Placeholder for a rate-limit aware wrapper around searchTwitter.

        Not implemented yet: the method currently does nothing and returns
        None.
        TODO: call searchTwitter while handling API errors and rate limits.
        """
        pass
    

In [79]:
# Scratch cell: build a DataFrame from a list of (label, value) tuples and display it.
pd.DataFrame([('a',1), ('b',5), ('c',3)])

Unnamed: 0,0,1
0,a,1
1,b,5
2,c,3
