# Connect and harvest data from Twitter

In [1]:
import twitter
import urlparse
import pandas as pd

# parallel print
from pprint import pprint as pp

In [2]:
# Load the twitter API keys
twitter_tokens = pd.read_csv("../twitter_tokens.csv")
twitter_tokens.keys()

Index([u'consumer_key', u' consumer_secret', u'access_token',
       u'access_secret'],
      dtype='object')

In [3]:

class TwitterAPI(object):
    """
        TwitterAPI class allows the Connection to Twitter via OAuth
        once you have registered with Twitter and receive the
        necessary credentials.
    """
    # Initialize key variables and get the twitter credentials
    def __init__(self):
        consumer_key = twitter_tokens.values.flatten()[0]
        consumer_secret = twitter_tokens.values.flatten()[1]
        access_token = twitter_tokens.values.flatten()[2]
        access_secret = twitter_tokens.values.flatten()[3]
        
        self.consumer_key = consumer_key
        self.consumer_secret = consumer_secret
        self.access_token = access_token
        self.access_secret = access_secret
        
    # Authenticate credentials with Twitter using OAuth
        self.auth = twitter.oauth.OAuth(access_token, access_secret, 
                                        consumer_key, consumer_secret)
        
        
    # Create registered Twitter API
        self.api = twitter.Twitter(auth=self.auth)
        
        
    # Search Twitter with query q (i.e "ApacheSpark") and max result
    def searchTwitter(self, q, max_res=10, **kwargs):
        search_results = self.api.search.tweets(q=q, count=10, **kwargs)
        statuses = search_results['statuses']
        max_results = min(1000, max_res)
        
        for _ in range(10):
            try:
                next_results = search_results['search_metadata']['next_results']
            except KeyError as e:
                break
            
            next_results = urlparse.parse_qsl(next_results[1:])
            kwargs = dict(next_results)
            
            search_results = self.api.search.tweets(**kwargs)
            statuses += search_results['statuses']
            
            if len(statuses) > max_results:
                break
            
        return statuses
    
    
    
    # Parse tweets as it is collected to extract ID, creation date, userID, tweet text
    def parseTweets(self, statuses):
        tweetx = [(status['id'],
                   status['created_at'],
                   status['user']['id'],
                   status['user']['name'],
                   url['expanded_url'],
                   status['text']) 
                    for status in statuses 
                      for url in status['entities']['urls']
                 ]
        return tweetx
    

In [4]:
# Instantiate the class with the required authentication
obj = TwitterAPI()

In [5]:
# Run a query on the search tern
twtx = obj.searchTwitter("ApacheSpark")

# Parse the tweets
parsed_tweetx = obj.parseTweets(twtx)

In [37]:
# Display output of parsed tweets
print("Lenth of parsed tweets: {} \n\n".format(len(parsed_tweetx)))

# Serialize the data into CSV
csv_fields = ['id', 'created_at', 'user_id', 'user_name', 'tweet_text', 'url']
tweets_2frames = pd.DataFrame(parsed_tweetx, columns=csv_fields)
tweets_2frames.to_csv("tweets.csv", encoding='utf-8')

# Display first 3 rows
tweets_2frames.ix[:2]

Lenth of parsed tweets: 18 




Unnamed: 0,id,created_at,user_id,user_name,tweet_text,url
0,717857494886912002,Wed Apr 06 23:32:18 +0000 2016,2809658384,AmazingOpenSource,http://snip.ly/wkqtc,RT @Talend: Why #ApacheSpark is Critical to #D...
1,717857388515164162,Wed Apr 06 23:31:53 +0000 2016,1014383496,Steve Tranchida,http://bit.ly/1V6Lxj4,RT @hortonworks: Learn more about #ApacheSpark...
2,717857385834872832,Wed Apr 06 23:31:52 +0000 2016,114666114,Ric,http://bit.ly/1V6Lxj4,RT @hortonworks: Learn more about #ApacheSpark...


In [39]:

tweets_2frames.url[2]

u'RT @hortonworks: Learn more about #ApacheSpark using Scala in this online training class -  https://t.co/GR8Bp7hOvc'