# Twitter Goggles Lite

Query the Twitter Search API and return a structured table of results.

In [None]:
import numpy as np 
import pandas as pd

import argparse, collections, configparser, io, json, math, mysql.connector as sql, os, requests, sys, time
from datetime import datetime
from mysql.connector import errorcode
from requests import HTTPError
from requests import ConnectionError
from requests_oauthlib import OAuth1

The following script queries Twitter for tweets containing the names of 50 US cities in the body of the text, as well as tweets within a 40 km radius of each city. The 50 cities are listed in the `description` column of the `job.csv` file below, and the query requests are in the `query` column. 

In [None]:
# Load the job table; the collection loop further down reads its 'job_id'
# and 'query' columns, one search per row.
job = pd.read_csv('data/job.csv')
# Preview the first rows (rendered as the cell output).
job.head()

In order to use the Search API, Twitter requires you to authenticate. Be sure to fill in the variables below with your Twitter credentials, then run the cell.

In [None]:
# Twitter API credentials.
#
# Secrets must never be committed with the notebook, so each value is read
# from an environment variable. If a variable is not set, the placeholder is
# used instead; replace it locally with your own credential (and do not commit
# the result).
# NOTE(review): any keys that were previously hard-coded in this cell should be
# treated as compromised and rotated in the Twitter developer console.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'YOUR_CONSUMER_KEY')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'YOUR_CONSUMER_SECRET')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'YOUR_ACCESS_TOKEN')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', 'YOUR_ACCESS_TOKEN_SECRET')

# OAuth 1 signer handed to every request made to the Search API.
oauth = OAuth1(
    client_key=consumer_key,
    client_secret=consumer_secret,
    resource_owner_key=access_token,
    resource_owner_secret=access_token_secret,
)

To make a request to Twitter, we define a function that queries Twitter and returns a json object that we will then parse into dictionaries for the attributes that we are interested in.

In [None]:
def search(query, oauth, timeout=30, max_attempts=3) :
    """Query the Twitter Search API and return the decoded JSON response.

    Args:
        query: URL query string appended verbatim to the search endpoint,
            e.g. "q=...". It is not URL-encoded here.
        oauth: Auth object passed to ``requests.get`` via ``auth=``.
        timeout: Seconds to wait for the server before the attempt is
            counted as failed. Without a timeout a stalled connection
            would block forever and never reach the retry logic.
        max_attempts: Number of attempts before giving up.

    Returns:
        The decoded JSON payload, or ``None`` when every attempt failed.

    Note:
        ``requests.get`` does not raise ``HTTPError`` for 4xx/5xx responses
        unless ``raise_for_status()`` is called, so an error payload from
        Twitter is decoded and returned to the caller like any other body.
    """
    print("Query: " + query)

    url = "https://api.twitter.com/1.1/search/tweets.json?" + query

    for attempt in range(1, max_attempts + 1) :
        try :
            r = requests.get(url, auth=oauth, timeout=timeout)
            return json.loads(r.text)

        # requests.Timeout is listed explicitly because a read timeout is not
        # a subclass of requests' ConnectionError.
        except (ConnectionError, HTTPError, requests.Timeout) :
            # Exponential back-off: 1s, 2s, 4s, ...
            sleep_time = 2**(attempt - 1)
            print("Connection attempt " + str(attempt) + " failed. "
                "Sleeping for " + str(sleep_time) + " second(s).")
            time.sleep(sleep_time)

    print("***** Error: Unable to query Twitter. Terminating.")
    return None

The function below ensures that our query begins with "q=", as this is the format Twitter requires. All of the jobs in the job table already start with "q=", so this acts as a fail-safe. 

In [None]:
def getFullQuery(query) :
    """Return `query` guaranteed to start with the "q=" search parameter.

    A query that already begins with "q=" is returned unchanged; otherwise
    "q=" is prepended.
    """
    prefix = "q="
    return query if query.startswith(prefix) else prefix + query

And now we structure our data! 

In [None]:
def structureTweetTable(tweet, job):
    """Flatten a search response into one dictionary per tweet.

    Args:
        tweet: Decoded search response; its 'statuses' entry is iterated.
        job: Value stored under 'job_id' in every returned dictionary.

    Returns:
        A list of dictionaries, one per status, in response order. The three
        'location_geo*' keys are only present for statuses whose 'geo' entry
        is a "Point".
    """
    twitter_format = '%a %b %d %H:%M:%S +0000 %Y'
    table_format = '%Y-%m-%d %H:%M:%S'

    def reformat(stamp):
        # Re-render a Twitter timestamp as 'YYYY-MM-DD HH:MM:SS'.
        return datetime.strptime(stamp, twitter_format).strftime(table_format)

    rows = []

    for status in tweet['statuses']:
        # Key order is kept deliberately: it determines the column order of
        # any DataFrame built from these records.
        record = {
            'tweet_id_str': status["id_str"],
            'job_id': job,
            'created_at': reformat(status["created_at"]),
            'text': status["text"],
            'from_user': status["user"]["id"],
            'from_user_id_str': status["user"]["id_str"],
            'from_user_name': status["user"]["screen_name"],
            'from_user_fullname': status["user"]["name"],
            'from_user_created_at': reformat(status["user"]["created_at"]),
            'from_user_followers': status["user"]["followers_count"],
            'from_user_following': status["user"]["friends_count"],
            'from_user_favorites': status["user"]["favourites_count"],
            'from_user_tweets': status["user"]["statuses_count"],
            'from_user_timezone': status["user"]["time_zone"],
            'to_user': status["in_reply_to_user_id"],
            'to_user_id_str': status["in_reply_to_user_id_str"],
            'to_user_name': status["in_reply_to_screen_name"],
            'source': status["source"],
            'iso_language': status["metadata"]["iso_language_code"],
        }

        geo = status['geo']
        if geo is not None and geo['type'] == "Point":
            first = geo["coordinates"][0]
            second = geo["coordinates"][1]
            record['location_geo'] = 'Point({},{})'.format(first, second)
            record['location_geo_0'] = '{}'.format(first)
            record['location_geo_1'] = '{}'.format(second)

        rows.append(record)

    return rows
        

And now we put it all together and collect tweets from each job once. Each job run becomes a data frame that will then be appended to the list object `frames`.

In [None]:
# Run every job once and keep one DataFrame of structured tweets per job.
frames = []
for _, row in job.iterrows():

    job_id = row['job_id']

    q = getFullQuery(row['query'])
    results = search(q,oauth)
    try:
        f = pd.DataFrame(structureTweetTable(results,job_id))
    # Only the errors a failed or malformed response produces are treated as
    # "no results": TypeError when search() gave up and returned None,
    # KeyError when the payload lacks 'statuses' or a tweet field, ValueError
    # when a timestamp does not parse. Anything else (including
    # KeyboardInterrupt) propagates instead of being silently swallowed.
    except (KeyError, TypeError, ValueError):
        print("   no results returned for '{}'".format(row['query']))
    else:
        frames.append(f)

    

In [None]:
# Stack the per-job frames into a single table. ignore_index=True renumbers
# the rows 0..n-1; without it every job's rows keep their own 0-based labels,
# so the same index label would repeat across jobs.
frame = pd.concat(frames, ignore_index=True)

### Hashtag Example

In [None]:
# Your Code (create as many cells as you need)
# --------------------------------------------
# Build a (tweet_id, hashtag, job_id) table with one row per distinct hashtag
# per tweet.
hashtag_columns = ['tweet_id', 'hashtag', 'job_id']

hashtag_rows = []
for _, row in frame.iterrows():
    # Whitespace-separated tokens that start with '#', with the '#' stripped;
    # the set removes repeats within a single tweet.
    tags = {word.strip("#") for word in row['text'].split() if word.startswith("#")}
    # A token made only of '#' characters strips down to an empty string.
    tags.discard("")
    # Sort so the row order is reproducible (set iteration order is not).
    for tag in sorted(tags):
        hashtag_rows.append({
            # Use the tweet's own id rather than the DataFrame row label,
            # which is not a tweet identifier.
            'tweet_id': row['tweet_id_str'],
            'hashtag': tag,
            'job_id': row['job_id'],
        })

# Kept as a list of frames so it can still be passed to pd.concat; passing the
# columns explicitly keeps the schema even when no hashtags were found.
alls = [pd.DataFrame(hashtag_rows, columns=hashtag_columns)]
    



In [None]:
# Combine the hashtag frames and renumber the rows 0..n-1, discarding the
# old index instead of keeping it as a column.
pd.concat(alls).reset_index(drop=True)

In [None]:
# Display the combined tweet table (rendered as the cell output).
frame