In [1]:
import json
import datetime
import pytz
from tzwhere import tzwhere
tzwhere_ = tzwhere.tzwhere()
import pandas as pd

## Things i want to save:
* timestamp
* geo location
* retweet yes/no
    * retweeted from whom?
* reply yes/no 
    * replied to whom?
* tweet text
* has url?
* has media attached?
* has hashtag attached?

In [16]:
def check_hashtag(single_tweet):
    '''check whether tweet has any hashtags'''
    return 'hashtags' in single_tweet['entities'].keys()

def check_media(single_tweet):
    '''check whether tweet has any media attached'''
    return 'media' in single_tweet['entities'].keys()

def check_url(single_tweet):
    '''check whether tweet has any urls attached'''
    return 'urls' in single_tweet['entities'].keys()

def check_retweet(single_tweet):
    '''
    check whether tweet is a RT. If yes:
    return name & user name of the RT'd user.
    otherwise just return nones
    '''
    if 'retweeted_status' in single_tweet.keys():
        return (single_tweet['retweeted_status']['user']['screen_name'],
                single_tweet['retweeted_status']['user']['name'])
    else:
        return (None,None)
    
def check_coordinates(single_tweet):
    '''
    check whether tweet has coordinates attached.
    if yes return the coordinates
    otherwise just return nones
    '''
    if 'coordinates' in single_tweet['geo'].keys():
        return (single_tweet['geo']['coordinates'][0],
                single_tweet['geo']['coordinates'][1])
    else:
        return (None,None)

def check_reply_to(single_tweet):
    '''
    check whether tweet is a reply. If yes:
    return name & user name of the user that's replied to.
    otherwise just return nones
    '''
    if 'in_reply_to_screen_name' in single_tweet.keys():
        name = None
        for user in single_tweet['entities']['user_mentions']:
            if user['screen_name'] == single_tweet['in_reply_to_screen_name']:
                name = user['name']
                break
        return (single_tweet['in_reply_to_screen_name'],name)
    else:
        return (None,None)

def convert_time(coordinates,time_utc):
    '''
    Does this tweet have a geo location? if yes 
    we can easily convert the UTC timestamp to true local time!
    otherwise return nones
    '''
    if coordinates[0] and coordinates[1]:
        timezone_str = tzwhere_.tzNameAt(coordinates[0],coordinates[1])
        timezone = pytz.timezone(timezone_str)
        time_obj_local = datetime.datetime.astimezone(time_utc,timezone)
        return time_obj_local
    
def create_dataframe(tweets):
    '''
    create a pandas dataframe from our tweet jsons
    '''
    
    # initalize empty lists
    utc_time = []
    longitude = []
    latitude = []
    local_time = []
    hashtag = []
    media = []
    url = []
    retweet_user_name = []
    retweet_name = []
    reply_user_name = []
    reply_name = []
    text = []
    # iterate over all tweets and extract data
    for single_tweet in tweets:
        utc_time.append(datetime.datetime.strptime(single_tweet['created_at'],'%Y-%m-%d %H:%M:%S %z'))
        coordinates = check_coordinates(single_tweet)
        latitude.append(coordinates[0])
        longitude.append(coordinates[1])
        local_time.append(convert_time(coordinates,utc_time[-1]))
        hashtag.append(check_hashtag(single_tweet))
        media.append(check_media(single_tweet))
        url.append(check_url(single_tweet))
        retweet = check_retweet(single_tweet)
        retweet_user_name.append(retweet[0])
        retweet_name.append(retweet[1])
        reply = check_reply_to(single_tweet)
        reply_user_name.append(reply[0])
        reply_name.append(reply[1])
        text.append(single_tweet['text'])
    # convert the whole shebang into a pandas dataframe
    dataframe = pd.DataFrame(data= {
                    'utc_time' : utc_time,
                    'local_time' : local_time,
                    'latitude' : latitude,
                    'longitude' : longitude,
                    'hashtag' : hashtag,
                    'media' : media,
                    'url' : url,
                    'retweet_user_name' : retweet_user_name,
                    'retweet_name' : retweet_name,
                    'reply_user_name' : reply_user_name,
                    'reply_name' : reply_name,
                    'text' : text
    })
    return dataframe

In [14]:
with open('2017_04.js') as f:
    d = f.readlines()[1:]
    d = "".join(d)
    tweets = json.loads(d)

tweet_df = create_dataframe(tweets)

In [15]:
tweet_df

Unnamed: 0,hashtag,latitude,local_time,longitude,media,reply_name,reply_user_name,retweet_name,retweet_user_name,text,url,utc_time
0,True,43.679166,2017-04-30 17:05:19-04:00,-79.611511,True,,,,,Who’ll be stuck for a night ORD for a night be...,True,2017-04-30 21:05:19+00:00
1,True,,,,True,,,Scholarly Commons,ScholrlyCommons,RT @ScholrlyCommons: Just launced the https://...,True,2017-04-30 17:02:14+00:00
2,True,43.642920,2017-04-30 12:48:53-04:00,-79.384042,True,⓪ Grⓐhⓐm Steel 🔬🎓,McDawg,,,@McDawg @MsPhelps @jeroenbosman https://t.co/c...,True,2017-04-30 16:48:53+00:00
3,True,43.642904,2017-04-30 12:44:39-04:00,-79.383998,True,,,,,Hear @MsPhelps and @jeroenbosman talk about th...,True,2017-04-30 16:44:39+00:00
4,True,43.643080,2017-04-29 17:33:29-04:00,-79.383782,True,Nyborg,ny_borg,,,@ny_borg I hear you https://t.co/JnUo43jh54 :D,True,2017-04-29 21:33:29+00:00
5,True,43.643096,2017-04-29 17:25:17-04:00,-79.383810,True,Nyborg,ny_borg,,,"@ny_borg George, not John :)",True,2017-04-29 21:25:17+00:00
6,True,,,,True,,,Jeroen Bosman,jeroenbosman,RT @jeroenbosman: Come play open science accor...,True,2017-04-29 20:38:51+00:00
7,True,,,,True,,,Kirsty Franks,kirstyrachel,RT @kirstyrachel: Collaborative academic publi...,True,2017-04-29 20:38:39+00:00
8,True,,,,True,,,Open Culture,openculture,RT @openculture: The First 100 Days of Fascist...,True,2017-04-29 20:04:50+00:00
9,True,,,,True,,,Biⓐnca Kramer,MsPhelps,RT @MsPhelps: #ccsummit Come and join us for a...,True,2017-04-29 18:21:26+00:00
