In [1]:
from glob import glob
import json
import re
import yaml
import os
import shutil
import time
os.chdir('../')

from app.api.couch_db import TweetsDB, TwitterUsersDB
from app.api.task_helpers import GeoAnalyser, SentimentAnalyser, UsersAnalyser, TopicsAnalyser

with open("config.yaml", 'r') as ymlfile:
    cfg = yaml.load(ymlfile)
    ymlfile.close()

queues_cfg = cfg['QUEUES']
tweets_queue_path = queues_cfg['new_tweets']
processed_tweets_path = queues_cfg['processed_tweets']
sa_queue_path = queues_cfg['sentiment_tasks']
geo_queue_path = queues_cfg['geo_tasks']
users_tasks_path = queues_cfg['user_tweets_tasks']
topic_tasks_path = queues_cfg['topic_tasks']

# Process Downloaded Tweets Queue

In [3]:
def main():
    #os.chdir('../')
    with open("config.yaml", 'r') as ymlfile:
        cfg = yaml.load(ymlfile)
        ymlfile.close()

    queues_cfg = cfg['QUEUES']
    tweets_queue_path = queues_cfg['new_tweets']
    processed_tweets_path = queues_cfg['processed_tweets']
    sa_queue_path = queues_cfg['sentiment_tasks']
    geo_queue_path = queues_cfg['geo_tasks']
    users_tasks_path = queues_cfg['user_tweets_tasks']
    topic_tasks_path = queues_cfg['topic_tasks']


    def complete_task(tweet, path):
        filename = path.split('/')[-1]
        #new_path = '{}/{}'.format(processed_tweets_path, filename)
        #shutil.(path, new_path)
        os.remove(path)

    analysers = [
                    GeoAnalyser(geo_queue_path),
                    SentimentAnalyser(sa_queue_path),
                    UsersAnalyser(users_tasks_path, cfg),
                    TopicsAnalyser(topic_tasks_path)
                ]
    geo_analyser = analysers[0]
    couch_db = TweetsDB(cfg['COUCHDB'])
        

    def tweet_within_bounding_box(tweet_data):
        if tweet_data['coordinates'] is not None:
            coordinates = tweet_data['coordinates']
        elif tweet_data['place'] is not None:
            coordinates = tweet_data['place']['bounding_box']
        else:
            return False        
        return geo_analyser.australia_check(coordinates['coordinates'])


    i = 1
    while True:
        unprocessed_tweets = glob('{}/*.json'.format(tweets_queue_path))[:1000]
        for path in unprocessed_tweets:
            try:
                with open(path, 'r') as fp:
                    tweet_json = json.load(fp)
                    
                    if tweet_within_bounding_box(tweet_json):
                        couch_db.save_tweet(tweet_json)
                        tweet_id = tweet_json['id_str']

                        for analyser in analysers:
                            analyser.append_task(tweet_id, tweet_json)
                    else:
                        print('Tweet {} is outside of Australia. Skipping it.'.format(tweet_id))
                    complete_task(tweet_json, path)
            except Exception as e:
                print('Enexpected Error: {}'.format(e))

        print('Iteration: {}\tFiles processed: {}'.format(i, len(unprocessed_tweets)))
        i+=1
        time.sleep(5)

if __name__ == "__main__":
    main()

Enexpected Error: local variable 'tweet_id' referenced before assignment
New task was queued to ../shared_folder/geo_analyser_tasks_prod/982066088333602818.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/982066088333602818.task.txt
71489542 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/982066088333602818.task.txt
Tweet 982066088333602818 is outside of Australia. Skipping it.
Tweet 982066088333602818 is outside of Australia. Skipping it.
New task was queued to ../shared_folder/geo_analyser_tasks_prod/971333422600761345.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/971333422600761345.task.txt
925145424700125184 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/971333422600761345.task.txt
Tweet 971333422600761345 is outside of Australia. Skipping it.
New task was queued to ../shared_folder/geo_analyser_tasks_prod/914737713189814272.task.txt
New task was queue

New task was queued to ../shared_folder/geo_analyser_tasks_prod/808659443504349184.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/808659443504349184.task.txt
347395561 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/808659443504349184.task.txt
Tweet 808659443504349184 is outside of Australia. Skipping it.
Tweet 808659443504349184 is outside of Australia. Skipping it.
Tweet 808659443504349184 is outside of Australia. Skipping it.
Tweet 808659443504349184 is outside of Australia. Skipping it.
New task was queued to ../shared_folder/geo_analyser_tasks_prod/977830112446967808.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/977830112446967808.task.txt
888311376384409602 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/977830112446967808.task.txt
Tweet 977830112446967808 is outside of Australia. Skipping it.
Tweet 977830112446967808 is outside of Australia. Skippi

New task was queued to ../shared_folder/geo_analyser_tasks_prod/756046848607911936.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/756046848607911936.task.txt
2282724978 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/756046848607911936.task.txt
Tweet 756046848607911936 is outside of Australia. Skipping it.
New task was queued to ../shared_folder/geo_analyser_tasks_prod/983920064498839552.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/983920064498839552.task.txt
888311376384409602 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/983920064498839552.task.txt
New task was queued to ../shared_folder/geo_analyser_tasks_prod/982434991782084608.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/982434991782084608.task.txt
223327949 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/982434991782084608.task.t

797947609 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/979625953264484352.task.txt
Tweet 979625953264484352 is outside of Australia. Skipping it.
Tweet 979625953264484352 is outside of Australia. Skipping it.
Tweet 979625953264484352 is outside of Australia. Skipping it.
Tweet 979625953264484352 is outside of Australia. Skipping it.
New task was queued to ../shared_folder/geo_analyser_tasks_prod/985116347351912449.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/985116347351912449.task.txt
424101271 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/985116347351912449.task.txt
Tweet 985116347351912449 is outside of Australia. Skipping it.
Tweet 985116347351912449 is outside of Australia. Skipping it.
New task was queued to ../shared_folder/geo_analyser_tasks_prod/981523528825544704.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/981523528825544704.task.txt
27

255408879 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/971519807119007745.task.txt
New task was queued to ../shared_folder/geo_analyser_tasks_prod/988599511000870912.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/988599511000870912.task.txt
321714801 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/988599511000870912.task.txt
Tweet 988599511000870912 is outside of Australia. Skipping it.
New task was queued to ../shared_folder/geo_analyser_tasks_prod/988951692589727744.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/988951692589727744.task.txt
53884325 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/988951692589727744.task.txt
New task was queued to ../shared_folder/geo_analyser_tasks_prod/989994576160219144.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/989994576160219144.task.txt
41587204

New task was queued to ../shared_folder/topic_analyser_tasks/982563977178693632.task.txt
Tweet 982563977178693632 is outside of Australia. Skipping it.
Tweet 982563977178693632 is outside of Australia. Skipping it.
Tweet 982563977178693632 is outside of Australia. Skipping it.
Tweet 982563977178693632 is outside of Australia. Skipping it.
New task was queued to ../shared_folder/geo_analyser_tasks_prod/989268404790902784.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/989268404790902784.task.txt
774554263829610496 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/989268404790902784.task.txt
Tweet 989268404790902784 is outside of Australia. Skipping it.
Tweet 989268404790902784 is outside of Australia. Skipping it.
New task was queued to ../shared_folder/geo_analyser_tasks_prod/985761466446786560.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/985761466446786560.task.txt
81915976 is known user. Skipping 

New task was queued to ../shared_folder/user_tweets_tasks/2784890414.task.txt
New task was queued to ../shared_folder/topic_analyser_tasks/882003863112265728.task.txt
Tweet 882003863112265728 is outside of Australia. Skipping it.
Tweet 882003863112265728 is outside of Australia. Skipping it.
New task was queued to ../shared_folder/geo_analyser_tasks_prod/935106149543591938.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/935106149543591938.task.txt
1923192949 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/935106149543591938.task.txt
Tweet 935106149543591938 is outside of Australia. Skipping it.
Tweet 935106149543591938 is outside of Australia. Skipping it.
Tweet 935106149543591938 is outside of Australia. Skipping it.
Tweet 935106149543591938 is outside of Australia. Skipping it.
New task was queued to ../shared_folder/geo_analyser_tasks_prod/968313933433057282.task.txt
New task was queued to ../shared_folder/sentiment_ta

New task was queued to ../shared_folder/geo_analyser_tasks_prod/988195349536763904.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/988195349536763904.task.txt
New task was queued to ../shared_folder/user_tweets_tasks/169810337.task.txt
New task was queued to ../shared_folder/topic_analyser_tasks/988195349536763904.task.txt
Tweet 988195349536763904 is outside of Australia. Skipping it.
New task was queued to ../shared_folder/geo_analyser_tasks_prod/830841782267105280.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/830841782267105280.task.txt
47901916 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/830841782267105280.task.txt
Tweet 830841782267105280 is outside of Australia. Skipping it.
Tweet 830841782267105280 is outside of Australia. Skipping it.
Tweet 830841782267105280 is outside of Australia. Skipping it.
New task was queued to ../shared_folder/geo_analyser_tasks_prod/971350673366269954.task.txt


850149515625512960 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/991570268458967041.task.txt
Tweet 991570268458967041 is outside of Australia. Skipping it.
Tweet 991570268458967041 is outside of Australia. Skipping it.
Tweet 991570268458967041 is outside of Australia. Skipping it.
New task was queued to ../shared_folder/geo_analyser_tasks_prod/786835645872295936.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/786835645872295936.task.txt
New task was queued to ../shared_folder/user_tweets_tasks/39878267.task.txt
New task was queued to ../shared_folder/topic_analyser_tasks/786835645872295936.task.txt
Tweet 786835645872295936 is outside of Australia. Skipping it.
Tweet 786835645872295936 is outside of Australia. Skipping it.
New task was queued to ../shared_folder/geo_analyser_tasks_prod/799599664186068992.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/799599664186068992.task.txt
276938035 is known u

783909710 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/980803166013816834.task.txt
Tweet 980803166013816834 is outside of Australia. Skipping it.
Tweet 980803166013816834 is outside of Australia. Skipping it.
New task was queued to ../shared_folder/geo_analyser_tasks_prod/993013540444237824.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/993013540444237824.task.txt
26071407 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/993013540444237824.task.txt
Tweet 993013540444237824 is outside of Australia. Skipping it.
New task was queued to ../shared_folder/geo_analyser_tasks_prod/923468951425753089.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/923468951425753089.task.txt
361713978 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/923468951425753089.task.txt
Tweet 923468951425753089 is outside of Australia. Skipping it.


771162598528004097 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/897744967245942784.task.txt
New task was queued to ../shared_folder/geo_analyser_tasks_prod/986881939234635776.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/986881939234635776.task.txt
1464708144 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/986881939234635776.task.txt
Tweet 986881939234635776 is outside of Australia. Skipping it.
New task was queued to ../shared_folder/geo_analyser_tasks_prod/954687487988740096.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/954687487988740096.task.txt
4773313382 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/954687487988740096.task.txt
Tweet 954687487988740096 is outside of Australia. Skipping it.
Tweet 954687487988740096 is outside of Australia. Skipping it.
Tweet 954687487988740096 is outside of Australia. S

New task was queued to ../shared_folder/geo_analyser_tasks_prod/953801265800085504.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/953801265800085504.task.txt
850149515625512960 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/953801265800085504.task.txt
Tweet 953801265800085504 is outside of Australia. Skipping it.
Tweet 953801265800085504 is outside of Australia. Skipping it.
Tweet 953801265800085504 is outside of Australia. Skipping it.
New task was queued to ../shared_folder/geo_analyser_tasks_prod/904910648584282112.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/904910648584282112.task.txt
17173214 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/904910648584282112.task.txt
Tweet 904910648584282112 is outside of Australia. Skipping it.
Tweet 904910648584282112 is outside of Australia. Skipping it.
New task was queued to ../shared_folder/geo_analyser_task

New task was queued to ../shared_folder/geo_analyser_tasks_prod/988001342210822146.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/988001342210822146.task.txt
202106636 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/988001342210822146.task.txt
Tweet 988001342210822146 is outside of Australia. Skipping it.
Tweet 988001342210822146 is outside of Australia. Skipping it.
Tweet 988001342210822146 is outside of Australia. Skipping it.
New task was queued to ../shared_folder/geo_analyser_tasks_prod/974897325226147840.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/974897325226147840.task.txt
398590518 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/974897325226147840.task.txt
Tweet 974897325226147840 is outside of Australia. Skipping it.
Tweet 974897325226147840 is outside of Australia. Skipping it.
Tweet 974897325226147840 is outside of Australia. Skipping it.
Tw

New task was queued to ../shared_folder/geo_analyser_tasks_prod/796249619294404609.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/796249619294404609.task.txt
62335500 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/796249619294404609.task.txt
Tweet 796249619294404609 is outside of Australia. Skipping it.
New task was queued to ../shared_folder/geo_analyser_tasks_prod/929648026217357314.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/929648026217357314.task.txt
New task was queued to ../shared_folder/user_tweets_tasks/26109967.task.txt
New task was queued to ../shared_folder/topic_analyser_tasks/929648026217357314.task.txt
New task was queued to ../shared_folder/geo_analyser_tasks_prod/984778068391673856.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/984778068391673856.task.txt
994184580 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/

New task was queued to ../shared_folder/geo_analyser_tasks_prod/944358834226675712.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/944358834226675712.task.txt
New task was queued to ../shared_folder/user_tweets_tasks/1413468168.task.txt
New task was queued to ../shared_folder/topic_analyser_tasks/944358834226675712.task.txt
Tweet 944358834226675712 is outside of Australia. Skipping it.
Tweet 944358834226675712 is outside of Australia. Skipping it.
Tweet 944358834226675712 is outside of Australia. Skipping it.
Tweet 944358834226675712 is outside of Australia. Skipping it.
New task was queued to ../shared_folder/geo_analyser_tasks_prod/987465890168913920.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/987465890168913920.task.txt
New task was queued to ../shared_folder/user_tweets_tasks/276827565.task.txt
New task was queued to ../shared_folder/topic_analyser_tasks/987465890168913920.task.txt
New task was queued to ../shared_folder/geo_analyser_

New task was queued to ../shared_folder/geo_analyser_tasks_prod/970521140060856320.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/970521140060856320.task.txt
338707356 is known user. Skipping them.
New task was queued to ../shared_folder/topic_analyser_tasks/970521140060856320.task.txt
Tweet 970521140060856320 is outside of Australia. Skipping it.
Tweet 970521140060856320 is outside of Australia. Skipping it.
Tweet 970521140060856320 is outside of Australia. Skipping it.
Tweet 970521140060856320 is outside of Australia. Skipping it.
New task was queued to ../shared_folder/geo_analyser_tasks_prod/916290961478647810.task.txt
New task was queued to ../shared_folder/sentiment_tasks_prod/916290961478647810.task.txt
New task was queued to ../shared_folder/user_tweets_tasks/3572002514.task.txt
New task was queued to ../shared_folder/topic_analyser_tasks/916290961478647810.task.txt
Tweet 916290961478647810 is outside of Australia. Skipping it.
Tweet 916290961478647810 is

KeyboardInterrupt: 

# Copy tweets for reprocessing

In [None]:
processed_tweets_path, tweets_queue_path

In [None]:
#!for i in ../shared_folder/processed_tweets/*.json; do cp $i ../shared_folder/tweets/; done