In [1]:
import json
import os
import shutil
import sys
from datetime import datetime, date, timedelta

import pandas as pd
from twarc import Twarc

from src.data import generate_dataset
from src.data import clear
import run
%load_ext autoreload
%autoreload 2

In [2]:
def configure_twarc(api_keys_json):
    """Build a Twarc client from a JSON credentials file.

    Args:
        api_keys_json: Path to a JSON file holding the keys 'consumer_key',
            'consumer_secret', 'access_token' and 'access_token_secret'.

    Returns:
        A Twarc instance constructed with those four values, passed
        positionally in the order listed above.

    Raises:
        KeyError: if any of the four keys is absent from the file.
    """
    with open(api_keys_json) as key_file:
        credentials = json.load(key_file)

    # Order matters: the values are handed to Twarc positionally.
    credential_fields = (
        'consumer_key',
        'consumer_secret',
        'access_token',
        'access_token_secret',
    )
    return Twarc(*(credentials[field] for field in credential_fields))

In [3]:
def sample(df, sample_rate, id_column):
    """Return `id_column` for every `sample_rate`-th row of `df`.

    Rows are taken by position starting at the first row (0, sample_rate,
    2 * sample_rate, ...); the original index labels are preserved.
    """
    every_nth_row = df.iloc[slice(None, None, sample_rate)]
    return every_nth_row[id_column]
def sample_files(raw_data_path, sample_rate, dehydrated_sample_path, id_column):
    """Sample the id column of each dataset file and save it as a text file.

    Every entry of `raw_data_path` whose name contains 'dataset' is read with
    `pd.read_table`, reduced with `sample`, and written to
    `dehydrated_sample_path` as '<name up to the first dot>.txt' with no index
    and no header row.

    Args:
        raw_data_path: Directory holding the raw dataset files.
        sample_rate: Row stride forwarded to `sample`.
        dehydrated_sample_path: Output directory; created when missing.
        id_column: Column to keep from each file.
    """
    if not os.path.exists(dehydrated_sample_path):
        os.makedirs(dehydrated_sample_path)

    # Process the files in name order.
    dataset_files = [entry for entry in os.listdir(raw_data_path) if 'dataset' in entry]
    dataset_files.sort()

    for source_name in dataset_files:
        source_path = os.path.join(raw_data_path, source_name)
        id_sample = sample(pd.read_table(source_path), sample_rate, id_column)

        # Output name: everything before the first dot, plus '.txt'.
        fname = source_name.split('.')[0] + '.txt'
        print(f'sampling for dataset on {fname}')

        target_path = os.path.join(dehydrated_sample_path, fname)
        id_sample.to_csv(target_path, index=False, header=None)

In [4]:
# helper function
def obliviate(path):
    """Delete every entry directly under `path`, leaving `path` itself in place.

    Regular files and symbolic links are unlinked; real directories are
    removed recursively with `shutil.rmtree`. Deletion is best-effort: a
    failure on one entry is reported on stdout and the loop moves on.

    Args:
        path: Directory whose contents should be removed.

    Raises:
        OSError: if `path` itself cannot be listed (e.g. it does not exist).
    """
    for fname in os.listdir(path):
        print(f'deleting{fname} under {path}')
        file_path = os.path.join(path, fname)
        try:
            # Test for links first: a link that points at a directory also
            # satisfies isdir(), but rmtree refuses symbolic links, and only
            # the link (never its target) should be removed.
            if os.path.islink(file_path) or os.path.isfile(file_path):
                os.remove(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(f'Failed to delete {file_path}. Reason: {e}')

# clear script
def clean(paths):
    """Empty each directory in `paths` by handing it to `obliviate`.

    Args:
        paths: Iterable of directory paths, processed in iteration order.
    """
    for directory in paths:
        obliviate(directory)
    

In [43]:
def sample_file(file_path, sample_rate, id_column):
    """Load a tab-separated file and return `id_column` for every `sample_rate`-th row.

    Args:
        file_path: Path of the tab-separated file to read.
        sample_rate: Row stride; rows 0, sample_rate, 2 * sample_rate, ... are kept.
        id_column: Name of the column to return.

    Returns:
        The selected column of the strided rows, with original row labels.
    """
    table = pd.read_csv(file_path, sep='\t')
    strided_rows = table.iloc[slice(None, None, sample_rate)]
    return strided_rows[id_column]

def rehydrate_tweets(raw_data_path, project_path, json_data_path, sample_rate, id_column, api_keys_json):
    """Hydrate a sample of tweet ids for every raw file that has no JSONL output yet.

    A raw file counts as pending when its stem (the name up to the first dot)
    does not appear among the stems already present in `json_data_path`. Only
    names containing '202' are considered on either side (presumably a match
    on 2020-dated file names -- confirm against the data layout).

    For each pending stem, '<stem>.tsv' is sampled with `sample_file`, the ids
    are hydrated through Twarc, and one JSON document per line is written to
    '<stem>.jsonl' inside `json_data_path`.

    Output is first written to '<stem>.jsonl.part' and renamed only once
    hydration finishes, so an interrupted run never leaves a truncated file
    that a later run would mistake for a completed one.

    Args:
        raw_data_path: Directory containing the raw '.tsv' id files.
        project_path: Unused; kept so existing callers keep working.
        json_data_path: Directory receiving the '.jsonl' files; created if missing.
        sample_rate: Row stride forwarded to `sample_file`.
        id_column: Name of the tweet-id column in the raw files.
        api_keys_json: Path to the credentials file for `configure_twarc`.
    """
    partial_suffix = '.part'

    t = configure_twarc(api_keys_json)
    os.makedirs(json_data_path, exist_ok=True)

    sample_names = {name.split('.')[0] for name in os.listdir(raw_data_path) if '202' in name}
    # Leftover partial files from an interrupted run must not count as done.
    json_names = {
        name.split('.')[0]
        for name in os.listdir(json_data_path)
        if '202' in name and not name.endswith(partial_suffix)
    }
    missing_names = sample_names - json_names
    print(f'here are the missing jsons: {missing_names}')

    for file in sorted(missing_names):
        # Raw id file for this stem.
        abs_path = os.path.join(raw_data_path, file + '.tsv')
        data_sample = sample_file(abs_path, sample_rate, id_column)

        # Join rather than concatenate, so a json_data_path without a trailing
        # separator still places the file inside the directory.
        abs_target_path = os.path.join(json_data_path, file + '.jsonl')
        partial_path = abs_target_path + partial_suffix
        print(f'saving to {abs_target_path}')

        with open(partial_path, 'w') as outfile:
            for tweet in t.hydrate(data_sample):
                outfile.write(json.dumps(tweet) + '\n')
        # Reached only when hydration completed without an exception.
        os.replace(partial_path, abs_target_path)

In [6]:
# Data-related settings.
with open('./config/data_params.json') as f:
    data_params = json.load(f)

# Directory and file locations.
raw_data_path = data_params['raw_data_path']
project_path = data_params['absolute_project_path']
twarc_path = data_params['twarc_path']
rehydrated_json_path = data_params['rehydrated_json_path']
api_keys_path = data_params['api_keys']

# Dataset selection options.
id_column = data_params['id_column']
from_day = data_params['from_day']
to_day = data_params['to_day']
want_cleaned = data_params['want_cleaned']

# Sampling settings: the row stride used when sampling ids.
with open('./config/sample_params.json') as f:
    sample_params = json.load(f)

sample_rate = sample_params['sample_every']


In [45]:
# Hydrate every raw file that does not yet have a JSONL counterpart.
rehydrate_tweets(
    raw_data_path=raw_data_path,
    project_path=project_path,
    json_data_path=rehydrated_json_path,
    sample_rate=sample_rate,
    id_column=id_column,
    api_keys_json=api_keys_path,
)

here are the missing jsons: {'2020-07-08-dataset', '2020-06-10-dataset', '2020-07-14-dataset', '2020-06-28-dataset', '2020-05-25-dataset', '2020-07-26-dataset', '2020-06-19-dataset', '2020-07-24-dataset', '2020-07-03-dataset', '2020-06-29-dataset', '2020-05-26-dataset', '2020-07-11-dataset', '2020-07-16-dataset', '2020-06-24-dataset', '2020-07-18-dataset', '2020-06-25-dataset', '2020-06-08-dataset', '2020-06-22-dataset', '2020-05-31-dataset', '2020-07-01-dataset', '2020-07-22-dataset', '2020-06-27-dataset', '2020-07-25-dataset', '2020-06-18-dataset', '2020-06-01-dataset', '2020-06-02-dataset', '2020-07-06-dataset', '2020-07-29-dataset', '2020-06-12-dataset', '2020-07-15-dataset', '2020-06-05-dataset', '2020-07-07-dataset', '2020-06-04-dataset', '2020-06-09-dataset', '2020-07-04-dataset', '2020-07-12-dataset', '2020-07-27-dataset', '2020-06-20-dataset', '2020-06-13-dataset', '2020-07-17-dataset', '2020-06-23-dataset', '2020-05-29-dataset', '2020-07-21-dataset', '2020-06-06-dataset', '20



KeyboardInterrupt: 

In [6]:
# Work out which sampled id files have no hydrated output yet.
# NOTE(review): `processed_data_path` is not assigned in any cell visible in
# this notebook, so this cell presumably relies on leftover kernel state --
# confirm, then define it in the config cell or drop this cell.
sample_names = {name.split('.')[0] for name in os.listdir(processed_data_path)}
json_names = {name.split('.')[0] for name in os.listdir(rehydrated_json_path)}
missing_names = sample_names.difference(json_names)
sample_filename = [f'{name}.txt' for name in missing_names]

In [8]:
# generate_dataset.download_latest_datasets(raw_data_path, from_day, to_day, want_cleaned)

In [9]:
# generate_dataset.rehydrate_tweets(raw_data_path, processed_data_path, project_path, rehydrated_json_path, sample_rate, id_column, twarc_path)