In [1]:
import json
import os
import shlex
import shutil
import subprocess
import sys

import pandas as pd

from src.data import clear
from src.data import generate_dataset

In [None]:
def sample(df, sample_rate, id_column):
    """Return the ``id_column`` values of every ``sample_rate``-th row of ``df``.

    Rows are picked positionally, starting with the first row; the original
    index labels of the picked rows are kept on the result.
    """
    every_nth_row = slice(None, None, sample_rate)
    thinned = df.iloc[every_nth_row, :]
    return thinned[id_column]
def sample_files(raw_data_path, sample_rate, dehydrated_sample_path, id_column):
    """Sample the ID column of each dataset file and save it as a ``.txt`` file.

    Every entry of ``raw_data_path`` whose name contains ``'dataset'`` is read
    with ``pd.read_table``, thinned with ``sample`` and written (no index, no
    header) to ``dehydrated_sample_path`` under the part of the original name
    before its first ``'.'`` plus ``'.txt'``. The output directory is created
    when it does not exist yet.
    """
    # make sure there is somewhere to write to
    if not os.path.exists(dehydrated_sample_path):
        os.makedirs(dehydrated_sample_path)

    # candidate inputs, processed in sorted name order
    dataset_names = sorted(entry for entry in os.listdir(raw_data_path) if 'dataset' in entry)

    for dataset_name in dataset_names:
        source_path = os.path.join(raw_data_path, dataset_name)
        frame = pd.read_table(source_path)
        sampled_ids = sample(frame, sample_rate, id_column)

        # output name: text before the first '.' of the input name, plus '.txt'
        fname = dataset_name.partition('.')[0] + '.txt'
        print(f'sampling for dataset on {fname}')

        destination = os.path.join(dehydrated_sample_path, fname)
        sampled_ids.to_csv(destination, index=False, header=None)

In [None]:
# helper function
def obliviate(path):
    """Delete everything inside ``path`` while keeping ``path`` itself.

    Regular files and symbolic links are unlinked; real sub-directories are
    removed recursively. Deletion is best-effort: a failure on one entry is
    reported on stdout and the remaining entries are still processed.

    Parameters
    ----------
    path : str
        Directory whose contents should be removed. ``os.listdir`` raises if
        it does not exist or is not a directory.
    """
    for fname in os.listdir(path):
        file_path = os.path.join(path, fname)
        try:
            # Check for links first: shutil.rmtree refuses to operate on a
            # symlink to a directory, so links are always unlinked instead.
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.remove(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            # keep going so one stubborn entry does not block the rest
            print(f'Failed to delete {file_path}. Reason: {e}')

# clear script
def clean(paths):
    """Run ``obliviate`` on each directory in ``paths``, in the order given."""
    for target_dir in paths:
        obliviate(target_dir)
    

In [None]:
def rehydrate_tweets(raw_data_path, processed_data_path, project_path, json_data_path, sample_rate, id_column, twarc_location):
    """Hydrate every ID file in ``processed_data_path`` into a ``.jsonl`` file with twarc.

    For each entry of ``processed_data_path`` the command
    ``<twarc_location> hydrate <id file>`` is run and its standard output is
    written to ``<project_path><json_data_path>/<stem>.jsonl``, where ``stem``
    is the part of the entry's name before its first ``'.'``.

    The sampling step is not run here: the ID files are expected to exist
    already (e.g. produced by ``sample_files``). ``raw_data_path``,
    ``sample_rate`` and ``id_column`` are therefore accepted but unused.

    Parameters
    ----------
    processed_data_path : str
        Directory containing the ID files; listed relative to the current
        working directory.
    project_path : str
        Prefix that is string-concatenated in front of ``processed_data_path``
        and ``json_data_path`` to build the paths handed to twarc.
        NOTE(review): presumably the project root with a trailing separator,
        with the other two paths relative to it -- confirm against the config.
    json_data_path : str
        Output directory (with or without a trailing separator).
    twarc_location : str
        Command used to invoke twarc; split with shell-like rules, so it may
        contain several tokens.
    """
    # Create the directory the files are actually written to (prefix included),
    # not just `json_data_path` relative to the working directory.
    target_dir = project_path + json_data_path
    os.makedirs(target_dir, exist_ok=True)

    twarc_command = shlex.split(twarc_location)

    for file in sorted(os.listdir(processed_data_path)):
        # path of the txt id file, prefixed with the project path
        abs_path = project_path + os.path.join(processed_data_path, file)
        # os.path.join supplies the separator when json_data_path lacks one
        name = file.split('.')[0] + '.jsonl'
        abs_target_path = os.path.join(target_dir, name)
        print(f'saving to {abs_target_path}')

        # Run twarc without a shell so paths containing spaces or shell
        # metacharacters are passed through verbatim; stdout goes to the file.
        try:
            with open(abs_target_path, 'wb') as target:
                result = subprocess.run(twarc_command + ['hydrate', abs_path], stdout=target)
        except OSError as e:
            # best-effort, like the shell call it replaces: report and move on
            print(f'Failed to hydrate {abs_path}. Reason: {e}')
            continue
        if result.returncode != 0:
            print(f'Failed to hydrate {abs_path}. Reason: exit status {result.returncode}')

In [2]:
# Load the two JSON configuration files; the paths are relative to the
# working directory the notebook is run from.
with open('./config/data_params.json') as f:
    data_params = json.load(f)
with open('./config/sample_params.json') as f:
    sample_params = json.load(f)

# Cfg variables
# Unpack the settings into module-level names used by the cells below.
# A missing key raises KeyError here rather than later in the pipeline.
# NOTE(review): what each value means is inferred from its key name only --
# confirm against the config files.
raw_data_path = data_params['raw_data_path']
# note the rename: the config's "dehydrated" path is called "processed" here
processed_data_path = data_params['dehydrated_data_path']
project_path = data_params['absolute_project_path']
twarc_path = data_params['twarc_path']
rehydrated_json_path = data_params['rehydrated_json_path']
rehydrated_df_path = data_params['rehydrated_df_path']
id_column = data_params['id_column']
from_day = data_params['from_day']
to_day = data_params['to_day']
want_cleaned = data_params['want_cleaned']

# sampling setting comes from the second config file
sample_rate = sample_params['sample_every']


In [3]:
# Calls the imported src.data.clear.clean, not the notebook-local `clean`.
# NOTE(review): presumably empties both directories before the download
# below -- confirm against src/data/clear.py.
clear.clean([raw_data_path, processed_data_path])

In [4]:
# Fetch the daily dataset files; the output below shows one request per day
# for a `<date>-dataset.tsv.gz` file in the thepanacealab/covid19_twitter repo.
# NOTE(review): presumably from_day/to_day bound the date range and
# want_cleaned selects a dataset variant -- confirm in src/data/generate_dataset.py.
generate_dataset.download_latest_datasets(raw_data_path, from_day, to_day, want_cleaned)

requesting from https://github.com/thepanacealab/covid19_twitter/blob/master/dailies/2020-03-22/2020-03-22-dataset.tsv.gz?raw=true
requesting from https://github.com/thepanacealab/covid19_twitter/blob/master/dailies/2020-03-23/2020-03-23-dataset.tsv.gz?raw=true
requesting from https://github.com/thepanacealab/covid19_twitter/blob/master/dailies/2020-03-24/2020-03-24-dataset.tsv.gz?raw=true
requesting from https://github.com/thepanacealab/covid19_twitter/blob/master/dailies/2020-03-25/2020-03-25-dataset.tsv.gz?raw=true
requesting from https://github.com/thepanacealab/covid19_twitter/blob/master/dailies/2020-03-26/2020-03-26-dataset.tsv.gz?raw=true
requesting from https://github.com/thepanacealab/covid19_twitter/blob/master/dailies/2020-03-27/2020-03-27-dataset.tsv.gz?raw=true
requesting from https://github.com/thepanacealab/covid19_twitter/blob/master/dailies/2020-03-28/2020-03-28-dataset.tsv.gz?raw=true
requesting from https://github.com/thepanacealab/covid19_twitter/blob/master/dailie

requesting from https://github.com/thepanacealab/covid19_twitter/blob/master/dailies/2020-05-24/2020-05-24-dataset.tsv.gz?raw=true
requesting from https://github.com/thepanacealab/covid19_twitter/blob/master/dailies/2020-05-25/2020-05-25-dataset.tsv.gz?raw=true
requesting from https://github.com/thepanacealab/covid19_twitter/blob/master/dailies/2020-05-26/2020-05-26-dataset.tsv.gz?raw=true
requesting from https://github.com/thepanacealab/covid19_twitter/blob/master/dailies/2020-05-27/2020-05-27-dataset.tsv.gz?raw=true
requesting from https://github.com/thepanacealab/covid19_twitter/blob/master/dailies/2020-05-28/2020-05-28-dataset.tsv.gz?raw=true
requesting from https://github.com/thepanacealab/covid19_twitter/blob/master/dailies/2020-05-29/2020-05-29-dataset.tsv.gz?raw=true
requesting from https://github.com/thepanacealab/covid19_twitter/blob/master/dailies/2020-05-30/2020-05-30-dataset.tsv.gz?raw=true
requesting from https://github.com/thepanacealab/covid19_twitter/blob/master/dailie

requesting from https://github.com/thepanacealab/covid19_twitter/blob/master/dailies/2020-07-26/2020-07-26-dataset.tsv.gz?raw=true
requesting from https://github.com/thepanacealab/covid19_twitter/blob/master/dailies/2020-07-27/2020-07-27-dataset.tsv.gz?raw=true
requesting from https://github.com/thepanacealab/covid19_twitter/blob/master/dailies/2020-07-28/2020-07-28-dataset.tsv.gz?raw=true
requesting from https://github.com/thepanacealab/covid19_twitter/blob/master/dailies/2020-07-29/2020-07-29-dataset.tsv.gz?raw=true
requesting from https://github.com/thepanacealab/covid19_twitter/blob/master/dailies/2020-07-30/2020-07-30-dataset.tsv.gz?raw=true
requesting from https://github.com/thepanacealab/covid19_twitter/blob/master/dailies/2020-07-31/2020-07-31-dataset.tsv.gz?raw=true


In [None]:
# Calls the imported generate_dataset.rehydrate_tweets, not the notebook-local
# `rehydrate_tweets` defined above. The output below shows a
# "sampling for dataset on ..." line per dataset file.
generate_dataset.rehydrate_tweets(raw_data_path, processed_data_path, project_path, rehydrated_json_path, sample_rate, id_column, twarc_path)

sampling for dataset on 2020-03-22-dataset.txt
sampling for dataset on 2020-03-23-dataset.txt
sampling for dataset on 2020-03-24-dataset.txt
sampling for dataset on 2020-03-25-dataset.txt
sampling for dataset on 2020-03-26-dataset.txt
sampling for dataset on 2020-03-27-dataset.txt
sampling for dataset on 2020-03-28-dataset.txt
sampling for dataset on 2020-03-29-dataset.txt
sampling for dataset on 2020-03-30-dataset.txt
sampling for dataset on 2020-03-31-dataset.txt
sampling for dataset on 2020-04-01-dataset.txt
sampling for dataset on 2020-04-02-dataset.txt
sampling for dataset on 2020-04-03-dataset.txt
sampling for dataset on 2020-04-04-dataset.txt
sampling for dataset on 2020-04-05-dataset.txt
sampling for dataset on 2020-04-06-dataset.txt
sampling for dataset on 2020-04-07-dataset.txt
sampling for dataset on 2020-04-08-dataset.txt
sampling for dataset on 2020-04-09-dataset.txt
sampling for dataset on 2020-04-10-dataset.txt
sampling for dataset on 2020-04-11-dataset.txt
sampling for 