# Unify dataset

Collect all the dataset batch files and combine them into a single dataset.

In [None]:
import pandas as pd
import os
import shutil
import glob
import time

In [None]:
# Root directory that is searched for the "dataset*" batch folders and under
# which the "unifiedDataset" output directory is created.
path = "."

In [None]:
# Recreate the output directory from scratch so that files left over from a
# previous run are not mixed into the new unified dataset.
unified_path = os.path.join(path, "unifiedDataset")
# ignore_errors=True: a missing directory (first run) is not an error.
shutil.rmtree(unified_path, ignore_errors=True)
os.makedirs(unified_path)

In [None]:
# NOTE(review): NO_TOPICS is not referenced anywhere in the visible part of
# this notebook -- presumably the number of topic folders per batch; confirm
# whether it is still needed.
NO_TOPICS = 8

In [None]:
def load_data(file_name, sr):
    """Read a CSV file into a DataFrame, tagging tweet files with their topic.

    Args:
        file_name: Path of the CSV file to read.
        sr: Value forwarded to ``pandas.read_csv`` as ``skiprows``; the first
            remaining row is used as the header.

    Returns:
        The loaded DataFrame. When the file is named ``tweets.csv`` a
        ``topic`` column is added holding the name of the directory that
        contains the file.
    """
    frame = pd.read_csv(file_name, skiprows=sr, header=0)
    parent_dir, base_name = os.path.split(file_name)
    if base_name == "tweets.csv":
        # The topic is encoded in the name of the enclosing directory.
        frame["topic"] = os.path.basename(parent_dir)
    return frame

In [None]:
# Collect every per-topic tweets.csv found one level below each "dataset*"
# batch directory.
datasets = [
    tweets_csv
    for batch_dir in glob.glob(os.path.join(path, "dataset*"))
    for tweets_csv in glob.glob(os.path.join(batch_dir, "*/tweets.csv"))
]
# Load each file without skipping any rows and stack them into one frame.
all_tweets_df = pd.concat([load_data(tweets_csv, 0) for tweets_csv in datasets])

In [None]:
# Keep the first occurrence of every tweet id, then of every tweet text.
all_tweets_df = (
    all_tweets_df
    .drop_duplicates(subset="id")
    .drop_duplicates(subset="text")
)

In [None]:
# Print a summary of all_tweets_df (columns, dtypes, non-null counts).
all_tweets_df.info()

In [None]:
# For every topic directory (the folder holding a tweets.csv), list its
# timestamp-named virality CSVs in chronological order and keep the first 24.
virality_files = []
for topic_dir in map(os.path.dirname, datasets):
    snapshots = glob.glob(os.path.join(topic_dir, "[0-9-T_]*.csv"))
    # The 19 characters before ".csv" hold the snapshot timestamp.
    snapshots.sort(
        key=lambda csv_path: time.strptime(csv_path[-23:-4], "%Y-%m-%dT%H_%M_%S")
    )
    virality_files.append(snapshots[:24])

In [None]:
def combine_dfs(frames, name):
    """Concatenate *frames*, drop duplicate ids and save the result as CSV.

    Args:
        frames: DataFrames to concatenate; each must have an ``id`` column.
        name: Output file name. Everything from the first ``.`` onwards is
            discarded and ``.csv`` is appended.

    The file is written into the module-level ``unified_path`` directory,
    without the index and encoded as UTF-8.
    """
    merged = pd.concat(frames).drop_duplicates(subset="id")
    stem = name.split(".")[0]
    target = os.path.join(unified_path, stem + ".csv")
    merged.to_csv(target, index=False, encoding="utf-8")

In [None]:
# zip(*virality_files) groups the i-th snapshot of every topic directory
# together (i is the hour number at which the tweets were retrieved); each
# group is merged and written to "<i>.csv".
for hour, hourly_files in enumerate(zip(*virality_files)):
    combine_dfs([load_data(csv_path, 0) for csv_path in hourly_files], str(hour))

In [None]:
# Merge every per-topic tweets.csv and write the result to "tweets.csv" in the
# unified directory (combine_dfs de-duplicates on id).
# NOTE(review): the text-based de-duplication applied to all_tweets_df is not
# applied to this saved file -- confirm that is intended.
combine_dfs([load_data(f, 0) for f in datasets], "tweets")

# Reduced dataset

We have unified the data batches into one dataset. Now we need to reduce it by making the number of tweets with zero retweets equal to the number of tweets with at least one retweet.

In [None]:
# From here on, path points at the unified dataset directory.
path = "./unifiedDataset"

In [None]:
# Load the unified tweets table; the first row is the header.
tweets_df = pd.read_csv(os.path.join(path, "tweets.csv"), header=0)

In [None]:
# Load the virality snapshot saved as "23.csv" -- the last of the (at most 24)
# combined snapshot files, numbered from 0.
# NOTE(review): this file only exists if every topic directory provided at
# least 24 snapshot files -- confirm.
vir_df = pd.read_csv(os.path.join(path, "23.csv"), header=0)

In [None]:
# Attach the virality columns to each tweet. validate="one_to_one" makes the
# merge fail if an id appears more than once on either side.
tweets_df = pd.merge(tweets_df, vir_df, on="id", validate="one_to_one")

In [None]:
# Number of tweets whose retweet count is not zero; this is the size each
# class is reduced to.
sample_len = int((tweets_df["retweets"] != 0).sum())
print(sample_len)

In [None]:
# Balance the two classes: keep the same number of zero-retweet tweets and
# tweets with a non-zero retweet count.
zero_rt_mask = tweets_df["retweets"] == 0
# DataFrame.sample raises ValueError when n exceeds the population (sampling is
# without replacement by default), so cap the per-class size at the smaller
# class instead of assuming zero-retweet tweets are always the majority.
balanced_len = min(int(zero_rt_mask.sum()), int((~zero_rt_mask).sum()))
no_rt_df = tweets_df.loc[zero_rt_mask].sample(n=balanced_len, random_state=42)
# When retweeted tweets are the smaller class (the expected case) this keeps
# all of them, only shuffled, which is what sample(frac=1) did before.
rt_df = tweets_df.loc[~zero_rt_mask].sample(n=balanced_len, random_state=42)
tweets_df = pd.concat([rt_df, no_rt_df])

In [None]:
# Recreate the output directory for the reduced dataset from scratch.
# NOTE(review): path was set to "./unifiedDataset" earlier, so this directory
# lives inside the unified dataset folder and is removed whenever that folder
# is rebuilt -- confirm that is intended.
reduced_path = os.path.join(path, "reducedDataset")
# ignore_errors=True: a missing directory (first run) is not an error.
shutil.rmtree(reduced_path, ignore_errors=True)
os.makedirs(reduced_path)

In [None]:
# Save the balanced dataset as "all.csv", without the index, encoded as UTF-8.
tweets_df.to_csv(os.path.join(reduced_path, "all.csv"), index=False, encoding="utf-8")