In [None]:
from huggingface_hub import login
from dotenv import load_dotenv
import os, json, glob
from tqdm import tqdm
from datasets import load_dataset, Dataset, Features, Value

# Pull variables from a local .env file into the process environment, then
# authenticate this session with the Hugging Face Hub using the stored token.
load_dotenv()
HUGGINGFACE_API_KEY = os.environ.get("HUGGINGFACE_API_KEY")
login(token=HUGGINGFACE_API_KEY)

In [None]:
# Fetch the existing dataset from the Hub; the trailing bare name makes the
# notebook render its summary as the cell output.
dataset = load_dataset(path="Savoxism/andrew_tate_long_form")
dataset

In [None]:
# Read every .txt file in the local folder into {"id", "content"} records,
# where "id" is the file name without its extension.
folder_path = "data_2"
print(f"Loading local files from {folder_path}...")
local_data = []

# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the resulting record order stable across runs and machines.
file_paths = sorted(glob.glob(os.path.join(folder_path, "*.txt")))

for file_path in tqdm(file_paths, desc="Reading local files"):
    # splitext strips only the final extension, so "a.b.txt" keeps the id
    # "a.b" instead of being truncated to "a" (which could collide with "a.txt").
    file_id = os.path.splitext(os.path.basename(file_path))[0]
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeError) as e:
        # Best effort: report unreadable or non-UTF-8 files and keep going.
        print(f"Error reading {file_path}: {e}")
        continue
    local_data.append({"id": file_id, "content": content})

In [None]:
# Start from the Hub rows (only when a 'train' split exists), keeping just the
# "id" and "content" fields, then append the locally read records.
combined_data = []

if 'train' in dataset:
    combined_data = [
        {"id": row["id"], "content": row["content"]}
        for row in tqdm(dataset['train'], desc="processing hf dataset")
    ]

combined_data += local_data

In [None]:
# Write a local JSON snapshot of the merged records; ensure_ascii=False keeps
# non-ASCII characters readable in the file instead of \u-escaping them.
output_path = "combined_dataset.json"
with open(output_path, mode='w', encoding='utf-8') as json_file:
    json.dump(obj=combined_data, fp=json_file, ensure_ascii=False, indent=2)
    print(f"saved {len(combined_data)} items to {output_path}")

In [None]:
# Explicit schema: both columns are declared as strings.
hf_dataset_features = Features(
    {column: Value("string") for column in ("id", "content")}
)

# Build the dataset from the merged list of records using that schema.
hf_dataset = Dataset.from_list(
    combined_data,
    features=hf_dataset_features,
)

In [None]:
# Bare expression: the notebook renders the dataset's summary as cell output,
# a quick check before uploading.
hf_dataset

In [None]:
# Publish the merged dataset to the Hub as a public (private=False) repository,
# authenticating with the token loaded from the environment.
hf_dataset.push_to_hub(
    repo_id="Savoxism/andrew_tate_long_form_final",
    commit_message="Upload combined dataset with local files",
    private=False,
    token=HUGGINGFACE_API_KEY,
)

print("done")