In [None]:
import os
import json
import glob
from tqdm import tqdm
from datasets import load_dataset, Dataset, Features, Value

# Hub repository that holds the base corpus this notebook extends.
HUB_SOURCE_REPO = "Savoxism/andrew-tate-long-form"

dataset = load_dataset(HUB_SOURCE_REPO)
dataset  # bare last expression so the notebook displays the loaded object

In [None]:
from huggingface_hub import login
from dotenv import load_dotenv
# Load variables from a local .env file (if present) into the environment
# before looking up the token.
load_dotenv()

# The token is read from the environment rather than written in the notebook;
# it is None when HUGGINGFACE_API_KEY is not set.
HUGGINGFACE_API_KEY = os.environ.get("HUGGINGFACE_API_KEY")
login(token=HUGGINGFACE_API_KEY)

In [None]:
# Folder (relative to the notebook's working directory) with the extra
# transcripts; each "*.txt" file becomes one {"id", "content"} record.
folder_path = "vast_1"
print(f"Loading local files from {folder_path}...")
local_data = []

# glob returns matches in arbitrary order; sorting keeps the row order of the
# resulting dataset identical from one run to the next.
file_paths = sorted(glob.glob(os.path.join(folder_path, "*.txt")))

if not file_paths:
    # Make an empty/missing folder visible instead of silently producing a
    # "combined" dataset that contains no local records.
    print(f"Warning: no .txt files found in {folder_path}")

for file_path in tqdm(file_paths, desc="Reading local files"):
    # Strip only the final extension so a name such as "a.b.txt" keeps the id
    # "a.b" instead of collapsing to "a" (which could collide with "a.txt").
    file_id = os.path.splitext(os.path.basename(file_path))[0]
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeError) as e:
        # Best effort: report the unreadable / non-UTF-8 file and move on.
        print(f"Error reading {file_path}: {e}")
        continue
    local_data.append({"id": file_id, "content": content})

In [None]:
def concatenate_datasets(hf_dataset, local_data):
    """Return one list with the Hub 'train' records followed by the local ones.

    Args:
        hf_dataset: Container that supports ``'train' in hf_dataset`` and
            ``hf_dataset['train']``; every item of the 'train' split must
            provide the keys "id" and "content".
        local_data: Iterable of records to append after the Hub records.

    Returns:
        A new list. Hub items are copied into fresh ``{"id", "content"}``
        dicts; the local records are appended as-is. When there is no
        'train' entry, only the local records are returned.
    """
    print("Concatenating datasets...")

    hub_records = []
    if 'train' in hf_dataset:
        # Keep just the two fields used downstream, dropping anything else.
        hub_records = [
            {"id": row["id"], "content": row["content"]}
            for row in tqdm(hf_dataset['train'], desc="Processing HF dataset")
        ]

    return hub_records + list(local_data)


def save_combined_dataset(combined_data, output_path="combined_dataset.json"):
    """Write the records to ``output_path`` as indented UTF-8 JSON.

    Args:
        combined_data: JSON-serialisable, sized collection of records.
        output_path: Destination file; it is overwritten if it exists.

    Non-ASCII characters are written verbatim (``ensure_ascii=False``)
    rather than as ``\\uXXXX`` escapes.
    """
    print(f"Saving combined dataset to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as out_file:
        out_file.write(json.dumps(combined_data, ensure_ascii=False, indent=2))
    record_count = len(combined_data)
    print(f"Saved {record_count} records to {output_path}")


# Merge the Hub records with the local files, then keep a JSON snapshot of
# the merged records on disk.
combined_data = concatenate_datasets(hf_dataset=dataset, local_data=local_data)
save_combined_dataset(combined_data, output_path="combined_dataset.json")

print("Dataset processing complete!")

In [None]:
print("Converting combined data to Huggingface dataset format...")
# Both columns are declared explicitly as strings instead of being inferred.
features = Features({'id': Value('string'), 'content': Value('string')})
hf_dataset = Dataset.from_list(combined_data, features=features)

print("Pushing dataset to Huggingface Hub...")
# No token is passed explicitly here; presumably the upload relies on the
# credentials from the earlier login() cell -- confirm before running on a
# fresh machine.
push_options = {
    "token": None,
    "private": False,
    "commit_message": "Upload combined dataset with additional content",
}
hf_dataset.push_to_hub("Savoxism/andrew-tate-long-form_2", **push_options)

print("Dataset successfully uploaded to Huggingface!")