In [4]:
import pandas as pd
import os

def split_dataset(input_file, output_folder, num_files=10):
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Read the full dataset with error handling
    try:
        df = pd.read_csv(input_file, on_bad_lines='warn')  # Skip rows with column mismatch
    except FileNotFoundError:
        print(f"❌ File not found: {input_file}")
        return
    except Exception as e:
        print(f"❌ Error reading file: {e}")
        return

    total_rows = len(df)
    chunk_size = total_rows // num_files

    print(f"📄 Total rows after cleanup: {total_rows}")
    print(f"📦 Splitting into {num_files} files (~{chunk_size} rows each)")

    # Split and save
    for i in range(num_files):
        start = i * chunk_size
        end = (i + 1) * chunk_size if i < num_files - 1 else total_rows
        chunk = df.iloc[start:end]
        output_path = os.path.join(output_folder, f'day{i+1}.csv')
        chunk.to_csv(output_path, index=False)
        print(f"✅ Created {output_path} with {len(chunk)} rows")

    print("\n🎉 Dataset splitting complete!")


if __name__ == "__main__":
    # Modify these paths if needed
    input_file = '/content/postings_cleaned.csv'
    output_folder = 'data/feed_simulated/'

    split_dataset(input_file, output_folder)


Skipping line 1727: expected 13 fields, saw 16
Skipping line 2379: expected 13 fields, saw 14
Skipping line 3260: expected 13 fields, saw 15
Skipping line 3954: expected 13 fields, saw 21
Skipping line 4368: expected 13 fields, saw 14

  df = pd.read_csv(input_file, on_bad_lines='warn')  # Skip rows with column mismatch


📄 Total rows after cleanup: 5194
📦 Splitting into 10 files (~519 rows each)
✅ Created data/feed_simulated/day1.csv with 519 rows
✅ Created data/feed_simulated/day2.csv with 519 rows
✅ Created data/feed_simulated/day3.csv with 519 rows
✅ Created data/feed_simulated/day4.csv with 519 rows
✅ Created data/feed_simulated/day5.csv with 519 rows
✅ Created data/feed_simulated/day6.csv with 519 rows
✅ Created data/feed_simulated/day7.csv with 519 rows
✅ Created data/feed_simulated/day8.csv with 519 rows
✅ Created data/feed_simulated/day9.csv with 519 rows
✅ Created data/feed_simulated/day10.csv with 523 rows

🎉 Dataset splitting complete!
