In [6]:
import os

folders = [
    'data/feed_simulated',
    'data/processed',
    'logs'
]

for folder in folders:
    os.makedirs(folder, exist_ok=True)

print("✅ Folder structure ready.")


✅ Folder structure ready.


In [7]:
import os
import pandas as pd
from datetime import datetime, timedelta

# -------------------------
# CONFIGURATION
# -------------------------
FULL_DATASET = '/content/postings_cleaned.csv'  # <-- Your full uploaded file
FEED_FOLDER = 'data/feed_simulated'
OUTPUT_FOLDER = 'data/processed'
LOG_FILE = 'logs/ingestion.log'
SKILLS_COLUMN = 'job_skills_cleaned'  # Adjust if needed
START_DATE = datetime(2025, 7, 1)  # Simulated start date for day1

# -------------------------
# SETUP FOLDERS
# -------------------------
for folder in [FEED_FOLDER, OUTPUT_FOLDER, 'logs']:
    os.makedirs(folder, exist_ok=True)

# -------------------------
# SPLIT FULL DATA INTO 10 FILES
# -------------------------
if not os.path.exists(os.path.join(FEED_FOLDER, 'day1.csv')):
    print("📦 Splitting 'full_jobs.csv' into 10 feed files...")
    try:
        df_full = pd.read_csv(FULL_DATASET)
        chunk_size = len(df_full) // 10
        for i in range(10):
            start = i * chunk_size
            end = (i + 1) * chunk_size if i < 9 else len(df_full)
            chunk = df_full.iloc[start:end]
            chunk.to_csv(os.path.join(FEED_FOLDER, f'day{i+1}.csv'), index=False)
        print("✅ Done: Created day1.csv to day10.csv in data/feed_simulated/")
    except Exception as e:
        print(f"❌ Error reading '{FULL_DATASET}': {e}")
        exit()

# -------------------------
# FUNCTIONS
# -------------------------

def get_next_file():
    processed = set()
    if os.path.exists(LOG_FILE):
        with open(LOG_FILE, 'r') as f:
            processed = set(line.strip() for line in f)
    all_files = sorted(f for f in os.listdir(FEED_FOLDER) if f.endswith('.csv'))
    for file in all_files:
        if file not in processed:
            return file
    return None

def extract_top_skills(df):
    all_skills = df[SKILLS_COLUMN].dropna().str.split(',').explode().str.strip().str.lower()
    top_skills = all_skills.value_counts().head(20).reset_index()
    top_skills.columns = ['skill', 'count']
    return top_skills

def save_output(df, output_path):
    df.to_csv(output_path, index=False)
    print(f"✅ Saved: {output_path}")

def log_file(file_name):
    with open(LOG_FILE, 'a') as f:
        f.write(file_name + '\n')

def main():
    next_file = get_next_file()
    if not next_file:
        print("🎉 All files processed.")
        return

    print(f"📥 Processing: {next_file}")
    try:
        df = pd.read_csv(os.path.join(FEED_FOLDER, next_file))
    except Exception as e:
        print(f"❌ Error reading {next_file}: {e}")
        return

    if SKILLS_COLUMN not in df.columns:
        print(f"❌ Column '{SKILLS_COLUMN}' not found in data.")
        return

    top_skills = extract_top_skills(df)

    day_id = next_file.replace('.csv', '')        # e.g., 'day1'
    day_num = int(day_id[3:])                     # e.g., 1
    simulated_date = START_DATE + timedelta(days=day_num - 1)

    top_skills['date'] = simulated_date.date()

    output_file = os.path.join(OUTPUT_FOLDER, f'skills_{day_id}.csv')
    save_output(top_skills, output_file)
    log_file(next_file)
    print(f"📝 Logged: {next_file}")

# -------------------------
# RUN LOOP FOR ALL 10 DAYS
# -------------------------
if __name__ == '__main__':
    for _ in range(10):
        main()


📦 Splitting 'full_jobs.csv' into 10 feed files...
✅ Done: Created day1.csv to day10.csv in data/feed_simulated/
📥 Processing: day1.csv
✅ Saved: data/processed/skills_day1.csv
📝 Logged: day1.csv
📥 Processing: day10.csv
✅ Saved: data/processed/skills_day10.csv
📝 Logged: day10.csv
📥 Processing: day2.csv
✅ Saved: data/processed/skills_day2.csv
📝 Logged: day2.csv
📥 Processing: day3.csv
✅ Saved: data/processed/skills_day3.csv
📝 Logged: day3.csv
📥 Processing: day4.csv
✅ Saved: data/processed/skills_day4.csv
📝 Logged: day4.csv
📥 Processing: day5.csv
✅ Saved: data/processed/skills_day5.csv
📝 Logged: day5.csv
📥 Processing: day6.csv
✅ Saved: data/processed/skills_day6.csv
📝 Logged: day6.csv
📥 Processing: day7.csv
✅ Saved: data/processed/skills_day7.csv
📝 Logged: day7.csv
📥 Processing: day8.csv
✅ Saved: data/processed/skills_day8.csv
📝 Logged: day8.csv
📥 Processing: day9.csv
✅ Saved: data/processed/skills_day9.csv
📝 Logged: day9.csv


In [8]:
main()


🎉 All files processed.


In [9]:
import os

feed_files = sorted(os.listdir('data/feed_simulated/'))
print("📁 Feed Files Found:", feed_files)


📁 Feed Files Found: ['day1.csv', 'day10.csv', 'day2.csv', 'day3.csv', 'day4.csv', 'day5.csv', 'day6.csv', 'day7.csv', 'day8.csv', 'day9.csv']
