In [None]:
import pandas as pd

# === Load each dataset ===
coursera = pd.read_csv("coursera.csv")  # Replace with your filename
udemy = pd.read_csv("udemy_courses.csv")
edx = pd.read_csv("edx_courses.csv")

# === Clean and rename columns ===

# --- Coursera ---
coursera = coursera.rename(columns={
    "Course Name": "course_title",
    "Rating": "rating",
    "Difficulty Level": "level",
    "Subjects": "tags"
})
coursera["platform"] = "Coursera"
coursera = coursera[["course_title", "platform", "tags", "level", "rating"]]

# --- Udemy ---
udemy = udemy.rename(columns={
    "title": "course_title",
    "level": "level",
    "subject": "tags",
    "num_subscribers": "rating"  # We use subscribers as proxy since actual rating may not be present
})
udemy["platform"] = "Udemy"
udemy["rating"] = udemy["rating"].apply(lambda x: min(x / 1000, 5))  # Normalize to 0–5
udemy = udemy[["course_title", "platform", "tags", "level", "rating"]]

# --- edX ---
edx = edx.rename(columns={
    "course_title": "course_title",
    "difficulty": "level",
    "subject": "tags"
})
edx["platform"] = "edX"
edx["rating"] = 4.0  # edX dataset often lacks ratings; you can assign a default
edx = edx[["course_title", "platform", "tags", "level", "rating"]]

# === Combine all datasets ===
combined_df = pd.concat([coursera, udemy, edx], ignore_index=True)

# === Clean values ===
combined_df["tags"] = combined_df["tags"].fillna("").astype(str).str.replace(",", ";").str.replace("|", ";")
combined_df["level"] = combined_df["level"].fillna("Beginner")
combined_df["rating"] = combined_df["rating"].fillna(0)

# === Save final merged file ===
combined_df.to_csv("courses.csv", index=False)
print("✅ Merged dataset saved as courses.csv with", len(combined_df), "courses.")


In [10]:
print(coursera.columns.tolist())

['Unnamed: 0', 'course_title', 'course_organization', 'course_Certificate_type', 'course_rating', 'course_difficulty', 'course_students_enrolled', 'platform']
