
# ================================================
# Data Collection & Pre-processing Pipeline
# ================================================

### ----------------- STEP 1: Import Libraries -----------------

In [1]:
import pandas as pd
import json
from datetime import datetime

### ----------------- STEP 2: Paths -----------------
### Put your CSV in the notebook's working directory (e.g. your mounted Drive folder), or edit the paths below


In [2]:
# Raw input CSV; a bare filename, so it is resolved relative to the working directory.
INPUT_CSV_PATH = "Skill Gap Analysis & Career Path Optimization.csv"
# Final output CSV (cleaned + reordered).
OUTPUT_CSV_PATH = "Skill_Gap_Cleaned.csv"
# JSON report describing the dataset and the cleaning steps applied.
REPORT_JSON_PATH = "data_cleaning_report.json"

### ----------------- STEP 3: Load Data -----------------

In [3]:
# Read the raw dataset into `df`; the cleaning step works on a copy of this frame.
df = pd.read_csv(INPUT_CSV_PATH)

### ----------------- STEP 4: Cleaning Functions -----------------

In [4]:
def tokenize_list(cell):
    """Split a comma-separated cell into a list of trimmed, non-empty tokens.

    Parameters
    ----------
    cell : Any
        Raw cell value. Strings are split on ","; other scalars are converted
        with ``str`` first. A list or tuple (for example a cell that has
        already been tokenized) is accepted and its items are re-trimmed.

    Returns
    -------
    list of str
        Tokens in their original order; ``[]`` for missing values.
    """
    # Handle sequences before the missing-value check: pd.isna() on a list
    # returns an array, and using that array in `if` raises ValueError.
    if isinstance(cell, (list, tuple)):
        parts = [str(item) for item in cell]
    elif pd.isna(cell):
        return []
    else:
        parts = str(cell).split(",")
    # Trim each piece and drop the empty ones (e.g. from "a,,b" or "a, ").
    return [token for token in (part.strip() for part in parts) if token]

def safe_numeric(series):
    """Convert ``series`` to a numeric dtype.

    Values that cannot be parsed as numbers are coerced to NaN instead of
    raising.
    """
    converted = pd.to_numeric(series, errors="coerce")
    return converted

def safe_datetime(series, fmt="%d-%m-%Y"):
    """Parse ``series`` into datetimes using the ``fmt`` strptime pattern.

    Entries that do not match the pattern are coerced to NaT instead of
    raising.
    """
    parse_options = {"format": fmt, "errors": "coerce"}
    return pd.to_datetime(series, **parse_options)

### ----------------- STEP 5: Cleaning -----------------

In [5]:
# Work on a copy so the raw frame `df` is left untouched.
clean = df.copy()

# Dates: both columns are parsed with the day-month-year pattern.
for date_col in ("posting_date", "application_deadline"):
    clean[date_col] = safe_datetime(clean[date_col], "%d-%m-%Y")

# Expand abbreviations
exp_map = {"SE": "Senior", "MI": "Mid-level", "EN": "Entry-level", "EX": "Executive"}
emp_map = {"CT": "Contract", "FL": "Freelance", "PT": "Part-time", "FT": "Full-time"}
size_map = {"S": "Small", "M": "Medium", "L": "Large"}

abbreviation_maps = {
    "experience_level": exp_map,
    "employment_type": emp_map,
    "company_size": size_map,
}
for coded_col, code_map in abbreviation_maps.items():
    if coded_col not in clean.columns:
        continue
    raw_codes = clean[coded_col]
    # .map() yields NaN for codes missing from the map; keep the original
    # value in those positions.
    clean[coded_col] = raw_codes.map(code_map).fillna(raw_codes)

# Tokenize skills: each present column becomes a column of token lists.
for skills_col in ("skills", "required_skills"):
    if skills_col in clean.columns:
        clean[skills_col] = clean[skills_col].apply(tokenize_list)

# Ensure numerics: coerce every listed column that exists in the frame.
numeric_candidates = [
    "salary_usd", "salary", "age", "years_experience",
    "job_description_length", "benefits_score", "remote_ratio",
]
for num_col in numeric_candidates:
    if num_col in clean.columns:
        clean[num_col] = safe_numeric(clean[num_col])

### ----------------- STEP 6: Column Reorder -----------------
### Move 'salary' to be right after 'salary_currency'

In [6]:
# Reorder only when both columns are present; otherwise leave `clean` as is.
has_both = "salary" in clean.columns and "salary_currency" in clean.columns
if has_both:
    ordered = list(clean.columns)
    # Pull 'salary' out, then re-insert it directly after 'salary_currency'.
    ordered.remove("salary")
    anchor = ordered.index("salary_currency")
    ordered.insert(anchor + 1, "salary")
    clean = clean[ordered]

### ----------------- STEP 7: Save Outputs -----------------

In [7]:
# Write the cleaned frame without the index column.
# NOTE(review): the tokenized skill columns hold Python lists, which to_csv
# writes as their string representation — confirm downstream readers expect that.
clean.to_csv(OUTPUT_CSV_PATH, index=False)

# Build a JSON-serializable summary of the input data and the cleaning applied.
# Row/column counts describe the raw frame `df`; "columns_after" reflects `clean`.
report = {
    "dataset_description": {
        "source": INPUT_CSV_PATH,
        "rows": int(df.shape[0]),
        "cols": int(df.shape[1]),
        "columns_before": list(df.columns),
        "columns_after": list(clean.columns),
    },
    "cleaning_steps": {
        "date_conversion": ["posting_date", "application_deadline"],
        # Mappings are only applied to columns that exist, so report only
        # those (same filtering as the tokenized/numeric entries below).
        "mappings": [c for c in ["experience_level", "employment_type", "company_size"]
                     if c in df.columns],
        "tokenized_fields": [c for c in ["skills", "required_skills"] if c in df.columns],
        "numeric_coercions": [c for c in ["salary_usd","salary","age","years_experience",
                                          "job_description_length","benefits_score","remote_ratio"]
                              if c in df.columns],
        "reordering": "Placed 'salary' immediately after 'salary_currency' in final CSV"
    },
    "notes": [
        "All dates converted to datetime (coerce on parse errors).",
        "Abbreviations expanded where present.",
        "Skills columns converted to token lists.",
        "Numeric columns coerced to numeric.",
        "Final CSV saved with requested column order."
    ],
    # Local wall-clock time of the run, to second precision.
    "timestamp": datetime.now().isoformat(timespec="seconds")
}

# ensure_ascii=False keeps any non-ASCII text readable in the report file.
with open(REPORT_JSON_PATH, "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2, ensure_ascii=False)

print("✅ Cleaning & reordering complete!")
print(f"Cleaned dataset saved to: {OUTPUT_CSV_PATH}")
print(f"Cleaning report saved to: {REPORT_JSON_PATH}")


✅ Cleaning & reordering complete!
Cleaned dataset saved to: Skill_Gap_Cleaned.csv
Cleaning report saved to: data_cleaning_report.json


### ----------------- STEP 8: Quick Preview -----------------

In [8]:
# Display the first 10 rows of the cleaned frame as a visual sanity check.
clean.head(10)

Unnamed: 0,name,job_title,salary_usd,salary_currency,salary,experience_level,employment_type,company_location,company_size,employee_residence,...,education_required,years_experience,industry,posting_date,application_deadline,job_description_length,benefits_score,company_name,age,skills
0,Matthew Hernandez,Senior Data Scientist,90376.0,USD,90886,Senior,Contract,China,Medium,China,...,Bachelor,9,Automotive,2024-10-18,2024-11-07,1076,5.9,Smart Analytics,41,"[ETL, Architecture, Power BI, Kafka, NLP, Spark]"
1,Tyler Mcdonald,Senior Software Engineer,61895.0,USD,88450,Senior,Contract,Canada,Medium,Ireland,...,Master,1,Media,2024-11-20,2025-01-11,1268,5.2,TechCorp Inc,41,"[AWS, NLP, Recommendation Systems, Deep Learni..."
2,Stacey Brown,Tech Lead,152626.0,USD,65087,Senior,Freelance,Switzerland,Large,South Korea,...,Associate,2,Education,2025-03-18,2025-04-07,1974,9.4,Autonomous Tech,44,"[React, Linux, Machine Learning, Java, Power B..."
3,Wayne Mcdonald,Backend Developer,80215.0,USD,62121,Mid-level,Freelance,India,Medium,India,...,PhD,7,Consulting,2024-12-23,2025-02-24,1345,8.6,Future Systems,31,"[Communication, Data Analysis, React, Kubernetes]"
4,Lynn Mitchell,Software Engineer,54624.0,EUR,56739,Mid-level,Part-time,France,Small,Singapore,...,Master,0,Media,2025-04-15,2025-06-23,1989,6.6,Advanced Robotics,33,"[NumPy, Spark, Kafka, Testing/QA]"
5,Dana Terrell,Tech Lead,123574.0,EUR,72311,Senior,Contract,Germany,Medium,Germany,...,Associate,7,Healthcare,2024-08-31,2024-10-04,819,5.9,Neural Networks Co,30,"[Linux, Java, Stakeholder Management, C++, Sci..."
6,James Juarez,Backend Developer,79670.0,GBP,87382,Mid-level,Freelance,United Kingdom,Small,United Kingdom,...,Associate,3,Gaming,2024-12-29,2025-02-28,1936,6.3,DataVision Ltd,27,"[Git, Kafka, Teamwork, React]"
7,Jake Davis,Backend Developer,70640.0,EUR,81060,Mid-level,Freelance,France,Large,France,...,Master,0,Healthcare,2024-06-07,2024-07-01,1286,7.6,Cloud AI Solutions,33,"[Kafka, Tableau, Git, Python]"
8,Mary Hogan,Senior Data Scientist,160710.0,USD,112168,Senior,Contract,Singapore,Large,Singapore,...,PhD,7,Government,2024-11-04,2024-11-24,551,9.3,Quantum Computing Inc,42,"[Data Visualization, ETL, Feature Engineering,..."
9,Jessica Jones,AI Specialist,102557.0,USD,107634,Senior,Part-time,Austria,Medium,Austria,...,Master,5,Government,2024-10-20,2024-11-06,2340,5.8,Cloud AI Solutions,32,"[Feature Engineering, Project Management, Tens..."


In [9]:
# Print each column's dtype and non-null count to confirm the conversions took effect.
clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30038 entries, 0 to 30037
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   name                    30038 non-null  object        
 1   job_title               30038 non-null  object        
 2   salary_usd              30038 non-null  float64       
 3   salary_currency         30038 non-null  object        
 4   salary                  30038 non-null  int64         
 5   experience_level        30038 non-null  object        
 6   employment_type         30038 non-null  object        
 7   company_location        30038 non-null  object        
 8   company_size            30038 non-null  object        
 9   employee_residence      30038 non-null  object        
 10  remote_ratio            30038 non-null  int64         
 11  required_skills         30038 non-null  object        
 12  education_required      30038 non-null  object