# Phase 2 – Data Preparation (ETL)

Phase 2 focuses on **cleaning, transformation, and structural parsing** of the job offer data.

In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import datetime
from datetime import timedelta

# Fetch the NLTK stop-word corpus that the text-cleaning step reads later on.
nltk.download('stopwords')

# 1. Load Data
print("Loading Raw Data...")
try:
    df = pd.read_csv("hellowork_final_sectors_data.csv", encoding='utf-8-sig')
except (OSError, ValueError):
    # OSError covers a missing or unreadable file; ValueError covers pandas'
    # EmptyDataError / ParserError as well as UnicodeDecodeError. Anything
    # else (e.g. KeyboardInterrupt) is deliberately left to propagate.
    print("Raw file missing/empty, trying fallback...")
    df = pd.read_csv("hellowork_preprocessed.csv")

# Exact duplicate rows carry no extra information.
df = df.drop_duplicates()

# 2. Handle Text NaNs
# Replace missing text with an explicit placeholder so the string operations
# applied to these columns further down never receive NaN.
text_columns = ["Job_Title", "Company", "Location", "Contract", "Description"]
for col in text_columns:
    df[col] = df[col].fillna("Not specified")

# 3. DATE PARSING (Real Data)
def parse_relative_date(date_str):
    """Convert a publication-date label into a calendar date.

    Relative French labels are resolved against today's date:
    "hier", "aujourd'hui" and "il y a N jours / semaines / mois"
    (a month is approximated as 30 days). An "il y a N ..." label with any
    other unit (e.g. hours) resolves to today. Everything else is parsed as
    an absolute, day-first date.

    Args:
        date_str: Raw label; may be NaN/None or the literal "N/A".

    Returns:
        ``datetime.date`` for relative labels, the result of
        ``pd.to_datetime(..., errors='coerce')`` (a Timestamp or NaT) for
        absolute ones, and ``np.nan`` for missing input or on failure.
    """
    if pd.isna(date_str) or date_str == "N/A":
        return np.nan
    # Normalise the typographic apostrophe so "aujourd’hui" is recognised too.
    label = str(date_str).lower().replace("\u2019", "'")
    today = datetime.date.today()
    try:
        if "hier" in label:
            return today - timedelta(days=1)
        if "aujourd'hui" in label:
            return today
        if "il y a" in label:
            match = re.search(r'\d+', label)
            if match:
                count = int(match.group())
                if "mois" in label:
                    return today - timedelta(days=count * 30)
                if "semaine" in label:
                    return today - timedelta(weeks=count)
                if "jour" in label:
                    return today - timedelta(days=count)
                # Sub-day units (hours, minutes, ...) still mean "today".
                return today
        return pd.to_datetime(label, dayfirst=True, errors='coerce')
    except (ValueError, TypeError, OverflowError):
        # E.g. an absurdly large count overflowing the date arithmetic.
        return np.nan

print("Parsing Publication Dates...")
if 'Publication_Date' in df.columns:
    # First resolve each label individually, then coerce the mixed
    # date / Timestamp / NaN results into one datetime column.
    df['Publication_Date'] = df['Publication_Date'].apply(parse_relative_date)
    df['Publication_Date'] = pd.to_datetime(df['Publication_Date'])
else:
    print("Warning: Publication_Date column missing in input.")

# 4. Salary & Text Cleaning
# Only strip non-digit characters while Salary still holds text. When the
# column is already numeric (e.g. the fallback file written by a previous run)
# the .str accessor would raise, and the values need no further work.
# NOTE(review): removing every non-digit concatenates the bounds of a range
# ("30 000 - 40 000" becomes 3000040000) — confirm whether the raw data
# contains ranges before relying on this column.
if not pd.api.types.is_numeric_dtype(df['Salary']):
    df['Salary'] = pd.to_numeric(df['Salary'].str.replace(r'[^\d]', '', regex=True), errors='coerce')

def clean_salary(salary_str):
    """Extract one numeric salary figure from a salary label.

    All whitespace (including the non-breaking spaces used as thousands
    separators) is removed, then every number is extracted. Both "." and ","
    followed by digits are read as a decimal separator, so "35000.0" is one
    number and "12,50" is 12.5. Several numbers (a range) are averaged.

    Args:
        salary_str: Salary label or number; may be NaN/None or
            "Not specified".

    Returns:
        An int for a single whole number, a float for a decimal value or
        the mean of a range, and ``np.nan`` when nothing numeric is found.
    """
    if pd.isna(salary_str) or salary_str == "Not specified":
        return np.nan
    # str() lets numeric input through; \s+ also matches Unicode spaces such
    # as U+00A0 / U+202F, which would otherwise split "30 000" in two.
    compact = re.sub(r'\s+', '', str(salary_str))
    amounts = [
        float(token.replace(',', '.')) if (',' in token or '.' in token) else int(token)
        for token in re.findall(r'\d+(?:[.,]\d+)?', compact)
    ]
    if not amounts:
        return np.nan
    if len(amounts) == 1:
        return amounts[0]
    return sum(amounts) / len(amounts)

# Derive Salary_Clean by running clean_salary over the stringified Salary values.
salary_as_text = df['Salary'].astype(str)
df['Salary_Clean'] = salary_as_text.map(clean_salary)

# French stop words, held in a set for the membership tests done in clean_text.
french_stop_word_list = stopwords.words('french')
stop_words = set(french_stop_word_list)
# Translation table built once: ASCII punctuation is deleted, except that the
# straight and typographic apostrophes become spaces so that French elisions
# ("l'entreprise", "d’un") split into separate tokens instead of fusing.
_PUNCTUATION_TABLE = str.maketrans(
    {**dict.fromkeys(string.punctuation), "'": " ", "\u2019": " "}
)


def clean_text(text):
    """Normalise free text for downstream text features.

    Lower-cases the text, treats newlines as spaces, removes ASCII
    punctuation (apostrophes become spaces), and drops every token found in
    the module-level ``stop_words`` set.

    Args:
        text: Raw text. Any non-string value (e.g. NaN) yields "".

    Returns:
        The remaining tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower().replace("\n", " ")
    without_punctuation = lowered.translate(_PUNCTUATION_TABLE)
    # split() with no argument already discards leading/trailing whitespace.
    return " ".join(
        word for word in without_punctuation.split() if word not in stop_words
    )

# Build the cleaned description column used for text features.
df['Description_Clean'] = df['Description'].apply(clean_text)

# 5. Encoding
categorical_cols = ['Contract', 'Location', 'Sector']
for col in categorical_cols:
    le = LabelEncoder()
    # 'Sector' is not part of the text-NaN fill in step 2, so it may still
    # hold NaN. Missing labels are encoded under the same "Not specified"
    # placeholder so the encoder only ever sees strings; the source column
    # itself is left untouched.
    labels = df[col].fillna("Not specified")
    df[col + '_Encoded'] = le.fit_transform(labels)

print("Saving...")
# index=False keeps the row index out of the CSV.
df.to_csv("hellowork_preprocessed.csv", index=False)
print("ETL Done.")