In [None]:
# 01-data-cleaning.ipynb
# Purpose: Clean and prepare raw LinkedIn job data for analysis

import pandas as pd

# Load the raw export; the first spreadsheet row holds the column names.
df = pd.read_excel('linkedin-jobs-africa.xlsx', header=0)

# Check initial structure. The inspection results are printed explicitly:
# a bare expression is only displayed when it is the last statement of a
# notebook cell, so without print() these checks would show nothing.
print(df.columns)
print(df.head())

# Share of missing values per column, highest first
print(df.isnull().mean().sort_values(ascending=False))

# Drop irrelevant column with mostly missing values
df = df.drop(columns=['SALARY'])

# Check missing values again
print(df.isnull().mean().sort_values(ascending=False))

# Normalize text fields (lowercase and strip whitespace)
for col in ['TITLE', 'COMPANY', 'DESCRIPTION', 'LOCATION']:
    raw = df[col]
    # astype(str) turns missing cells into the literal text 'nan'; mask those
    # positions back to NaN so missing values stay missing in the output.
    df[col] = raw.astype(str).str.strip().str.lower().where(raw.notna())

# Convert POSTED DATE to datetime. Unparseable values become NaT, so report
# how many were coerced instead of losing them silently.
raw_dates = df['POSTED DATE']
df['POSTED DATE'] = pd.to_datetime(raw_dates, errors='coerce')
unparsed = int((raw_dates.notna() & df['POSTED DATE'].isna()).sum())
if unparsed:
    print(f"Warning: {unparsed} POSTED DATE value(s) could not be parsed and were set to NaT")

# Export cleaned data to CSV (without the DataFrame index)
df.to_csv('cleaned_jobs.csv', index=False)

print("Data cleaning complete. File saved as 'cleaned_jobs.csv'")
