In [None]:
# Common cleaning steps (duplicates, nulls)

# 1. Drop exact duplicate rows
before = df.shape[0]
df = df.drop_duplicates()
after = df.shape[0]
print("Dropped duplicates:", before-after)

# 2. Basic fill / drop strategy
# If a field is essential (e.g., resume text), drop rows without it.
# Prefer an exact 'Resume' column; otherwise fall back to any column whose
# name contains "resume" (case-insensitive). str() keeps non-string labels
# (e.g. integer headers) from raising on .lower().
essential_cols = ['Resume'] if 'Resume' in df.columns else [c for c in df.columns if 'resume' in str(c).lower()]
print("Essential columns:", essential_cols)
df = df.dropna(subset=essential_cols)

# 3. Trim whitespace
# Only non-null cells are stringified and stripped. A blanket astype(str)
# would turn missing values into the literal text "nan", which later
# isna()/dropna()/fillna() calls can no longer recognise as missing.
for c in df.select_dtypes(include='object').columns:
    df[c] = df[c].where(df[c].isna(), df[c].astype(str).str.strip())


In [None]:
# Normalize column names: trim the ends, collapse each run of whitespace to a
# single "_", and lowercase. str() guards against non-string labels (e.g.
# integer headers), which have no .strip(). A single pass is enough: the
# names are already lowercase here, so no follow-up rename is required.
df.columns = [re.sub(r'\s+', '_', str(c).strip()).lower() for c in df.columns]
df.columns.tolist()


In [None]:
# Parse/clean resume text (lowercase, remove emails, urls, phone)
email_re = re.compile(r'\S+@\S+')
# Optional "+", optional "(", a digit, then 7+ digits/separators, ending in a
# digit. Parentheses and dots count as separators so that "(555) 123-4567" and
# "555.123.4567" are caught as well as "555-123-4567".
# NOTE(review): this heuristic also strips other long digit runs, e.g. a year
# range such as "2015 - 2018"; tighten it if such spans must be preserved.
phone_re = re.compile(r'(\+?\(?\d[\d\-\s\(\)\.]{7,}\d)')
# Case-insensitive because lowercasing happens only at the very end of
# clean_text; "://" is required so ordinary words like "httpd" are kept.
url_re = re.compile(r'https?://\S+|\bwww\.\S+', re.IGNORECASE)

def clean_text(text):
    """Return a lowercased, contact-free, whitespace-normalised copy of ``text``.

    Steps, in order: flatten CR/LF to spaces; blank out e-mail addresses,
    phone-like digit runs and URLs; replace every character other than ASCII
    letters, digits, whitespace and ``. , ( ) - +`` with a space; collapse
    runs of whitespace; strip and lowercase.

    Parameters
    ----------
    text : Any
        Raw resume text. Non-string values are converted with ``str()``.
        ``None`` and float NaN are treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``''`` for missing input.
    """
    # Missing values would otherwise be cleaned into the words "none" / "nan".
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)
    text = text.replace('\r',' ').replace('\n',' ')
    # Contact details are removed before the punctuation filter below, which
    # would otherwise break "@", ":" and "/" apart and defeat these patterns.
    text = email_re.sub(' ', text)
    text = phone_re.sub(' ', text)
    text = url_re.sub(' ', text)
    text = re.sub(r'[^A-Za-z0-9\.\,\s\(\)\-\+]', ' ', text)  # allow punctuation
    text = re.sub(r'\s{2,}',' ', text)
    return text.strip().lower()

# Pick the first column whose name contains "resume"; None when there is no such column.
resume_col = next((name for name in df.columns if 'resume' in name), None)
if resume_col is None:
    raise ValueError("Cannot find resume text column in dataset.")

# Store the cleaned text in a new column and preview the first rows.
df['resume_clean'] = df[resume_col].apply(clean_text)
df['resume_clean'].head()
