In [11]:
import pandas as pd

In [15]:
# Reload the deduplicated resume CSV and report its columns, row count and
# the number of resumes per category.
Resume = pd.read_csv('resumes_filtered_deduplicated_utf8.csv')
print("\nSaved cleaned file:")
for label, value in (("Columns:", Resume.columns), ("Total rows:", len(Resume))):
    print(label, value)
print("Unique Categories and Their Counts:")
resume_category_counts = Resume['Category'].value_counts()
print(resume_category_counts)


Saved cleaned file:
Columns: Index(['Resume', 'Category'], dtype='object')
Total rows: 22386
Unique Categories and Their Counts:
Category
Software Developer        5668
Systems Administrator     3992
Web Developer             3422
Database Administrator    2766
Java Developer            2350
Network Administrator     2188
Data Scientist            2000
Name: count, dtype: int64


In [21]:
# Load the filtered job-description CSV and print its columns, row count and
# the number of postings per job title.
JD = pd.read_csv('filtered_job_descriptions.csv')
jd_columns = JD.columns
jd_row_count = len(JD)
print("\nSaved cleaned file:")
print("Columns:", jd_columns)
print("Total rows:", jd_row_count)
print("Unique Categories and Their Counts:")
jd_title_counts = JD['Job Title'].value_counts()
print(jd_title_counts)


Saved cleaned file:
Columns: Index(['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'location',
       'Country', 'latitude', 'longitude', 'Work Type', 'Company Size',
       'Job Posting Date', 'Preference', 'Contact Person', 'Contact',
       'Job Title', 'Role', 'Job Portal', 'Job Description', 'Benefits',
       'skills', 'Responsibilities', 'Company', 'Company Profile'],
      dtype='object')
Total rows: 97371
Unique Categories and Their Counts:
Job Title
Network Administrator     17581
Systems Administrator     17281
Java Developer            10589
Sales Manager             10417
HR Manager                10405
Web Developer             10361
Database Administrator    10280
Data Scientist             6940
Software Developer         3517
Name: count, dtype: int64


In [23]:
import pandas as pd
import re

In [25]:
# Read the filtered job postings into the working frame used by the cleaning cells
jd_source_csv = "filtered_job_descriptions.csv"
jd_df = pd.read_csv(jd_source_csv)

In [27]:
# ✅ 1. Drop unwanted categories
# "Sales Manager" and "HR Manager" do not appear among the resume categories
# printed earlier, so postings with those titles are removed.
excluded_titles = ["Sales Manager", "HR Manager"]
# .copy() makes jd_df an independent frame; without it the column assignments in
# the following cells write into a filtered slice and can raise
# SettingWithCopyWarning.
jd_df = jd_df[~jd_df["Job Title"].isin(excluded_titles)].copy()

In [29]:
# ✅ 2. Merge JD fields for rich input
# Free-text fields appended after the job title, in this order.
jd_text_fields = ["Job Description", "skills", "Responsibilities", "Qualifications", "Benefits"]

# Missing values become empty strings so the concatenation below cannot yield NaN
# because of one of these fields.
jd_df[jd_text_fields] = jd_df[jd_text_fields].fillna("")

# Start from the title and append each field separated by a single space.
jd_merged_text = jd_df["Job Title"]
for field_name in jd_text_fields:
    jd_merged_text = jd_merged_text + " " + jd_df[field_name]

jd_df["JD_Full"] = jd_merged_text.str.strip()

# ✅ 3. Clean JD text
def clean_text(text):
    """Normalise a value into lowercase text containing only letters, digits and spaces.

    Steps, in order: lowercase, remove ``<...>`` tag-like spans, delete every
    character that is not an ASCII letter, a digit or whitespace, then collapse
    whitespace runs to a single space and trim the ends.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.

    Returns:
        The cleaned string. ``None`` and float NaN return ``""`` instead of the
        literal words "none" / "nan".
    """
    # Missing values would otherwise be stringified and leak into the corpus.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = re.sub(r'<.*?>', '', text)  # non-greedy: each tag removed separately
    # Punctuation is deleted, not replaced by a space ("node.js" -> "nodejs").
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Clean every merged description element-wise
jd_df["JD_Full"] = jd_df["JD_Full"].map(clean_text)

In [31]:
# ✅ 4. Rename and keep only required columns
# Keys are the columns to keep; values are their new names.
jd_column_map = {"JD_Full": "Job_Description", "Job Title": "Category"}
jd_df = (
    jd_df[list(jd_column_map)]
    .rename(columns=jd_column_map)
    .reset_index(drop=True)  # renumber rows from 0, discarding the old index
)

In [33]:
# ✅ 5. Save cleaned JD file
# Written as UTF-8 without the index column.
jd_output_csv = "cleaned_jds_filtered.csv"
jd_df.to_csv(jd_output_csv, encoding="utf-8", index=False)
print("✅ Cleaned and saved to 'cleaned_jds_filtered.csv'")

✅ Cleaned and saved to 'cleaned_jds_filtered.csv'


In [35]:
# Read the saved JD file back and preview its first five rows
JD = pd.read_csv("cleaned_jds_filtered.csv")
JD.head(n=5)

Unnamed: 0,Job_Description,Category
0,web developer frontend web developers design a...,Web Developer
1,network administrator protect an organizations...,Network Administrator
2,systems administrator it support specialists p...,Systems Administrator
3,database administrator a database developer de...,Database Administrator
4,software developer mobile app developers desig...,Software Developer


In [51]:
# Summarise the reloaded JD frame: columns, row count and postings per category
print("\nSaved cleaned file:")
for label, value in (("Columns:", JD.columns), ("Total rows:", len(JD))):
    print(label, value)
print("Unique Categories and Their Counts:")
jd_category_counts = JD['Category'].value_counts()
print(jd_category_counts)


Saved cleaned file:
Columns: Index(['Job_Description', 'Category'], dtype='object')
Total rows: 76549
Unique Categories and Their Counts:
Category
Network Administrator     17581
Systems Administrator     17281
Java Developer            10589
Web Developer             10361
Database Administrator    10280
Data Scientist             6940
Software Developer         3517
Name: count, dtype: int64


In [37]:
# Read the deduplicated resumes into the working frame used by the cleaning cells
resume_source_csv = "resumes_filtered_deduplicated_utf8.csv"
resume_df = pd.read_csv(resume_source_csv)

In [39]:
# ✅ 1. Drop rows with missing or empty resumes
resume_text = resume_df["Resume"]
# Keep a row only if its resume is present and not blank after trimming.
has_resume_text = resume_text.notna() & (resume_text.str.strip() != "")
# .copy() makes resume_df an independent frame; without it, the column
# assignment in the cleaning cell writes into a filtered slice and can raise
# SettingWithCopyWarning whenever rows were actually dropped here.
resume_df = resume_df[has_resume_text].copy()

In [41]:
# ✅ 2. Clean resume text
# NOTE(review): clean_text is also defined in the JD-cleaning cell; this later
# definition replaces it. Consider defining the function once.
def clean_text(text):
    """Normalise a value into lowercase text containing only letters, digits and spaces.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.

    Returns:
        The cleaned string. ``None`` and float NaN return ``""`` instead of the
        literal words "none" / "nan".
    """
    # Missing values would otherwise be stringified and leak into the corpus.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    # Remove special characters (deleted, not replaced by a space)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse whitespace runs and trim
    return text

# Clean every resume element-wise
resume_df["Resume"] = resume_df["Resume"].map(clean_text)

In [43]:
# ✅ 3. Keep only the required columns (no renaming happens here)
resume_kept_columns = ["Resume", "Category"]
resume_df = resume_df.loc[:, resume_kept_columns].reset_index(drop=True)

In [45]:
# ✅ 4. Save cleaned resume file
# Written as UTF-8 without the index column.
resume_output_csv = "cleaned_resumes.csv"
resume_df.to_csv(resume_output_csv, encoding="utf-8", index=False)
print("✅ Cleaned and saved to 'cleaned_resumes.csv'")

✅ Cleaned and saved to 'cleaned_resumes.csv'


In [47]:
re = pd.read_csv("cleaned_resumes.csv")
re.head()

Unnamed: 0,Resume,Category
0,database administrator family private care llc...,Database Administrator
1,database administrator database administrator ...,Database Administrator
2,oracle database administrator oracle database ...,Database Administrator
3,amazon redshift administrator and etl develope...,Database Administrator
4,scrum master scrum master scrum master richmon...,Database Administrator


In [53]:
print("\nSaved cleaned file:")
print("Columns:", re.columns)
print("Total rows:", len(re))
print("Unique Categories and Their Counts:")
print(re['Category'].value_counts())


Saved cleaned file:
Columns: Index(['Resume', 'Category'], dtype='object')
Total rows: 22386
Unique Categories and Their Counts:
Category
Software Developer        5668
Systems Administrator     3992
Web Developer             3422
Database Administrator    2766
Java Developer            2350
Network Administrator     2188
Data Scientist            2000
Name: count, dtype: int64


In [55]:
print(re["Category"].value_counts())
print(JD["Category"].value_counts())

Category
Software Developer        5668
Systems Administrator     3992
Web Developer             3422
Database Administrator    2766
Java Developer            2350
Network Administrator     2188
Data Scientist            2000
Name: count, dtype: int64
Category
Network Administrator     17581
Systems Administrator     17281
Java Developer            10589
Web Developer             10361
Database Administrator    10280
Data Scientist             6940
Software Developer         3517
Name: count, dtype: int64
