In [11]:
import pandas as pd

# Load updated datasets
df1 = pd.read_csv("updated_filtered1_resumes.csv")
df2 = pd.read_csv("updated_filtered2_resumes.csv")
df3 = pd.read_csv("updated_filtered3_resumes.csv")
df4 = pd.read_csv("updated_filtered4_resumes.csv")

# Optional: print their shapes or a few rows to confirm
print("filtered1:", df1.shape)
print("filtered2:", df2.shape)
print("filtered3:", df3.shape)
print("filtered4:", df4.shape)

filtered1: (20938, 3)
filtered2: (211, 2)
filtered3: (226, 4)
filtered4: (266, 2)


In [3]:
df1.head()

Unnamed: 0,File_ID,Resume_Text,Normalized_Label
0,1,Database Administrator Database Administrator ...,Database Administrator
1,2,Database Administrator Database Administrator ...,Database Administrator
2,3,Oracle Database Administrator Oracle Database ...,Database Administrator
3,4,Amazon Redshift Administrator and ETL Develope...,Database Administrator
4,5,Scrum Master Scrum Master Scrum Master Richmon...,Database Administrator


In [5]:
df2.head()

Unnamed: 0,Category,Resume
0,Web Developer,"As a seasoned Frontend Developer, I have a pro..."
1,Software Developer,With a solid background in Backend Development...
2,Data Scientist,"With a background in Data Science, I possess a..."
3,Web Developer,Experienced Frontend Developer with a passion ...
4,Web Developer,Passionate Frontend Developer with over 4 year...


In [7]:
df3.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR Manager
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR Manager
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR Manager
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR Manager
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR Manager


In [13]:
df4.head()

Unnamed: 0,Category,Resume
0,Data Scientist,Skills * Programming Languages: Python (pandas...
1,Data Scientist,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Scientist,"Areas of Interest Deep Learning, Control Syste..."
3,Data Scientist,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Scientist,"Education Details \r\n MCA YMCAUST, Faridab..."


In [15]:
# Rename columns
df1 = df1.rename(columns={
    "Normalized_Label": "Category",
    "Resume_Text": "Resume",
    "File_ID": "ID"
})

# Optional: check the updated column names
print(df1.columns)

# (Optional) Save to a new CSV file
df1.to_csv("final_filtered1_resumes.csv", index=False)


Index(['ID', 'Resume', 'Category'], dtype='object')


In [17]:
df5 = pd.read_csv("final_filtered1_resumes.csv")
df5.head()

Unnamed: 0,ID,Resume,Category
0,1,Database Administrator Database Administrator ...,Database Administrator
1,2,Database Administrator Database Administrator ...,Database Administrator
2,3,Oracle Database Administrator Oracle Database ...,Database Administrator
3,4,Amazon Redshift Administrator and ETL Develope...,Database Administrator
4,5,Scrum Master Scrum Master Scrum Master Richmon...,Database Administrator


In [21]:
import pandas as pd

# Load the dataset
df6 = pd.read_csv("updated_filtered2_resumes.csv")

# Reorder columns: Resume first, then Category
df6 = df6[["Resume", "Category"]]

# (Optional) Save the updated file
df6.to_csv("final_filtered2_resumes.csv", index=False)

# Confirm column order
print(df6.columns)


Index(['Resume', 'Category'], dtype='object')


In [23]:
# Drop unwanted columns
df3 = df3.drop(columns=["ID", "Resume_html"])

# Rename Resume_str to Resume
df3 = df3.rename(columns={"Resume_str": "Resume"})

# (Optional) Save the updated DataFrame
df3.to_csv("final_filtered3_resumes.csv", index=False)

# Confirm changes
print(df3.columns)


Index(['Resume', 'Category'], dtype='object')


In [25]:
import pandas as pd
import re

# Load the dataset
df4 = pd.read_csv("updated_filtered4_resumes.csv")

# Reorder columns
df4 = df4[["Resume", "Category"]]

# Function to clean resume text
def clean_text(text):
    if pd.isnull(text):
        return ""
    # Remove non-printable characters, symbols, extra spaces
    text = re.sub(r'[^a-zA-Z0-9\s.,!?;:\'\"()\[\]\-\/&]', ' ', text)  # keep common punctuation
    text = re.sub(r'\s+', ' ', text)  # replace multiple spaces/newlines with single space
    return text.strip()

# Apply cleaning to Resume column
df4["Resume"] = df4["Resume"].apply(clean_text)

# (Optional) Save cleaned version
df4.to_csv("final_filtered4_resumes.csv", index=False)

# Preview cleaned data
print(df4.head())


                                              Resume        Category
0  Skills Programming Languages: Python (pandas, ...  Data Scientist
1  Education Details May 2013 to May 2017 B.E UIT...  Data Scientist
2  Areas of Interest Deep Learning, Control Syste...  Data Scientist
3  Skills R Python SAP HANA Tableau SAP HANA SQL ...  Data Scientist
4  Education Details MCA YMCAUST, Faridabad, Hary...  Data Scientist


In [27]:
df4.head()

Unnamed: 0,Resume,Category
0,"Skills Programming Languages: Python (pandas, ...",Data Scientist
1,Education Details May 2013 to May 2017 B.E UIT...,Data Scientist
2,"Areas of Interest Deep Learning, Control Syste...",Data Scientist
3,Skills R Python SAP HANA Tableau SAP HANA SQL ...,Data Scientist
4,"Education Details MCA YMCAUST, Faridabad, Hary...",Data Scientist


In [1]:
import pandas as pd

# Load all datasets
df1 = pd.read_csv("final_filtered1_resumes.csv")  # already has ID
df2 = pd.read_csv("final_filtered2_resumes.csv")
df3 = pd.read_csv("final_filtered3_resumes.csv")
df4 = pd.read_csv("final_filtered4_resumes.csv")

# Add ID to datasets that don’t have it, continuing from the last ID in df1
start_id = df1["ID"].max() + 1 if "ID" in df1.columns else 0

df2["ID"] = range(start_id, start_id + len(df2))
start_id += len(df2)

df3["ID"] = range(start_id, start_id + len(df3))
start_id += len(df3)

df4["ID"] = range(start_id, start_id + len(df4))

# Reorder columns to ID, Resume, Category
df1 = df1[["ID", "Resume", "Category"]]
df2 = df2[["ID", "Resume", "Category"]]
df3 = df3[["ID", "Resume", "Category"]]
df4 = df4[["ID", "Resume", "Category"]]

# Merge all into one DataFrame
merged_df = pd.concat([df1, df2, df3, df4], ignore_index=True)

# Save to final CSV
merged_df.to_csv("all_resumes_merged.csv", index=False)

# Show a preview
print(merged_df.head())
print(f"\nTotal records: {len(merged_df)}")


   ID                                             Resume  \
0   1  Database Administrator Database Administrator ...   
1   2  Database Administrator Database Administrator ...   
2   3  Oracle Database Administrator Oracle Database ...   
3   4  Amazon Redshift Administrator and ETL Develope...   
4   5  Scrum Master Scrum Master Scrum Master Richmon...   

                 Category  
0  Database Administrator  
1  Database Administrator  
2  Database Administrator  
3  Database Administrator  
4  Database Administrator  

Total records: 21641


In [3]:
import pandas as pd

# Load the merged dataset
df = pd.read_csv("all_resumes_merged.csv")

# Print unique categories with their counts
category_counts = df["Category"].value_counts()
print(category_counts)


Category
Software Developer        5885
Systems Administrator     4182
Web Developer             3567
Database Administrator    2817
Java Developer            2502
Network Administrator     2285
Sales Manager              156
HR Manager                 154
Data Scientist              93
Name: count, dtype: int64


In [5]:
df.head()

Unnamed: 0,ID,Resume,Category
0,1,Database Administrator Database Administrator ...,Database Administrator
1,2,Database Administrator Database Administrator ...,Database Administrator
2,3,Oracle Database Administrator Oracle Database ...,Database Administrator
3,4,Amazon Redshift Administrator and ETL Develope...,Database Administrator
4,5,Scrum Master Scrum Master Scrum Master Richmon...,Database Administrator


In [1]:
import pandas as pd

In [45]:
df = pd.read_csv("all_resumes_merged.csv")

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa0 in position 110073: invalid start byte

In [5]:
df.head()

Unnamed: 0,ID,Resume,Category
0,1,Database Administrator Database Administrator ...,Database Administrator
1,2,Database Administrator Database Administrator ...,Database Administrator
2,3,Oracle Database Administrator Oracle Database ...,Database Administrator
3,4,Amazon Redshift Administrator and ETL Develope...,Database Administrator
4,5,Scrum Master Scrum Master Scrum Master Richmon...,Database Administrator


In [19]:
book = pd.read_csv("Book2.csv", encoding='ISO-8859-1')
book.head()

Unnamed: 0,Resume,Category
0,"Data Scientist  EduPredict Innovations, Londo...",
1,"Data Scientist  AutoManufacture Labs, Stuttga...",
2,"Data Scientist  ShopTrend AI, San Francisco, ...",
3,"Data Scientist  ManuTech Analytics, Detroit, ...",
4,"Data Scientist  GreenManufacture AI, Toronto,...",


In [21]:
# Step 2: Fill 'Category' column with "Data Scientist"
book['Category'] = 'Data Scientist'

# Step 3: Clean 'Resume' column
book['Resume'] = book['Resume'].str.replace(r'[^\x00-\x7F]+', ' ', regex=True)  # Remove non-ASCII
book['Resume'] = book['Resume'].str.strip()  # Remove leading/trailing whitespace

# Step 4: Save to a new UTF-8 encoded CSV file
book.to_csv("Cleaned_Book2.csv", index=False, encoding='utf-8')


In [23]:
Cleaned_Book2 = pd.read_csv("Cleaned_Book2.csv")
Cleaned_Book2.head()

Unnamed: 0,Resume,Category
0,"Data Scientist EduPredict Innovations, Londo...",Data Scientist
1,"Data Scientist AutoManufacture Labs, Stuttga...",Data Scientist
2,"Data Scientist ShopTrend AI, San Francisco, ...",Data Scientist
3,"Data Scientist ManuTech Analytics, Detroit, ...",Data Scientist
4,"Data Scientist GreenManufacture AI, Toronto,...",Data Scientist


In [17]:
Dataset = pd.read_csv("Dataset_TalentMatch(DS).csv", encoding='ISO-8859-1')
Dataset.head()

Unnamed: 0,Resume,Category
0,Data Scientist  ClimateRisk AI\nProfessional ...,
1,Data Scientist  SaharaCrop AI\nProfessional S...,
2,Data Scientist  OrbitGuard AI\nProfessional S...,
3,Data Scientist  EverestFlow AI\nProfessional ...,
4,Data Scientist  ReefGuard AI\nProfessional Su...,


In [25]:
# Step 1: Fill 'Category' column with "Data Scientist"
Dataset['Category'] = 'Data Scientist'

# Step 2: Clean 'Resume' column
Dataset['Resume'] = Dataset['Resume'].str.replace(r'[^\x00-\x7F]+', ' ', regex=True)  # Remove non-ASCII characters
Dataset['Resume'] = Dataset['Resume'].str.strip()  # Remove leading/trailing spaces

# Step 3: Save to a new file with UTF-8 encoding
Dataset.to_csv("Cleaned_Dataset_TalentMatch.csv", index=False, encoding='utf-8')


In [27]:
TalentMatch = pd.read_csv("Cleaned_Dataset_TalentMatch.csv")
TalentMatch.head()

Unnamed: 0,Resume,Category
0,Data Scientist ClimateRisk AI\nProfessional ...,Data Scientist
1,Data Scientist SaharaCrop AI\nProfessional S...,Data Scientist
2,Data Scientist OrbitGuard AI\nProfessional S...,Data Scientist
3,Data Scientist EverestFlow AI\nProfessional ...,Data Scientist
4,Data Scientist ReefGuard AI\nProfessional Su...,Data Scientist


In [29]:
cleaned = pd.read_csv("updated_data_final_cleaned.csv")
cleaned.head()

Unnamed: 0,instruction,input,Resume_test
0,Generate a Resume for a Accountant Job,,ACCOUNTANT Professional Summary Results orient...
1,Generate a Resume for a Accountant Job,,STAFF ACCOUNTANT Summary Flexible Accountant w...
2,Generate a Resume for a Accountant Job,,STAFF ACCOUNTANT Summary Highly analytical and...
3,Generate a Resume for a Accountant Job,,SENIOR ACCOUNTANT Summary A highly competent m...
4,Generate a Resume for a Accountant Job,,SENIOR ACCOUNTANT Summary 11 years experience ...


In [41]:
import pandas as pd

# Read with low_memory=False to suppress DtypeWarning
df_main = pd.read_csv("all_resumes_merged.csv", encoding='ISO-8859-1', low_memory=False)

# Convert 'ID' column to numeric, coercing errors (non-numeric values become NaN)
df_main['ID'] = pd.to_numeric(df_main['ID'], errors='coerce')

# Drop NaNs if any, or fill with 0 (depending on what you prefer)
df_main['ID'] = df_main['ID'].fillna(0).astype(int)

# Now safely get max ID
start_id = df_main['ID'].max() + 1


In [43]:
import pandas as pd

# Step 1: Load your main file
df_main = pd.read_csv("all_resumes_merged.csv", encoding='ISO-8859-1')

# Step 2: Load the two new cleaned files
df_book2 = pd.read_csv("Cleaned_Book2.csv")
df_talent = pd.read_csv("Cleaned_Dataset_TalentMatch.csv")

# Step 3: Concatenate both new files
df_new = pd.concat([df_book2, df_talent], ignore_index=True)

# Step 4: Generate new IDs continuing from the last ID in the main file
start_id = df_main['ID'].max() + 1
df_new.insert(0, 'ID', range(start_id, start_id + len(df_new)))

# Step 5: Append new data to the main file
df_combined = pd.concat([df_main, df_new], ignore_index=True)

# Step 6: Save the final merged dataset
df_combined.to_csv("all_resumes_merged_updated.csv", index=False)

  df_main = pd.read_csv("all_resumes_merged.csv", encoding='ISO-8859-1')


TypeError: '>=' not supported between instances of 'int' and 'str'