In [1]:
import os
import csv
import pandas as pd

In [5]:
def txt_files_to_csv(input_folder, output_path):

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)

        writer.writerow(['filename', 'content'])
        
        for filename in os.listdir(input_folder):
            if filename.endswith('.txt'):
                file_path = os.path.join(input_folder, filename)
                try:
                    with open(file_path, 'r', encoding='utf-8') as txtfile:
                        content = txtfile.read()
                        writer.writerow([filename, content])
                except Exception as e:
                    print(f"Error processing {filename}: {str(e)}")
    
    print(f"CSV file created successfully at {output_path}")

In [6]:
input_folder = os.path.join('..','data', 'raw', 'synthetic_data')
output_path = os.path.join('..', 'data', 'processed', 'synthetic_dataset.csv')
txt_files_to_csv(input_folder, output_path)

CSV file created successfully at ..\data\processed\synthetic_dataset.csv


Changing Column Names

In [2]:
df = pd.read_csv(r'D:\Resume Classification System\data\processed\synthetic_dataset.csv')

In [8]:
df.head()

Unnamed: 0,filename,content
0,AI_Engineer_Resume_1.txt,Name: Eleanor Vance\n\nSummary:\nHighly motiva...
1,AI_Engineer_Resume_10.txt,JOHN SMITH\n(123) 456-7890 | john.smith@email....
2,AI_Engineer_Resume_100.txt,[Name] Anya Sharma\n[Phone] (555) 123-4567\n[E...
3,AI_Engineer_Resume_11.txt,Jane Doe\n(123) 456-7890 | jane.doe@email.com ...
4,AI_Engineer_Resume_12.txt,Jane Doe\n(123) 456-7890 | jane.doe@email.com ...


In [9]:
df.tail()

Unnamed: 0,filename,content
512,resume_75.txt,JANE DOE\n(123) 456-7890 | jane.doe@email.com ...
513,resume_76.txt,JOHN DOE\n(555) 123-4567 | john.doe@email.com ...
514,resume_77.txt,JOHN DOE\n(123) 456-7890 | john.doe@email.com ...
515,resume_8.txt,NAME: Jane Doe\nPHONE: (555) 123-4567\nEMAIL: ...
516,resume_9.txt,JANE DOE\n(123) 456-7890 | jane.doe@email.com ...


In [10]:
header = ['Category','Text']
df.columns = header

In [11]:
df.head()

Unnamed: 0,Category,Text
0,AI_Engineer_Resume_1.txt,Name: Eleanor Vance\n\nSummary:\nHighly motiva...
1,AI_Engineer_Resume_10.txt,JOHN SMITH\n(123) 456-7890 | john.smith@email....
2,AI_Engineer_Resume_100.txt,[Name] Anya Sharma\n[Phone] (555) 123-4567\n[E...
3,AI_Engineer_Resume_11.txt,Jane Doe\n(123) 456-7890 | jane.doe@email.com ...
4,AI_Engineer_Resume_12.txt,Jane Doe\n(123) 456-7890 | jane.doe@email.com ...


In [12]:
df.to_csv(r'D:\Resume Classification System\data\processed\synthetic_dataset.csv')

In [18]:
category_mapping = {
    'AI_Engineer_Resume_': 'AI Engineer',
    'Computer_Vision_Engineer_Resume_': 'Computer Vision Engineer',
    'Data_Analyst_Resume_': 'Data Analyst',
    'resume_': 'Data Analyst',
    'Data_Engineer_Resume_': 'Data Engineer',
    'NLP_Engineer_Resume_': 'NLP Engineer'
}

In [19]:
def clean_category(text):
    for pattern, category in category_mapping.items():
        if str(text).startswith(pattern):
            return category
    return text

In [20]:
df['Category'] = df['Category'].apply(clean_category)

In [21]:
df.head()

Unnamed: 0,Category,Text
0,AI Engineer,Name: Eleanor Vance\n\nSummary:\nHighly motiva...
1,AI Engineer,JOHN SMITH\n(123) 456-7890 | john.smith@email....
2,AI Engineer,[Name] Anya Sharma\n[Phone] (555) 123-4567\n[E...
3,AI Engineer,Jane Doe\n(123) 456-7890 | jane.doe@email.com ...
4,AI Engineer,Jane Doe\n(123) 456-7890 | jane.doe@email.com ...


In [22]:
categories = df['Category'].unique()
print(categories)

['AI Engineer' 'Computer Vision Engineer' 'Data Analyst' 'Data Engineer'
 'NLP Engineer']


In [23]:
df.to_csv(r'D:\Resume Classification System\data\processed\synthetic_dataset.csv')

In [3]:
df.isnull().sum()

Unnamed: 0    0
Category      0
Text          0
dtype: int64