In [19]:
import os
import csv
import pandas as pd
import re

In [20]:
def txt_files_to_csv(input_folder, output_path):
    """Collect every ``.txt`` file in a folder into a two-column CSV.

    The CSV starts with a ``Category,Text`` header row, followed by one row
    per text file: the file's name in the first column and its full contents
    in the second. Files that cannot be read are reported on stdout and
    skipped, so one bad file does not abort the whole export.

    Args:
        input_folder: Directory scanned (non-recursively) for names ending
            in ``.txt``.
        output_path: Destination CSV path. Missing parent directories are
            created; an existing file is overwritten.
    """
    # os.path.dirname() returns '' for a bare filename and os.makedirs('')
    # raises FileNotFoundError, so only create the parent when there is one.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Category', 'Text'])

        # os.listdir() returns entries in arbitrary order; sorting makes the
        # row order (and therefore the output file) reproducible.
        for filename in sorted(os.listdir(input_folder)):
            if not filename.endswith('.txt'):
                continue

            file_path = os.path.join(input_folder, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as txtfile:
                    content = txtfile.read()
                writer.writerow([filename, content])
            except Exception as e:
                # Deliberate best effort: report the failure and move on.
                print(f"Error processing {filename}: {str(e)}")

    print(f"CSV file created successfully at {output_path}")

In [21]:
# Gather the raw GitHub resume text files into a single CSV
# (paths are relative to the notebook's working directory).
input_folder = os.path.join('..', 'data', 'raw', 'github_resumes')
output_path = os.path.join('..', 'data', 'processed', 'github_dataset.csv')
txt_files_to_csv(input_folder=input_folder, output_path=output_path)

CSV file created successfully at ..\data\processed\github_dataset.csv


In [22]:
# Load the CSV produced by txt_files_to_csv. The path is relative, like the
# one used when the file was written, so the notebook is not tied to one
# machine's drive layout.
# NOTE(review): assumes the working directory is a direct child of the project root — confirm.
df1 = pd.read_csv(os.path.join('..', 'data', 'processed', 'github_dataset.csv'))
df1.head()

Unnamed: 0,Category,Text
0,artificialintelligenceresume264.txt,"Role: AI Engineer\nSkills: Deep Learning, NLP,..."
1,artificialintelligenceresume265.txt,Abhishek kumar\nartificial intelligence and ma...
2,azranmelodymodlingreadme.txt,"Building OpenCV from Source, using CMake and C..."
3,azranmelody_modling_readme.txt,"Building OpenCV from Source, using CMake and C..."
4,bassimeledathtest-llm-structured-outputsenior-...,Terrence Coleman\ntcoleman@email.com (123) 456...


In [23]:
def categorize_filename(filename):
    """Derive a job-category label from a resume file name.

    The name is lower-cased (after ``str()`` conversion) and checked against
    known prefixes first, then against two "data scientist" substrings.
    Anything unmatched is labelled ``'Other'``.

    Args:
        filename: File name to classify; any value accepted by ``str()``.

    Returns:
        One of ``'AI Engineer'``, ``'Data Analyst'``, ``'Data Scientist'``,
        ``'ML Engineer'`` or ``'Other'``.
    """
    name = str(filename).lower()

    # Checked in order; the first matching prefix wins.
    prefix_to_label = (
        ('artificialintelligence', 'AI Engineer'),
        ('dataanalyst', 'Data Analyst'),
        ('datascientist', 'Data Scientist'),
        ('machinelearning', 'ML Engineer'),
    )
    for prefix, label in prefix_to_label:
        if name.startswith(prefix):
            return label

    # Fallback: "data scientist" markers appearing anywhere in the name.
    data_scientist_markers = ('-data-scientist-resume-', 'data_scientist')
    if any(marker in name for marker in data_scientist_markers):
        return 'Data Scientist'

    return 'Other'

In [24]:
# Replace each file name in 'Category' with the label derived from it.
# Note: this overwrites the file names, so run it once per fresh load of df1.
df1['Category'] = df1['Category'].map(categorize_filename)

In [25]:
# Distinct labels now present in the 'Category' column.
categories = df1.loc[:, 'Category'].unique()
print(categories)

['AI Engineer' 'Other' 'Data Scientist' 'Data Analyst' 'ML Engineer']


In [26]:
# Number of rows per label, most frequent first.
count_of_category = df1.loc[:, 'Category'].value_counts()
print(count_of_category)

Category
Data Scientist    149
ML Engineer       112
Other              16
Data Analyst       14
AI Engineer         2
Name: count, dtype: int64


In [10]:
# Save the categorised data next to the other processed files. A relative
# path (as used for the raw export) keeps the notebook portable.
# NOTE(review): assumes the working directory is a direct child of the project root — confirm.
df1.to_csv(os.path.join('..', 'data', 'processed', 'github_categorized_dataset.csv'), index=False)

In [12]:
# Reload the categorised CSV and set its column names explicitly.
# The path is relative so the notebook is not tied to one machine.
# NOTE(review): assumes the working directory is a direct child of the project root — confirm.
header = ["Category", "Text"]
df2 = pd.read_csv(os.path.join('..', 'data', 'processed', 'github_categorized_dataset.csv'))
df2.columns = header

In [16]:
# Preview the first five rows of the reloaded frame.
df2.head(n=5)

Unnamed: 0,Category,Text
0,AI Engineer,"Role: AI Engineer\nSkills: Deep Learning, NLP,..."
2,Other,"Building OpenCV from Source, using CMake and C..."
3,Other,"Building OpenCV from Source, using CMake and C..."
4,Data Scientist,Terrence Coleman\ntcoleman@email.com (123) 456...
5,Data Scientist,Terrence Coleman\ntcoleman@email.com (123) 456...


In [17]:
# Preview the last five rows of the reloaded frame.
df2.tail(n=5)

Unnamed: 0,Category,Text
288,Other,"\documentclass[letterpaper,10pt]{article}\n\us..."
289,Other,SUMMARY Around 5 years of professional experie...
290,Other,SUMMARY 8+ years of experience in Data Science...
291,Other,SUMMARY Around 5 years of professional experie...
292,Other,SUMMARY 8+ years of experience in Data Science...


In [18]:
# Distinct labels in the reloaded frame, then the row count per label.
categories1 = df2.loc[:, 'Category'].unique()
print(categories1)

values_in_category = df2.loc[:, 'Category'].value_counts()
print(values_in_category)

['AI Engineer' 'Other' 'Data Scientist' 'Data Analyst' 'ML Engineer']
Category
Data Scientist    149
ML Engineer       112
Other              16
Data Analyst       14
AI Engineer         1
Name: count, dtype: int64


In [None]:
# Count missing values per column (isna is the canonical name for isnull).
df2.isna().sum()

Category    0
Text        0
dtype: int64

In [27]:
# Load the categorised resume file. The path is relative so the notebook is
# not tied to one machine's drive layout.
# NOTE(review): assumes the working directory is a direct child of the project root — confirm.
dff = pd.read_csv(os.path.join('..', 'data', 'processed', 'github_categorized.csv'))

In [28]:
# Preview the first five rows as loaded.
dff.head(n=5)

Unnamed: 0,Category,Text
0,AI Engineer,Abhishek kumar\nartificial intelligence and ma...
1,Category,Text
2,Data Analyst,DATA ANALYST F/H\n\nArailym PERNEBAY\n\nperneb...
3,Data Analyst,# CV 4: Cybersecurity Analyst\n**Name**: Emily...
4,Data Analyst,Name: Ravi Kumar\nContact: ravi.k@email.com | ...


In [29]:
# Preview the last five rows as loaded.
dff.tail(n=5)

Unnamed: 0,Category,Text
125,ML Engineer,Objective:\nDedicated and driven legal profess...
126,ML Engineer,Objective:\nCompassionate and dedicated health...
127,ML Engineer,Objective:\nDedicated and results-oriented Ban...
128,Data Scientist,SUMMARY Around 5 years of professional experie...
129,Data Scientist,SUMMARY 8+ years of experience in Data Science...


In [32]:
# Distinct labels in the loaded file; anything unexpected shows up here.
dff.loc[:, 'Category'].unique()

array(['AI Engineer', 'Category', 'Data Analyst', 'Data Scientist',
       'ML Engineer'], dtype=object)

In [33]:
# Remove stray header rows that were read in as data ('Category', 'Text').
# Filtering on content instead of a fixed index label makes the cell safe to
# re-run and prevents it from dropping a real row if the file's row order
# changes. The original index labels are kept.
dff = dff[~((dff['Category'] == 'Category') & (dff['Text'] == 'Text'))]

In [34]:
# Preview the first five rows after removing the stray header row.
dff.head(n=5)

Unnamed: 0,Category,Text
0,AI Engineer,Abhishek kumar\nartificial intelligence and ma...
2,Data Analyst,DATA ANALYST F/H\n\nArailym PERNEBAY\n\nperneb...
3,Data Analyst,# CV 4: Cybersecurity Analyst\n**Name**: Emily...
4,Data Analyst,Name: Ravi Kumar\nContact: ravi.k@email.com | ...
5,Data Analyst,Title: Top Analyst Reports for Bank of America...


In [35]:
# Write the cleaned frame back to the file it was loaded from.
# index=False keeps the row index out of the file; otherwise the next
# read_csv of this file would pick up an extra unnamed column.
# The path is relative so the notebook is not tied to one machine.
# NOTE(review): assumes the working directory is a direct child of the project root — confirm.
dff.to_csv(os.path.join('..', 'data', 'processed', 'github_categorized.csv'), index=False)