In [None]:
# @title $install \ \ docx2txt$
# Installing the docx2txt library for extracting text from .docx files, and redirecting output to null to suppress installation messages
!pip install docx2txt &> /dev/null

In [None]:
# @title $install \ \ pypdf$
# Installing the pypdf library to work with PDF files
!pip install pypdf &> /dev/null

In [None]:
# @title $install \ \ doc2docx$
# Installing the doc2docx library to convert .doc files to .docx format
!pip install doc2docx &> /dev/null

In [None]:
# @title $install \ \ unoconv$
# Installing unoconv, a tool to convert between different document formats, using apt-get
!sudo apt-get install unoconv &> /dev/null

In [None]:
# @title $install \ \ python-docx$
# Installing the python-docx library to create, edit, and manipulate .docx files
!pip install python-docx &> /dev/null

In [None]:
#@title $Mount \ \  Drive$
import os    # Importing the os module for interacting with the operating system (e.g., file and directory management)
from google.colab import drive    # Importing drive from google.colab to mount Google Drive for accessing files in Colab

# Attach Google Drive at /content/drive so its files are reachable from this Colab session
drive.mount('/content/drive')

# Report the current working directory as a sanity check on the file system location
print(f"Current working directory: {os.getcwd()}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Current working directory: /content


In [None]:
# @title $Path \ \ of \ \ folders \ \ in \ \ Google \ \ Drive$
# Path to the folder in Google Drive where the resumes are stored
# Update the path to point to your specific folder in Google Drive
dir_path = r"/content/drive/My Drive/ExcelR_Projects/Project3/Resumes/"

# Collect the category sub-folders of the resume directory.
# - Only directories are kept: every entry is later listed as a directory of resumes,
#   so a stray file at this level would raise NotADirectoryError.
# - The result is sorted because os.listdir returns entries in arbitrary order.
folders = sorted(
    entry
    for entry in os.listdir(dir_path)
    if os.path.isdir(os.path.join(dir_path, entry))
)

# Displaying the list of category folders
folders

['Peoplesoft Resumes', 'Workday Resumes', 'React Developers', 'SQL Developers']

In [None]:
# @title $Get \ \ Resumes \ \ data \ \ in \ \ Lists$
# @markdown $i) \ \ resume\_data$ \
# @markdown $ii) \ \ resume\_category$

#This code converts .doc files to .docx, extracts text from .docx and .pdf resumes,
#and organizes the text into lists with corresponding categories based on subdirectories.

import pandas as pd    # Importing the pandas library for data manipulation and analysis
import docx2txt    # Importing docx2txt to extract text from .docx files
import subprocess    # Importing subprocess to run external commands and processes from within Python scripts
from pypdf import PdfReader    # Importing PdfReader from the pypdf library to read PDF files
from glob import glob as glb    # This function is used for file pattern matching and retrieving file paths

# Function to convert .doc files to .docx format
def convert_to_docx(dir_path):
    """Convert every ``*.doc`` file directly inside ``dir_path`` to ``.docx``.

    Each file is converted by running the external ``unoconv`` tool. The original
    ``.doc`` is removed only after the conversion is confirmed (zero exit status
    and the sibling ``.docx`` exists); otherwise it is kept and a message is
    printed, so a failed conversion never loses the source document.

    Args:
        dir_path: Directory to scan (not recursive) for ``.doc`` files.

    Returns:
        None. Files are converted and removed in place.

    Raises:
        FileNotFoundError: Propagated from ``subprocess.run`` if the ``unoconv``
            executable is not available.
    """
    for doc_path in glb('{}/*.doc'.format(dir_path)):
        # This relies on unoconv writing the .docx next to its input, which is verified below.
        docx_path = os.path.splitext(doc_path)[0] + '.docx'

        # Convert .doc file to .docx using unoconv
        result = subprocess.run(["unoconv", "-f", "docx", doc_path])

        if result.returncode == 0 and os.path.exists(docx_path):
            os.remove(doc_path)     # Safe to remove the original .doc file now
        else:
            print(f"Conversion failed, keeping original file: {doc_path}")

# Lists to store resume data and their corresponding categories.
# Both lists are filled in lock-step: resume_data[i] is the text of a resume and
# resume_category[i] is the name of the folder it came from.
resume_data = []
resume_category = []

# Path to the main directory containing resume folders.
# Reuse dir_path (the directory `folders` was listed from) so that updating the
# path in one place cannot leave the two out of sync.
main_dir_path = dir_path

# Loop through each folder in the main directory
for folder in folders:
    sub_dir_path = os.path.join(main_dir_path, folder)
    print(sub_dir_path)    # Print the path of the current subdirectory

    convert_to_docx(sub_dir_path)    # Convert .doc files to .docx in the current subdirectory

    # List all files in the subdirectory after conversion; sorted because
    # os.listdir returns entries in arbitrary order.
    resumes = sorted(os.listdir(sub_dir_path))

    # Process each file in the subdirectory
    for resume in resumes:
        resume_path = os.path.join(sub_dir_path, resume)
        resume_lower = resume.lower()    # Match extensions case-insensitively (.PDF, .DOCX, ...)

        # If the file is a .docx
        if resume_lower.endswith('.docx'):
            resume_data.append(docx2txt.process(resume_path))    # Extract text from .docx file and append to resume_data
            resume_category.append(folder)    # Append folder name to resume_category

        # If the file is a .pdf
        elif resume_lower.endswith('.pdf'):
            pdf_reader = PdfReader(resume_path)    # Read the PDF file

            # Extract text from each page of the PDF; `or ""` guards against a
            # page that yields no text.
            pdf_text = [page.extract_text() or "" for page in pdf_reader.pages]

            # Store one string per resume (pages joined by newlines) so PDF
            # entries have the same type as the .docx entries.
            resume_data.append("\n".join(pdf_text))
            resume_category.append(folder)    # Append folder name to resume_category

        # Print any other file types encountered
        else:
            print(f"Unsupported file type: {resume}")

/content/drive/My Drive/ExcelR_Projects/Project3/Resumes/Peoplesoft Resumes
/content/drive/My Drive/ExcelR_Projects/Project3/Resumes/Workday Resumes
/content/drive/My Drive/ExcelR_Projects/Project3/Resumes/React Developers
/content/drive/My Drive/ExcelR_Projects/Project3/Resumes/SQL Developers


In [None]:
# @title $Create \ \ a \ \ dataframe \ \ containing \ \ \boldsymbol{resume\_data} \ \ and \ \ \boldsymbol{resume\_category}$
# Assemble the two parallel lists into a single pandas DataFrame:
#   'Resume'   <- resume_data      (one entry per resume)
#   'Category' <- resume_category  (the category associated with that resume)
data = pd.DataFrame(
    {
        'Resume': resume_data,
        'Category': resume_category,
    }
)

display(data)

Unnamed: 0,Resume,Category
0,C O N T A C T :\n\n\n\nAddress: Manyata Tech P...,Peoplesoft Resumes
1,Anubhav Kumar Singh\t\t\n\n\n\nCore Competenci...,Peoplesoft Resumes
2,Priyabrata Hota\n\n\n\n\n\nCAREER OBJECTIVE\t\...,Peoplesoft Resumes
3,Tanna Sujatha \n\n\n\n\n\n\n\nOBJECTIVE\n\nSee...,Peoplesoft Resumes
4,Hari Narayana \t\t \n\n\t\n\n\n\n...,Peoplesoft Resumes
...,...,...
74,Resume\n\nName : Neeraj Mishra\n\n\n\nExp...,SQL Developers
75,Aradhana Tripathi\n\n\n\nCurrent Location: Gac...,SQL Developers
76,SQL SERVER DEVELOPER\n\n\n\n\n\nPriyanka L ...,SQL Developers
77, Hyderabad\n\n\nNazeer Basha\n\nSQL and Power...,SQL Developers


In [None]:
# @title $No. of \ \ resumes \ \ in \ \ different \ \ Categories$
# Tally how many rows carry each distinct 'Category' value,
# i.e. how many resumes fall into each category
category_counts = data.loc[:, 'Category'].value_counts()

# Show the resulting distribution of categories
print(category_counts)

Category
React Developers      24
Workday Resumes       21
Peoplesoft Resumes    20
SQL Developers        14
Name: count, dtype: int64


In [None]:
# @title $Save \ \ the \ \ dataframe \ \ \boldsymbol{data} \ \ to \ \ a \ \ CSV \ \ file \ \ named \ \ \boldsymbol{resume\_data.csv}$
# Write the DataFrame to 'resume_data.csv' in the current working directory.
# 'index=False' ensures that the DataFrame index is not written to the CSV file.
# The call's result is deliberately not assigned: to_csv returns None when given a
# path, so binding it to `resume_data` would overwrite the list of resume texts.
data.to_csv('resume_data.csv', index=False)