In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import os

# Path to the dataset directory
data_directory = '/content/drive/MyDrive/dataset'

# List the directories and a few files in each directory
for root, dirs, files in os.walk(data_directory, top=1):
    print("Root directory:", root)
    print("Subdirectories:", dirs)
    print("Files:", files[:5])  # print first 5 files to avoid too much output


TypeError: walk() got multiple values for argument 'top'

In [3]:
import os

# Path to the dataset directory
data_directory = '/content/drive/MyDrive/dataset'

# List the directories and a few files in each directory
for root, dirs, files in os.walk(data_directory):
    print("Root directory:", root)
    print("Subdirectories:", dirs)
    print("Files:", files[:5])  # print first 5 files to avoid too much output
    if root != data_directory:
        # To stop os.walk from going into subdirectories of the main directory
        dirs[:] = []  # Clear the dirs list


Root directory: /content/drive/MyDrive/dataset
Subdirectories: ['data', 'Resume']
Files: []
Root directory: /content/drive/MyDrive/dataset/data
Subdirectories: ['data']
Files: []
Root directory: /content/drive/MyDrive/dataset/Resume
Subdirectories: []
Files: ['Resume.csv']


In [4]:
import pandas as pd

# Path to the Resume.csv file
csv_file_path = '/content/drive/MyDrive/dataset/Resume/Resume.csv'

# Load the CSV file into a DataFrame
resume_df = pd.read_csv(csv_file_path)

# Display the first few rows of the dataframe
print(resume_df.head())


         ID                                         Resume_str  \
0  16852973           HR ADMINISTRATOR/MARKETING ASSOCIATE\...   
1  22323967           HR SPECIALIST, US HR OPERATIONS      ...   
2  33176873           HR DIRECTOR       Summary      Over 2...   
3  27018550           HR SPECIALIST       Summary    Dedica...   
4  17812897           HR MANAGER         Skill Highlights  ...   

                                         Resume_html Category  
0  <div class="fontsize fontface vmargins hmargin...       HR  
1  <div class="fontsize fontface vmargins hmargin...       HR  
2  <div class="fontsize fontface vmargins hmargin...       HR  
3  <div class="fontsize fontface vmargins hmargin...       HR  
4  <div class="fontsize fontface vmargins hmargin...       HR  


In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

# Function to preprocess text
def preprocess_text(text):
    # Tokenize and convert to lower case
    tokens = word_tokenize(text.lower())
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words and word.isalnum()]
    return " ".join(filtered_tokens)

# Apply preprocessing to each resume text (assuming 'Resume_Text' is the column containing the text)
resume_df['cleaned_text'] = resume_df['Resume_Text'].apply(preprocess_text)

# Display the cleaned text
print(resume_df['cleaned_text'].head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


KeyError: 'Resume_Text'

In [6]:
# Load the CSV file into a DataFrame
resume_df = pd.read_csv(csv_file_path)

# Display the column names
print("Column names in the CSV file:", resume_df.columns.tolist())


Column names in the CSV file: ['ID', 'Resume_str', 'Resume_html', 'Category']


In [7]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

# Path to the Resume.csv file
csv_file_path = '/content/drive/MyDrive/dataset/Resume/Resume.csv'

# Load the CSV file into a DataFrame
resume_df = pd.read_csv(csv_file_path)

# Function to preprocess text
def preprocess_text(text):
    # Tokenize and convert to lower case
    tokens = word_tokenize(text.lower())
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words and word.isalnum()]
    return " ".join(filtered_tokens)

# Apply preprocessing to each resume text
resume_df['cleaned_text'] = resume_df['Resume_str'].apply(preprocess_text)

# Display the cleaned text
print(resume_df['cleaned_text'].head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    hr associate hr administrator summary dedicate...
1    hr specialist us hr operations summary versati...
2    hr director summary 20 years experience recrui...
3    hr specialist summary dedicated driven dynamic...
4    hr manager skill highlights hr skills hr depar...
Name: cleaned_text, dtype: object


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Extracting features using TF-IDF
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(resume_df['cleaned_text'])
y = resume_df['Category']  # Use the 'Category' column for labels

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a RandomForest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))


                        precision    recall  f1-score   support

            ACCOUNTANT       0.81      0.95      0.88        37
              ADVOCATE       0.86      0.84      0.85        37
           AGRICULTURE       0.25      0.07      0.11        15
               APPAREL       0.33      0.14      0.20        29
                  ARTS       0.31      0.17      0.22        29
            AUTOMOBILE       0.00      0.00      0.00        11
              AVIATION       0.82      0.91      0.86        35
               BANKING       0.62      0.69      0.66        29
                   BPO       0.00      0.00      0.00         4
  BUSINESS-DEVELOPMENT       0.68      0.60      0.64        35
                  CHEF       0.85      0.82      0.84        40
          CONSTRUCTION       0.91      0.91      0.91        43
            CONSULTANT       0.76      0.48      0.59        33
              DESIGNER       0.88      0.85      0.86        33
         DIGITAL-MEDIA       0.70      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
print(classification_report(y_test, predictions, zero_division=0))  # Handling undefined metric warning by setting zero_division to 0


                        precision    recall  f1-score   support

            ACCOUNTANT       0.81      0.95      0.88        37
              ADVOCATE       0.86      0.84      0.85        37
           AGRICULTURE       0.25      0.07      0.11        15
               APPAREL       0.33      0.14      0.20        29
                  ARTS       0.31      0.17      0.22        29
            AUTOMOBILE       0.00      0.00      0.00        11
              AVIATION       0.82      0.91      0.86        35
               BANKING       0.62      0.69      0.66        29
                   BPO       0.00      0.00      0.00         4
  BUSINESS-DEVELOPMENT       0.68      0.60      0.64        35
                  CHEF       0.85      0.82      0.84        40
          CONSTRUCTION       0.91      0.91      0.91        43
            CONSULTANT       0.76      0.48      0.59        33
              DESIGNER       0.88      0.85      0.86        33
         DIGITAL-MEDIA       0.70      

In [10]:
import pandas as pd

# Assuming `predictions` is your model's output on X_test and `X_test` was created from `resume_df` with a train_test_split
# Retrieve the original indices of your test set resumes
test_indices = X_test.index

# Create a DataFrame with IDs and their predicted categories
results_df = pd.DataFrame({
    'ID': resume_df.loc[test_indices, 'ID'],  # Assuming 'ID' is the identifier column in your original DataFrame
    'Category': predictions
})

# Save the DataFrame to a CSV file
results_df.to_csv('/content/drive/MyDrive/dataset/categorized_resumes.csv', index=False)

print("Sample output CSV file created and saved!")




AttributeError: 'csr_matrix' object has no attribute 'index'

In [11]:
from sklearn.model_selection import train_test_split

# Assuming resume_df['cleaned_text'] is your preprocessed text column
# Splitting the data into training and testing sets while preserving the DataFrame structure
train_df, test_df = train_test_split(resume_df, test_size=0.3, random_state=42)

# Extract features
vectorizer = TfidfVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(train_df['cleaned_text'])
X_test = vectorizer.transform(test_df['cleaned_text'])
y_train = train_df['Category']
y_test = test_df['Category']


In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Training a RandomForest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Making predictions
predictions = model.predict(X_test)

# Creating a DataFrame to match IDs with their predicted categories
results_df = pd.DataFrame({
    'ID': test_df['ID'],  # 'ID' column from test_df
    'Category': predictions
})

# Save to CSV
results_df.to_csv('/content/drive/MyDrive/dataset/categorized_resumes.csv', index=False)
print("Sample output CSV file created and saved!")


Sample output CSV file created and saved!


In [13]:
from google.colab import files

# Path to the file you want to download
file_path = '/content/drive/MyDrive/dataset/categorized_resumes.csv'

# Trigger the download
files.download(file_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>