In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("E:/College Project/UpdatedResumeDataSet.csv")

In [3]:
df.sample(5)

Unnamed: 0,Category,Resume
476,Automation Testing,"Technical Skills Summary I have completed ""COR..."
793,ETL Developer,Computer skills: - Yes. SQL knowledge-yes Unix...
480,Automation Testing,"Technical Skills Summary I have completed ""COR..."
153,Web Designing,Education Details \r\nJanuary 2016 B.Sc. Infor...
544,Operations Manager,KEY COMPETENCIES â¶Multi - Operations Managem...


In [4]:
import re

def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply cleaning to the "Resume" column
df['Cleaned_Resume'] = df['Resume'].apply(clean_text)

In [5]:
df.head()

Unnamed: 0,Category,Resume,Cleaned_Resume
0,Data Science,Skills * Programming Languages: Python (pandas...,skills programming languages python pandas num...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education details may to may be uitrgpv data s...
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",areas of interest deep learning control system...
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skills r python sap hana tableau sap hana sql ...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education details mca ymcaust faridabad haryan...


In [6]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

# Function to remove stop words
def remove_stop_words(text):
    tokens = text.split()
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)

# Apply stop word removal
df['Resume_NoStopWords'] = df['Cleaned_Resume'].apply(remove_stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mbnss\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# Function to lemmatize text
def lemmatize_text(text):
    tokens = text.split()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_tokens)

# Apply lemmatization
df['Resume_Lemmatized'] = df['Resume_NoStopWords'].apply(lemmatize_text)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mbnss\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
df.head()

Unnamed: 0,Category,Resume,Cleaned_Resume,Resume_NoStopWords,Resume_Lemmatized
0,Data Science,Skills * Programming Languages: Python (pandas...,skills programming languages python pandas num...,skills programming languages python pandas num...,skill programming language python panda numpy ...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education details may to may be uitrgpv data s...,education details may may uitrgpv data scienti...,education detail may may uitrgpv data scientis...
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",areas of interest deep learning control system...,areas interest deep learning control system de...,area interest deep learning control system des...
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skills r python sap hana tableau sap hana sql ...,skills r python sap hana tableau sap hana sql ...,skill r python sap hana tableau sap hana sql s...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education details mca ymcaust faridabad haryan...,education details mca ymcaust faridabad haryan...,education detail mca ymcaust faridabad haryana...


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=300)  # Limit features for efficiency

# Fit and transform the lemmatized text
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Resume_Lemmatized'])

# Convert the TF-IDF matrix to a DataFrame for analysis
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())


In [11]:
# Add category labels to the TF-IDF DataFrame
tfidf_df['Category'] = df['Category']

In [12]:
from sklearn.model_selection import train_test_split

X = tfidf_df.drop('Category', axis=1)  # Feature matrix
y = tfidf_df['Category']  # Labels

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [13]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train,y_train)

In [14]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 92.75%
Classification Report:
                           precision    recall  f1-score   support

                 Advocate       0.00      0.00      0.00         3
                     Arts       0.86      1.00      0.92         6
       Automation Testing       1.00      0.40      0.57         5
               Blockchain       1.00      1.00      1.00         7
         Business Analyst       1.00      1.00      1.00         4
           Civil Engineer       1.00      0.56      0.71         9
             Data Science       1.00      1.00      1.00         5
                 Database       1.00      1.00      1.00         8
          DevOps Engineer       1.00      0.93      0.96        14
         DotNet Developer       1.00      0.60      0.75         5
            ETL Developer       1.00      1.00      1.00         7
   Electrical Engineering       1.00      1.00      1.00         6
                       HR       0.79      0.92      0.85        12
                   Ha

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
import joblib

# Save the trained model
joblib.dump(model, 'resume_parser_model.joblib')


['resume_parser_model.joblib']

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Assuming 'data' is your preprocessed DataFrame and 'Resume_Lemmatized' contains the cleaned text data
tfidf_vectorizer = TfidfVectorizer(max_features=300)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Resume_Lemmatized'])

# Save the vectorizer
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')


['tfidf_vectorizer.joblib']