In [1]:
import pandas as pd
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ronick\AppData\Roaming\nltk_data...


True

In [3]:
data = pd.read_csv('resume-contents.csv')
data = data.dropna(subset=['Resume', 'Category'])
data.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [4]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    tokens = re.findall(r'\b[a-z]+\b', text.lower())
    cleaned = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]
    return ' '.join(cleaned)
data['cleaned_resume'] = data['Resume'].apply(preprocess_text)
data[['Resume', 'cleaned_resume', 'Category']].head()

Unnamed: 0,Resume,cleaned_resume,Category
0,Skills * Programming Languages: Python (pandas...,skill programming language python panda numpy ...,Data Science
1,Education Details \r\nMay 2013 to May 2017 B.E...,education detail may may b e uit rgpv data sci...,Data Science
2,"Areas of Interest Deep Learning, Control Syste...",area interest deep learning control system des...,Data Science
3,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skill r python sap hana tableau sap hana sql s...,Data Science
4,"Education Details \r\n MCA YMCAUST, Faridab...",education detail mca ymcaust faridabad haryana...,Data Science


In [5]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['cleaned_resume'])
y = data['Category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
reg = LogisticRegression(max_iter = 1000)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9948186528497409

Classification Report:
                            precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00         3
                     Arts       1.00      1.00      1.00         6
       Automation Testing       1.00      1.00      1.00         5
               Blockchain       1.00      1.00      1.00         7
         Business Analyst       1.00      1.00      1.00         4
           Civil Engineer       1.00      1.00      1.00         9
             Data Science       1.00      1.00      1.00         5
                 Database       1.00      1.00      1.00         8
          DevOps Engineer       1.00      0.93      0.96        14
         DotNet Developer       1.00      1.00      1.00         5
            ETL Developer       1.00      1.00      1.00         7
   Electrical Engineering       1.00      1.00      1.00         6
                       HR       1.00      1.00      1.00        12
       