In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [1]:
!pip install matplotlib

Collecting matplotlib
  Using cached matplotlib-3.10.6-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.3-cp313-cp313-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.60.1-cp313-cp313-win_amd64.whl.metadata (114 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Using cached kiwisolver-1.4.9-cp313-cp313-win_amd64.whl.metadata (6.4 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Downloading pyparsing-3.2.5-py3-none-any.whl.metadata (5.0 kB)
Using cached matplotlib-3.10.6-cp313-cp313-win_amd64.whl (8.1 MB)
Using cached contourpy-1.3.3-cp313-cp313-win_amd64.whl (226 kB)
Using cached cycler-0.12.1-py3-none-any.whl (8.3 kB)
Downloading fonttools-4.60.1-cp313-cp313-win_amd64.whl (2.3 MB)
   ---------------------------------------- 0.0/2.3 MB ? eta -:--:--
   

In [4]:
# Load the resume dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('ResumeDataset.csv')

In [5]:
# Preview the first few rows to check the Category / Resume columns.
df.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \nMay 2013 to May 2017 B.E ...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \n MCA YMCAUST, Faridabad..."


In [6]:
# (rows, columns) of the loaded frame.
df.shape

(962, 2)

In [7]:
# Display the raw frame (pandas truncates the middle rows in the rendered output).
df

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \nMay 2013 to May 2017 B.E ...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \n MCA YMCAUST, Faridabad..."
...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...
958,Testing,â Willingness to accept the challenges. â ...
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne..."
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...


In [8]:
# Number of resumes per category, to see how balanced the classes are.
df['Category'].value_counts()

Category
Java Developer               84
Testing                      70
DevOps Engineer              55
Python Developer             48
Web Designing                45
HR                           44
Hadoop                       42
Sales                        40
Data Science                 40
Mechanical Engineer          40
ETL Developer                40
Blockchain                   40
Operations Manager           40
Arts                         36
Database                     33
Health and fitness           30
PMO                          30
Electrical Engineering       30
Business Analyst             28
DotNet Developer             28
Automation Testing           26
Network Security Engineer    25
Civil Engineer               24
SAP Developer                24
Advocate                     20
Name: count, dtype: int64

In [9]:
# The label column on its own.
df['Category']


0      Data Science
1      Data Science
2      Data Science
3      Data Science
4      Data Science
           ...     
957         Testing
958         Testing
959         Testing
960         Testing
961         Testing
Name: Category, Length: 962, dtype: object

In [10]:
import re
import string

# Noise patterns applied, in this order, by cleanResume. Every match is
# replaced with a single space. Compiled once instead of on every call.
_RESUME_NOISE_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'http\S+|www\S+',                         # URLs
    r'\S+@\S+',                                # email addresses
    r'\b\d{10}\b',                             # bare 10-digit phone numbers
    # International phone formats. NOTE: this also matches any other run of
    # 10+ digits/spaces/dashes that starts and ends with a digit, e.g. a year
    # range written as "2013 - 2017".
    r'\+?\d[\d -]{8,}\d',
    r'<.*?>',                                  # HTML tags
    r'\brt\b|\bcc\b',                          # stand-alone "rt" / "cc" tokens
    r'#\S+',                                   # hashtags
    r'@\S+',                                   # mentions
    r'[%s]' % re.escape(string.punctuation),   # any remaining ASCII punctuation
    r'[^\x00-\x7f]',                           # non-ASCII characters (emojis, mojibake bullets)
))


def cleanResume(txt):
    """Normalise raw resume text into lowercase, space-separated ASCII tokens.

    The text is lowercased, then URLs, emails, phone numbers, HTML tags,
    "rt"/"cc" tokens, hashtags, mentions, punctuation and non-ASCII
    characters are each replaced by a space (in that order), and finally
    all whitespace runs are collapsed to a single space.

    Parameters
    ----------
    txt : str or None or float NaN
        Raw resume text. Missing values (``None`` or the float NaN pandas
        uses for empty CSV cells) are treated as empty text.

    Returns
    -------
    str
        The cleaned text; ``''`` for empty or missing input.
    """
    # Missing cells have no .lower(); return empty text instead of raising.
    if txt is None or (isinstance(txt, float) and txt != txt):
        return ''

    cleanText = txt.lower()
    for noise in _RESUME_NOISE_PATTERNS:
        cleanText = noise.sub(' ', cleanText)

    # Replace multiple spaces/newlines with a single space
    return re.sub(r'\s+', ' ', cleanText).strip()
    
# Overwrite the raw resume text with its cleaned form (reload the CSV to get the raw text back).
df['Resume'] = df['Resume'].apply(cleanResume)

In [12]:
# Inspect the frame again now that the Resume column has been cleaned.
df

Unnamed: 0,Category,Resume
0,Data Science,skills programming languages python pandas num...
1,Data Science,education details may 2013 to may 2017 b e uit...
2,Data Science,areas of interest deep learning control system...
3,Data Science,skills r python sap hana tableau sap hana sql ...
4,Data Science,education details mca ymcaust faridabad haryan...
...,...,...
957,Testing,computer skills proficient in ms office word b...
958,Testing,willingness to accept the challenges positive ...
959,Testing,personal skills quick learner eagerness to lea...
960,Testing,computer skills software knowledge ms power po...


In [13]:
# import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [25]:
# Features are the cleaned resume texts; targets are the category labels.
X, y = df['Resume'], df['Category']

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# category proportions the same in both splits; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# print(y_train)

685                 PMO
13         Data Science
259               Sales
417    Business Analyst
354      Java Developer
             ...       
844    DotNet Developer
158       Web Designing
318      Civil Engineer
237               Sales
560    Python Developer
Name: Category, Length: 769, dtype: object


In [26]:
# TF-IDF features over at most 5000 terms, ignoring English stop words.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)

# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the held-out texts.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [27]:
# Logistic regression on the TF-IDF features, with a raised iteration cap
# (max_iter=1000). fit() returns the estimator itself, so the calls can be chained.
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
model

In [34]:
# Predict a category for every held-out resume and display the predictions.
y_pred = model.predict(X_test_tfidf)
y_pred

array(['ETL Developer', 'Health and fitness', 'Advocate',
       'Automation Testing', 'Operations Manager', 'PMO',
       'Java Developer', 'Civil Engineer', 'Operations Manager',
       'Operations Manager', 'HR', 'Data Science', 'Java Developer',
       'DevOps Engineer', 'Blockchain', 'ETL Developer',
       'Mechanical Engineer', 'Testing', 'Data Science', 'Java Developer',
       'Python Developer', 'PMO', 'HR', 'SAP Developer',
       'DevOps Engineer', 'Data Science', 'Database', 'Database',
       'Network Security Engineer', 'HR', 'Arts', 'SAP Developer',
       'Sales', 'Civil Engineer', 'Sales', 'Network Security Engineer',
       'ETL Developer', 'HR', 'Java Developer', 'Java Developer', 'Sales',
       'Civil Engineer', 'Arts', 'Business Analyst', 'Testing',
       'Web Designing', 'DevOps Engineer', 'Data Science',
       'Java Developer', 'Java Developer', 'Electrical Engineering',
       'Testing', 'PMO', 'Health and fitness', 'DevOps Engineer',
       'Mechanical Engi

In [35]:
# accuracy_score is documented as accuracy_score(y_true, y_pred); pass the
# ground-truth labels first so the call matches the API.
accuracy = accuracy_score(y_test, y_pred)

# NOTE(review): accuracy above 0.99 on a 25-class problem is unusually high.
# Possibly the dataset has duplicate resumes that land in both train and
# test - TODO confirm (e.g. with df.duplicated().sum()) before trusting this.
if accuracy > 0.99:
    print("✅ Model trained successfully!")
else:
    print("Improve Model training")

✅ Model trained successfully!


In [37]:
import joblib
# Persist the classifier and the fitted vectorizer. The vectorizer was fitted
# on text passed through cleanResume, so the same cleaning must be applied to
# new text before using these artifacts.
artifacts = (
    (model, "resume_classifier_model.pkl"),
    (vectorizer, "tfidf_vectorizer.pkl"),
)
for fitted_object, filename in artifacts:
    joblib.dump(fitted_object, filename)

print("✅ Model and Vectorizer saved successfully!")

✅ Model and Vectorizer saved successfully!
