# Resume Classification

In [None]:
import pandas as pd 
import numpy as np 
import re
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction.text import TfidfVectorizer

### 1. Load dataset

In [3]:
df = pd.read_csv("../datasets/UpdatedResumeDataSet.csv")
df.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


The project uses a CSV dataset of resumes with the following columns:

- `Resume` → The full text of the resume  
- `Category` → The job category label  

> Example: `Java Developer`, `Data Scientist`, `Web Developer`, etc.

### 2. Clean text

In [4]:
def cleanText(text: str) -> str:
    """
    Clean a string by:
    - Removing line breaks
    - Collapsing multiple spaces
    - Removing brackets: [], (), {}
    - Removing weird Unicode artifacts / non-ASCII characters
    - Stripping leading/trailing spaces
    """
    # 1. Remove line breaks
    text = re.sub(r'[\r\n]+', ' ', text)
    
    # 2. Collapse multiple spaces/tabs
    text = re.sub(r'\s+', ' ', text)
    
    # 3. Remove brackets
    text = re.sub(r'[\[\]\(\)\{\}]', '', text)
    
    # 4. Remove non-ASCII / Unicode characters
    text = re.sub(r'[^\x20-\x7E]', '', text)
    
    # 5. Strip leading/trailing spaces
    text = text.strip()
    
    return text


df['Clean_Text'] = df['Resume'].apply(cleanText)
df.head()

Unnamed: 0,Category,Resume,Clean_Text
0,Data Science,Skills * Programming Languages: Python (pandas...,"Skills * Programming Languages: Python pandas,..."
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,Education Details May 2013 to May 2017 B.E UIT...
2,Data Science,"Areas of Interest Deep Learning, Control Syste...","Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,Skills R Python SAP HANA Tableau SAP HANA...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...","Education Details MCA YMCAUST, Faridabad, Hary..."


### 3. Train/Test Split

In [5]:
X = df['Clean_Text'].values
y = df['Category'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0, test_size=0.2)

### 4. Train Model

In [6]:
pipe_model = make_pipeline(TfidfVectorizer(), LogisticRegression())
pipe_model.fit(X_train, y_train)

y_pred = pipe_model.predict(X_test)

from sklearn.metrics import accuracy_score

accuracy_score = accuracy_score(y_pred,y_test)
print('accuracy score : ',accuracy_score)

accuracy score :  0.9948186528497409


### 5. Evaluate Model

In [9]:
from sklearn.metrics import classification_report

print(classification_report(y_pred,y_test))

                           precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00         3
                     Arts       1.00      1.00      1.00         3
       Automation Testing       1.00      1.00      1.00         5
               Blockchain       1.00      1.00      1.00         9
         Business Analyst       1.00      1.00      1.00         6
           Civil Engineer       1.00      1.00      1.00         5
             Data Science       1.00      1.00      1.00         9
                 Database       1.00      1.00      1.00         7
          DevOps Engineer       0.91      1.00      0.95        10
         DotNet Developer       1.00      1.00      1.00         9
            ETL Developer       1.00      1.00      1.00         8
   Electrical Engineering       1.00      1.00      1.00         9
                       HR       1.00      1.00      1.00         5
                   Hadoop       1.00      1.00      1.00     

In [8]:
print(pipe_model.predict([cleanText("""
Ryan Patel
456 Code Avenue, Seattle, WA 98101 | (555) 987-6543 | ryan.patel@email.com | www.linkedin.com/in/ryanpatel | GitHub: github.com/ryanpatel
Professional Summary
Results-driven Java Developer with 5 years of experience designing, developing, and maintaining scalable Java applications. Proficient in object-oriented programming, Spring framework, and database management. Adept at delivering robust, efficient, and high-performing solutions in fast-paced environments.
Technical Skills
Programming Languages: Java, SQL, Python
Frameworks: Spring Boot, Hibernate, JavaFX, Maven
Web Technologies: JSP, Servlets, REST APIs, HTML/CSS
Databases: MySQL, PostgreSQL, MongoDB
Tools & Platforms: Git, Jenkins, Docker, Eclipse, IntelliJ IDEA
Other Skills: Agile/Scrum, Unit Testing (JUnit), Problem Solving, Code Optimization
Professional Experience
Java Developer
TechWave Solutions, Seattle, WA | June 2020 – Present
Designed and implemented RESTful APIs using Spring Boot, improving system performance by 25%.
Developed and maintained backend components for a customer management platform serving 50,000+ users.
Collaborated with front-end developers to integrate APIs, ensuring seamless user experience.
Conducted code reviews and implemented unit tests using JUnit, increasing code reliability by 30%.
Optimized SQL queries, reducing database response times by 20%.
Junior Java Developer
CodeCrafters Inc., Seattle, WA | January 2018 – May 2020
Assisted in developing internal enterprise applications using Java and Hibernate.
Implemented data validation and error handling to improve system stability.
Participated in Agile sprints, contributing to on-time delivery of multiple projects.
Documented software processes and created user manuals for deployed applications.
Education
Bachelor of Science in Computer Science
University of Washington, Seattle, WA | 2017
Certifications
Oracle Certified Professional, Java SE 11 Developer – Oracle, 2021
Spring Professional Certification – Pivotal/VMware, 2022
Projects / Portfolio
Inventory Management System: JavaFX desktop application with MySQL backend, streamlining warehouse operations.
E-commerce REST API: Built using Spring Boot and Hibernate, integrated with front-end web platform.
Automated Task Scheduler: Java application that reduces repetitive tasks for internal teams by 40%.
""")]))

['Java Developer']
