In [3]:
!pip install spacy PyPDF2 scikit-learn
!python -m spacy download en_core_web_sm



Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 232.6/232.6 kB 5.1 MB/s eta 0:00:00
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 65.2 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
import PyPDF2
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load the spaCy pipeline once at module level; preprocess() calls this
# shared `nlp` object for every text it handles.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

# Step 1: Function to extract text from PDF
def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path`` as one string.

    Pages are joined with a newline so that the last word of one page does not
    fuse with the first word of the next (which would corrupt tokenization
    downstream). A page for which ``extract_text()`` yields nothing (``None``
    or ``""``) contributes an empty string instead of raising ``TypeError``.

    Args:
        file_path: Path to the PDF file; it is opened in binary mode.

    Returns:
        str: The page texts joined by ``"\\n"``; ``""`` when there are no pages.
    """
    with open(file_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        # Extract while the file is still open; `or ""` covers pages for which
        # extract_text() returns None.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(page_texts)

# Step 2: Preprocess text
def preprocess(text):
    """Normalize ``text`` into a space-separated string of lemmas.

    The text is lower-cased and run through the module-level ``nlp`` pipeline.
    Tokens that are not purely alphabetic, or that are stop words, are dropped;
    the lemma of each remaining token is kept, in document order.

    Args:
        text: Raw input string.

    Returns:
        str: The kept lemmas joined by single spaces (``""`` if none remain).
    """
    lemmas = []
    for tok in nlp(text.lower()):
        # Skip numbers, punctuation, mixed tokens and stop words.
        if not tok.is_alpha or tok.is_stop:
            continue
        lemmas.append(tok.lemma_)
    return " ".join(lemmas)

# Step 3: Dummy training data
# Each sample is stored next to its category so the two lists derived below
# always stay aligned.
TRAINING_EXAMPLES = [
    ("develop python machine learning algorithm analyze data predict model", "Data Scientist"),
    ("design web html css javascript react application front end backend", "Web Developer"),
    ("test mobile app android ios debug interface", "App Developer"),
    ("analyze financial report investment forecast risk", "Financial Analyst"),
    ("write academic research paper journal publication", "Researcher"),
]
texts = [sample for sample, _ in TRAINING_EXAMPLES]
labels = [category for _, category in TRAINING_EXAMPLES]

# Step 4: Preprocess dummy data
# Run every training text through the same preprocess() used for the resume.
processed_texts = list(map(preprocess, texts))

# Step 5: TF-IDF Vectorization
# Fit the vectorizer on the processed training texts and keep the resulting
# feature matrix; the fitted vectorizer is reused later to transform the resume.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(processed_texts)
y = labels

# Step 6: Train classifier
# fit() returns the estimator itself, so construction and training are chained.
clf = LogisticRegression().fit(X, y)

# Step 7: Load and preprocess uploaded resume
RESUME_PATH = "/content/S.ROSELIN MARY JOVITA RESUME.pdf"  # Change filename
resume_text = extract_text_from_pdf(RESUME_PATH)
processed_resume = preprocess(resume_text)
resume_vector = vectorizer.transform([processed_resume])

# Sanity check: if the PDF produced no text (e.g. a scanned image) or none of
# its tokens occur in the training vocabulary, the TF-IDF vector is all zeros
# and the classifier's output carries no information about the resume.
if not processed_resume:
    print("Warning: no usable text was extracted from the resume; the prediction below is not meaningful.")
elif resume_vector.nnz == 0:
    print("Warning: the resume shares no words with the training vocabulary; the prediction below is not meaningful.")

# Step 8: Predict category
predicted_category = clf.predict(resume_vector)[0]
print(f"Predicted Job Category: {predicted_category}")


Predicted Job Category: Data Scientist
