<a href="https://colab.research.google.com/github/Sreeyanaidu/resume-screening/blob/main/Resume_screening_v1_0_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.preprocessing import LabelEncoder

# Read the resume CSV (uploaded to the Colab runtime under /content) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/UpdatedResumeDataSet.csv')

# Clean text function
def clean_text(text):
    """Return *text* lower-cased with every run of non-word characters collapsed to one space.

    Word characters are those matched by the regex class ``\\w`` (letters,
    digits and underscore). Leading and trailing separators are dropped.
    """
    # `\W` also matches whitespace, so one pass both removes punctuation and
    # squeezes repeated blanks.
    collapsed = re.sub(r'\W+', ' ', text)
    return collapsed.strip().lower()

# Normalise every resume once; predict_acceptance applies the same cleaning at inference time.
df['cleaned_resume'] = df['Resume'].apply(clean_text)

# Convert job categories into binary outcome (Accepted = 1, Rejected = 0)
accepted_categories = ["Data Science", "Software Engineer", "Machine Learning Engineer", "Cyber Security", "AI Engineer"]
df['accepted'] = df['Category'].isin(accepted_categories).astype(int)

# Hyper-parameters shared by the tokenizer, the padding step and the embedding
# layer. They are defined once so the three cannot drift out of sync.
VOCAB_SIZE = 10000
MAX_LEN = 500

# Train-Test Split.
# Stratify on the label so the (minority) accepted class keeps the same
# proportion in both partitions instead of depending on the random draw.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_resume'],
    df['accepted'],
    test_size=0.2,
    random_state=42,
    stratify=df['accepted'],
)

# Tokenization: the vocabulary is learned from the training texts only.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding: every sequence is padded/truncated to MAX_LEN tokens.
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN)
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)

# Handling Imbalanced Data.
# Key each weight by the class label it was computed for rather than by its
# position in the result array; the two only coincide when the labels present
# in y_train happen to be exactly 0..n-1.
class_labels = np.unique(y_train)
balanced_weights = class_weight.compute_class_weight(class_weight='balanced', classes=class_labels, y=y_train)
class_weights = {int(label): float(weight) for label, weight in zip(class_labels, balanced_weights)}

# Build LSTM Model.
# `input_length` is not passed to Embedding: the argument is deprecated in
# Keras 3 and the sequence length is inferred from the padded input.
model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=128),
    Dropout(0.3),  # Dropout on the embeddings to limit overfitting
    LSTM(100, return_sequences=True),  # First LSTM layer feeds the full sequence to the next one
    LSTM(50),  # Second LSTM layer returns only its final state
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Binary classification: probability of "accepted"
])

# Compile Model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train Model.
# NOTE: the held-out test split doubles as validation data here, so the
# per-epoch val_* metrics and the final test accuracy below are computed on
# the same samples.
model.fit(
    X_train_pad,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test_pad, y_test),
    class_weight=class_weights,
)

# Evaluate Model
loss, acc = model.evaluate(X_test_pad, y_test)
print("Test Accuracy:", acc)

# Function to Predict Resume Acceptance
def predict_acceptance(resume_text):
    """Classify a raw resume string as ``"Accepted"`` or ``"Rejected"``.

    The text goes through the same steps as the training data: ``clean_text``,
    the fitted module-level ``tokenizer``, and padding to 500 tokens. The
    module-level ``model`` then produces a single sigmoid score.

    Args:
        resume_text: The resume as plain text.

    Returns:
        ``"Accepted"`` when the predicted score is above 0.5, otherwise
        ``"Rejected"``.
    """
    resume_text = clean_text(resume_text)
    seq = tokenizer.texts_to_sequences([resume_text])
    pad = pad_sequences(seq, maxlen=500)
    # predict() returns one row per input and one column per output unit;
    # a single resume with a single sigmoid unit leaves the score at [0][0].
    pred = model.predict(pad)[0][0]
    # Both labels are returned without stray whitespace so callers can compare
    # them directly (the earlier "Accepted " carried a trailing space).
    return "Accepted" if pred > 0.5 else "Rejected"




Epoch 1/10




[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 1s/step - accuracy: 0.8523 - loss: 0.6556 - val_accuracy: 0.9896 - val_loss: 0.3017
Epoch 2/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 1s/step - accuracy: 0.9752 - loss: 0.3182 - val_accuracy: 0.9896 - val_loss: 0.0553
Epoch 3/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 965ms/step - accuracy: 0.9946 - loss: 0.0316 - val_accuracy: 1.0000 - val_loss: 0.0029
Epoch 4/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 1s/step - accuracy: 1.0000 - loss: 0.0049 - val_accuracy: 1.0000 - val_loss: 0.0011
Epoch 5/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 951ms/step - accuracy: 1.0000 - loss: 0.0020 - val_accuracy: 1.0000 - val_loss: 6.6998e-04
Epoch 6/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 1s/step - accuracy: 1.0000 - loss: 0.0011 - val_accuracy: 1.0000 - val_loss: 4.5513e-04
Epoch 7/10
[1m25/25[0m [32m━━━━━━━

In [None]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the resume dataset from the Colab runtime's /content directory.
dataset_path = '/content/UpdatedResumeDataSet.csv'
df = pd.read_csv(dataset_path)

# Clean text function
def clean_text(text):
    """Lower-case *text* and keep only its word tokens, separated by single spaces.

    A word token is a maximal run of ``\\w`` characters (letters, digits,
    underscore); punctuation and any kind of whitespace act as separators.
    """
    tokens = re.findall(r'\w+', text)  # everything between tokens is discarded
    joined = ' '.join(tokens)
    return joined.lower()

# Apply text cleaning; predict_job runs the same function on new resumes.
df['cleaned_resume'] = df['Resume'].apply(clean_text)

# Encode job categories into integer ids (kept in `encoder` so predictions
# can be mapped back to category names).
encoder = LabelEncoder()
df['category_encoded'] = encoder.fit_transform(df['Category'])

# Hyper-parameters shared by the tokenizer, the padding step and the embedding
# layer. They are defined once so the three cannot drift out of sync.
VOCAB_SIZE = 5000
MAX_LEN = 500

# Split data.
# Stratify on the category so each one is represented proportionally in both
# the training and the test partition.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_resume'],
    df['category_encoded'],
    test_size=0.2,
    random_state=42,
    stratify=df['category_encoded'],
)

# Tokenization: the vocabulary is learned from the training texts only.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding: every sequence is padded/truncated to MAX_LEN tokens.
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN)
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)

# Number of categories (size of the softmax output)
num_classes = len(encoder.classes_)

# Build Model.
# `input_length` is not passed to Embedding: the argument is deprecated in
# Keras 3 and the sequence length is inferred from the padded input.
model = Sequential()
model.add(Embedding(input_dim=VOCAB_SIZE, output_dim=128))  # Word embeddings
model.add(Dropout(0.2))  # Prevent overfitting
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))  # LSTM layer
model.add(Dense(num_classes, activation='softmax'))  # One probability per category

# Compile Model.
# The sparse loss is used because the labels are integer ids, not one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train Model.
# NOTE: the held-out test split doubles as validation data here, so the
# per-epoch val_* metrics and the final test accuracy below are computed on
# the same samples.
model.fit(
    X_train_pad,
    y_train,
    epochs=5,
    batch_size=32,
    validation_data=(X_test_pad, y_test),
)

# Evaluate Model
loss, acc = model.evaluate(X_test_pad, y_test)
print("Test Accuracy:", acc)

# Function to Predict Job Category
def predict_job(resume_text):
    """Return the job category predicted for a raw resume string.

    The text is cleaned with ``clean_text``, encoded with the fitted
    module-level ``tokenizer`` and padded to 500 tokens before being scored by
    the module-level ``model``. The highest-scoring class index is mapped back
    to its category name through ``encoder``.
    """
    cleaned = clean_text(resume_text)
    padded = pad_sequences(tokenizer.texts_to_sequences([cleaned]), maxlen=500)
    scores = model.predict(padded)
    # Only one resume is scored, so the arg-max over the flattened output is
    # the index of its best class.
    best_class = np.argmax(scores)
    predicted_names = encoder.inverse_transform([best_class])
    return predicted_names[0]

# Example Predictions: a few short, hand-written resume snippets used as a smoke test.
sample_resumes = [
    "I am a software engineer with skills in Java, Python, and web development.",
    "Digital marketing expert skilled in SEO and social media campaigns.",
    "Cybersecurity analyst with experience in penetration testing and network security.",
    "Experienced data scientist working with Python, deep learning, and AI.",
    "HR professional with a strong background in recruitment and employee relations."
]

divider = "-" * 30  # visual separator printed after each prediction
for resume in sample_resumes:
    # The resume is echoed before predict_job runs so that any output the
    # prediction call emits appears between the two lines, as before.
    print(f"Resume: {resume}")
    print(f"Predicted Category: {predict_job(resume)}")
    print(divider)




Epoch 1/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 1s/step - accuracy: 0.2128 - loss: 3.1977 - val_accuracy: 0.4093 - val_loss: 3.0841
Epoch 2/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 1s/step - accuracy: 0.3565 - loss: 2.9517 - val_accuracy: 0.5544 - val_loss: 2.6204
Epoch 3/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 1s/step - accuracy: 0.5081 - loss: 2.3952 - val_accuracy: 0.6269 - val_loss: 1.8763
Epoch 4/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 1s/step - accuracy: 0.5779 - loss: 1.7430 - val_accuracy: 0.6425 - val_loss: 1.3832
Epoch 5/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 1s/step - accuracy: 0.6946 - loss: 1.3094 - val_accuracy: 0.7876 - val_loss: 1.0423
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 155ms/step - accuracy: 0.7906 - loss: 1.0443
Test Accuracy: 0.787564754486084
Resume: I am a software engineer with skills in Java, Python, and