In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
import nltk
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, classification_report

warnings.filterwarnings("ignore")
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords

# --- Data loading ------------------------------------------------------------
# Location of the resume CSV; edit this when the file lives somewhere else.
file_path = "/content/resume_dataset (1).csv"
df = pd.read_csv(file_path, encoding="utf-8")

# Quick look at the raw frame and at the set of category labels it contains.
print(df.head())
print("\nUnique Resume Categories:\n", df["Category"].unique())

# --- Class balance -----------------------------------------------------------
# Horizontal bars, one per category, ordered from most to least frequent.
category_order = df["Category"].value_counts().index
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(y="Category", data=df, order=category_order, ax=ax)
ax.set_title("Resume Category Distribution")
ax.set_xlabel("Count")
ax.set_ylabel("Category")
plt.show()

# Text Cleaning Function
def clean_text(text):
    """Normalize one resume into lowercase, letters-only, single-spaced text.

    Steps, in order: replace URL-like tokens (anything starting with ``http``
    or ``www``) with a space, replace every character that is not an ASCII
    letter with a space, collapse whitespace runs into one space, lowercase.

    Args:
        text: Raw resume text. ``None`` and float NaN (what ``pd.read_csv``
            produces for an empty field) are treated as an empty resume; any
            other non-string value is converted with ``str()`` first.

    Returns:
        str: The cleaned text. A single leading or trailing space may remain,
        because whitespace is collapsed but not stripped.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    text = re.sub(r"http\S+|www\S+", " ", text)  # Remove URLs
    text = re.sub(r"[^a-zA-Z]", " ", text)  # Remove non-alphabetic characters
    text = re.sub(r'\s+', ' ', text)  # Collapse runs of whitespace
    return text.lower()

# Normalise every resume once; the cleaned column feeds both the word cloud
# below and the TF-IDF features built later in this cell.
df["cleaned_resume"] = df["Resume"].map(clean_text)

# Word cloud over the whole corpus: all cleaned resumes joined by spaces.
text_data = " ".join(df["cleaned_resume"])
cloud_builder = WordCloud(width=800, height=400, background_color="white")
wordcloud = cloud_builder.generate(text_data)

# Render the cloud as an image without axis ticks or frame.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

# RandomForestClassifier is not among the imports at the top of this cell, so
# it is imported here; without it a fresh kernel fails with NameError.
from sklearn.ensemble import RandomForestClassifier

# Label Encoding: category names -> integer ids (le.classes_ keeps the names)
le = LabelEncoder()
df["Category"] = le.fit_transform(df["Category"])
y = df["Category"]

# Train-Test Split on the cleaned text itself, BEFORE any vectorization, so the
# test resumes cannot influence the vocabulary or IDF weights.
text_train, text_test, y_train, y_test = train_test_split(
    df["cleaned_resume"], y, test_size=0.2, random_state=42
)

# TF-IDF Vectorization: learn vocabulary/IDF on the training resumes only, then
# project the test resumes into that same feature space.
vectorizer = TfidfVectorizer(stop_words="english", max_features=1500)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Train Random Forest Model
rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)
rf_classifier.fit(X_train, y_train)

# Make Predictions
y_pred_rf = rf_classifier.predict(X_test)

# Model Evaluation
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
# The split is not stratified, so a rare category may be missing from the test
# set; passing `labels` keeps the report rows aligned with `target_names`
# instead of raising on a length mismatch.
print(
    "\nClassification Report:\n",
    classification_report(
        y_test,
        y_pred_rf,
        labels=np.arange(len(le.classes_)),
        target_names=le.classes_,
    ),
)




In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook asks for into one fixed directory
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource, download_dir='/usr/local/nltk_data')

# Add that directory to NLTK's data search path
nltk.data.path.append('/usr/local/nltk_data')


In [None]:
# Load dataset (update path if needed)
file_path = "/content/resume_dataset (1).csv"  # Update this if necessary
df = pd.read_csv(file_path)

# Check dataset structure
print("Columns in dataset:", df.columns)
print("Missing values in Resume column:", df['Resume'].isnull().sum())

# Fill missing resumes with "" and force every value to str.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on `df['Resume']` operates on an intermediate object, which pandas 2.x warns
# about and which leaves `df` unchanged once Copy-on-Write is active.
df['Resume'] = df['Resume'].fillna("").astype(str)

# Display sample data
print(df.head())


In [None]:
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loading it once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text, stop_words=None):
    """Lowercase a resume, strip non-letters, and drop stop words.

    Args:
        text: Resume text as a ``str``.
        stop_words: Optional collection of words to remove. When omitted, the
            NLTK English stop-word list is used; it is loaded once and reused,
            instead of being re-read and scanned as a list for every token.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    text = text.lower()  # Convert to lowercase
    # Characters other than ASCII letters/whitespace are deleted, not replaced
    # by a space, so "java-8" becomes "java" and "don't" becomes "dont".
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Whitespace tokenization (no NLTK tokenizer involved)
    return ' '.join(word for word in text.split() if word not in stop_words)


In [None]:
# Run the preprocessing over every resume and keep the result in a new column
df['cleaned_text'] = df['Resume'].map(preprocess_text)

# Show raw and processed text side by side for the first rows
preview_columns = ['Resume', 'cleaned_text']
print(df[preview_columns].head())


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the resumes for testing; the fixed seed keeps the partition
# the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['Category'],
    test_size=0.2,
    random_state=42,
)

# Report how many resumes landed in each partition
print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer capped at 5000 features. It is fitted on the training text only;
# the test text is then transformed with that same fitted vectorizer.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Sanity check: print the shape of each resulting matrix
train_shape, test_shape = X_train_tfidf.shape, X_test_tfidf.shape
print("TF-IDF Train Shape:", train_shape)
print("TF-IDF Test Shape:", test_shape)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Build a seeded 100-tree forest and fit it on the TF-IDF training matrix
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)

# Predict categories for the held-out resumes
y_pred_rf = rf_model.predict(X_test_tfidf)

# Score the predictions against the true test labels
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
print("Random Forest Model Accuracy:", rf_accuracy)
print("\nClassification Report:\n", rf_report)


In [None]:
from sklearn.utils.class_weight import compute_class_weight

# NOTE(review): this cell uses names that no earlier cell in the visible
# notebook defines: Sequential, Embedding, SpatialDropout1D, LSTM, Dense,
# X_train_pad, X_test_pad, y_train_encoded, y_test_encoded and label_encoder
# (the only visible `label_encoder` is created in a later cell). Presumably a
# Keras import/tokenization cell was removed; it has to be restored for the
# notebook to survive Restart & Run All.

# Compute class weights ('balanced' mode) from the encoded training labels
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_encoded), y=y_train_encoded)
# Keyed by position 0..n-1; this matches the class ids only if
# y_train_encoded holds contiguous integers starting at 0 - TODO confirm.
class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Define improved LSTM model: embedding -> spatial dropout -> two stacked LSTM
# layers -> softmax layer with one unit per entry in label_encoder.classes_
lstm_model = Sequential([
    # presumably 10000 = vocabulary size and 200 = padded sequence length; both
    # must agree with however X_train_pad was built - TODO confirm
    Embedding(10000, 256, input_length=200),  # Increased embedding size
    SpatialDropout1D(0.3),
    # return_sequences=True so the second LSTM receives the full sequence
    LSTM(150, dropout=0.3, recurrent_dropout=0.3, return_sequences=True),
    LSTM(100, dropout=0.3, recurrent_dropout=0.3),
    Dense(len(label_encoder.classes_), activation='softmax')
])

# Sparse loss: targets are passed as integer class ids, not one-hot vectors
lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model with class weights
# NOTE(review): the test split doubles as validation data here and is also the
# split the final report below is computed on.
lstm_model.fit(X_train_pad, y_train_encoded, epochs=20, batch_size=32, validation_data=(X_test_pad, y_test_encoded), class_weight=class_weights_dict)

# Evaluate model: take the highest-scoring class per row, then map the ids back
# to category names through label_encoder
y_pred_lstm = lstm_model.predict(X_test_pad)
y_pred_lstm_classes = y_pred_lstm.argmax(axis=1)
y_pred_lstm_labels = label_encoder.inverse_transform(y_pred_lstm_classes)

# Accuracy & Report
# NOTE(review): y_test comes from the earlier train/test split cell; this
# assumes its rows are in the same order as X_test_pad - verify.
from sklearn.metrics import accuracy_score, classification_report
print("Improved LSTM Accuracy:", accuracy_score(y_test, y_pred_lstm_labels))
print("\nClassification Report:\n", classification_report(y_test, y_pred_lstm_labels))


In [None]:
# Install necessary libraries (Run this in Colab)
!pip install transformers datasets

# Import Libraries
import torch
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import inspect

# Load dataset
df = pd.read_csv('/content/drive/MyDrive/resume_dataset.csv')

# Encode Labels: category names -> integer ids, stored in a separate column so
# the original names remain available
label_encoder = LabelEncoder()
df['Category_Label'] = label_encoder.fit_transform(df['Category'])

# Split Data: stratified on the label, and seeded so the partition (and the
# scores reported below) can be reproduced
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Resume'],
    df['Category_Label'],
    test_size=0.2,
    stratify=df['Category_Label'],
    random_state=42,
)

# Load BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize Texts
def tokenize_function(texts):
    """Tokenize a list of strings, padding/truncating each to 512 tokens."""
    return tokenizer(texts, padding='max_length', truncation=True, max_length=512)

train_encodings = tokenize_function(train_texts.tolist())
test_encodings = tokenize_function(test_texts.tolist())

# Convert to Dataset Format
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels.tolist(),
})
test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'labels': test_labels.tolist(),
})

# Load Pretrained BERT Model with one output per category
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Training Arguments
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`;
# the install above is unpinned, so use whichever name this version accepts.
_eval_strategy_arg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments).parameters
    else 'evaluation_strategy'
)
training_args = TrainingArguments(
    output_dir='./results',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    **{_eval_strategy_arg: 'epoch'},
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train Model
trainer.train()

# Evaluate Model (printed; a bare call in the middle of a cell shows nothing)
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

# Predictions: highest-scoring class id per test resume
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

# Convert Predictions to Labels
y_pred_labels = label_encoder.inverse_transform(preds)
y_true_labels = label_encoder.inverse_transform(test_labels)

# Classification Report
from sklearn.metrics import classification_report
print("BERT Model Classification Report:\n", classification_report(y_true_labels, y_pred_labels))


In [None]:
# Install necessary libraries (Run this in Colab)
!pip install transformers datasets

# Import Libraries
import torch
import numpy as np
import pandas as pd

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# NOTE(review): this cell repeats the BERT fine-tuning cell directly above it;
# running both trains the same configuration twice. Consider keeping only one.
import inspect

# Load dataset
df = pd.read_csv('/content/drive/MyDrive/resume_dataset.csv')

# Encode Labels: category names -> integer ids, stored in a separate column so
# the original names remain available
label_encoder = LabelEncoder()
df['Category_Label'] = label_encoder.fit_transform(df['Category'])

# Split Data: stratified on the label, and seeded so the partition (and the
# scores reported below) can be reproduced
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Resume'],
    df['Category_Label'],
    test_size=0.2,
    stratify=df['Category_Label'],
    random_state=42,
)

# Load BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize Texts
def tokenize_function(texts):
    """Tokenize a list of strings, padding/truncating each to 512 tokens."""
    return tokenizer(texts, padding='max_length', truncation=True, max_length=512)

train_encodings = tokenize_function(train_texts.tolist())
test_encodings = tokenize_function(test_texts.tolist())

# Convert to Dataset Format
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels.tolist(),
})
test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'labels': test_labels.tolist(),
})

# Load Pretrained BERT Model with one output per category
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Training Arguments
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`;
# the install above is unpinned, so use whichever name this version accepts.
_eval_strategy_arg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments).parameters
    else 'evaluation_strategy'
)
training_args = TrainingArguments(
    output_dir='./results',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    **{_eval_strategy_arg: 'epoch'},
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train Model
trainer.train()

# Evaluate Model (printed; a bare call in the middle of a cell shows nothing)
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

# Predictions: highest-scoring class id per test resume
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

# Convert Predictions to Labels
y_pred_labels = label_encoder.inverse_transform(preds)
y_true_labels = label_encoder.inverse_transform(test_labels)

# Classification Report
from sklearn.metrics import classification_report
print("BERT Model Classification Report:\n", classification_report(y_true_labels, y_pred_labels))


In [None]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from torch.utils.data import DataLoader

import inspect

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load dataset
df = pd.read_csv('/content/drive/MyDrive/resume_dataset.csv')

# Encode labels: category names -> integer codes (overwrites the column)
df['Category'] = df['Category'].astype('category').cat.codes

# Split dataset (seeded 80/20 split)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Resume'].tolist(), df['Category'].tolist(), test_size=0.2, random_state=42
)

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize data: truncate to 256 tokens and pad each split to a common length
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=256)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=256)

# Convert to Hugging Face Dataset
# The attention mask is passed along with the token ids: the sequences are
# padded, and without the mask the model would attend to the padding tokens.
train_dataset = Dataset.from_dict({
    "input_ids": train_encodings["input_ids"],
    "attention_mask": train_encodings["attention_mask"],
    "labels": train_labels,
})
val_dataset = Dataset.from_dict({
    "input_ids": val_encodings["input_ids"],
    "attention_mask": val_encodings["attention_mask"],
    "labels": val_labels,
})

# Load Pretrained Model with one output per distinct category code
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=df['Category'].nunique())
model.to(device)

# Training Arguments (Faster Execution)
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`;
# use whichever name the installed version accepts.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,         # Reduce epochs for speed
    per_device_train_batch_size=8,  # Lower batch size to reduce memory usage
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=100,
    save_strategy="epoch",
    save_total_limit=1,
    **{_eval_strategy_arg: "epoch"},
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

# Evaluate
results = trainer.evaluate()
print("Evaluation Results:", results)


In [None]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import AdamW

import inspect

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/resume_dataset.csv")

# Encode labels: integer codes go back into the column, the matching category
# names are kept in `category_labels` (position i is the name for code i)
df["Category"], category_labels = pd.factorize(df["Category"])

# Split dataset (seeded 80/20 split)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["Resume"], df["Category"], test_size=0.2, random_state=42
)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize inputs with attention masks
def tokenize_data(texts, labels):
    """Tokenize `texts` and bundle ids, attention mask and labels in one dict.

    Args:
        texts: Iterable of resume strings.
        labels: pandas Series of integer class codes (its `.values` are used).

    Returns:
        dict with "input_ids", "attention_mask" and "labels" tensors.
    """
    encodings = tokenizer(
        list(texts), truncation=True, padding=True, max_length=512, return_tensors="pt"
    )
    return {
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"],
        "labels": torch.tensor(labels.values)
    }

train_dataset = tokenize_data(train_texts, train_labels)
val_dataset = tokenize_data(val_texts, val_labels)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_dict(train_dataset)
val_dataset = Dataset.from_dict(val_dataset)

# Load BERT model
# The code <-> category-name mapping is stored in the model config so that the
# checkpoint saved below can report category names instead of bare indices.
id2label = {code: str(name) for code, name in enumerate(category_labels)}
label2id = {name: code for code, name in id2label.items()}
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(category_labels),
    id2label=id2label,
    label2id=label2id,
)

# Define training arguments
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`;
# use whichever name the installed version accepts.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    **{_eval_strategy_arg: "epoch"},
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

# Evaluate model
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

# Save model and tokenizer side by side
model.save_pretrained("/content/drive/MyDrive/bert_resume_classifier")
tokenizer.save_pretrained("/content/drive/MyDrive/bert_resume_classifier")


In [None]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import AdamW

# NOTE(review): this cell repeats the BERT fine-tune-and-save cell directly
# above it and writes to the same output directory. Consider keeping only one.
import inspect

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/resume_dataset.csv")

# Encode labels: integer codes go back into the column, the matching category
# names are kept in `category_labels` (position i is the name for code i)
df["Category"], category_labels = pd.factorize(df["Category"])

# Split dataset (seeded 80/20 split)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["Resume"], df["Category"], test_size=0.2, random_state=42
)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize inputs with attention masks
def tokenize_data(texts, labels):
    """Tokenize `texts` and bundle ids, attention mask and labels in one dict.

    Args:
        texts: Iterable of resume strings.
        labels: pandas Series of integer class codes (its `.values` are used).

    Returns:
        dict with "input_ids", "attention_mask" and "labels" tensors.
    """
    encodings = tokenizer(
        list(texts), truncation=True, padding=True, max_length=512, return_tensors="pt"
    )
    return {
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"],
        "labels": torch.tensor(labels.values)
    }

train_dataset = tokenize_data(train_texts, train_labels)
val_dataset = tokenize_data(val_texts, val_labels)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_dict(train_dataset)
val_dataset = Dataset.from_dict(val_dataset)

# Load BERT model
# The code <-> category-name mapping is stored in the model config so that the
# checkpoint saved below can report category names instead of bare indices.
id2label = {code: str(name) for code, name in enumerate(category_labels)}
label2id = {name: code for code, name in id2label.items()}
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(category_labels),
    id2label=id2label,
    label2id=label2id,
)

# Define training arguments
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`;
# use whichever name the installed version accepts.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    **{_eval_strategy_arg: "epoch"},
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

# Evaluate model
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

# Save model and tokenizer side by side
model.save_pretrained("/content/drive/MyDrive/bert_resume_classifier")
tokenizer.save_pretrained("/content/drive/MyDrive/bert_resume_classifier")
