#eda

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("/content/UpdatedResumeDataSet.csv")

In [5]:
df.head(1)

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...


In [6]:
print(df.iloc[0])

Category                                         Data Science
Resume      Skills * Programming Languages: Python (pandas...
Name: 0, dtype: object


In [7]:
df.describe()

Unnamed: 0,Category,Resume
count,962,962
unique,25,166
top,Java Developer,"Technical Skills Web Technologies: Angular JS,..."
freq,84,18


In [8]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

In [9]:
print(df.iloc[0]['Resume'])

Skills * Programming Languages: Python (pandas, numpy, scipy, scikit-learn, matplotlib), Sql, Java, JavaScript/JQuery. * Machine learning: Regression, SVM, NaÃ¯ve Bayes, KNN, Random Forest, Decision Trees, Boosting techniques, Cluster Analysis, Word Embedding, Sentiment Analysis, Natural Language processing, Dimensionality reduction, Topic Modelling (LDA, NMF), PCA & Neural Nets. * Database Visualizations: Mysql, SqlServer, Cassandra, Hbase, ElasticSearch D3.js, DC.js, Plotly, kibana, matplotlib, ggplot, Tableau. * Others: Regular Expression, HTML, CSS, Angular 6, Logstash, Kafka, Python Flask, Git, Docker, computer vision - Open CV and understanding of Deep learning.Education Details 

Data Science Assurance Associate 

Data Science Assurance Associate - Ernst & Young LLP
Skill Details 
JAVASCRIPT- Exprience - 24 months
jQuery- Exprience - 24 months
Python- Exprience - 24 monthsCompany Details 
company - Ernst & Young LLP
description - Fraud Investigations and Dispute Servic

In [10]:
df['Category'].nunique()

25

In [12]:
df['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
Java Developer,84
Testing,70
DevOps Engineer,55
Python Developer,48
Web Designing,45
HR,44
Hadoop,42
Sales,40
Data Science,40
Mechanical Engineer,40


In [13]:
df.shape

(962, 2)

#model

In [16]:
def create_training_pairs(df):
    training_data = []

    # For each resume category, create positive and negative examples
    for category in df['Category'].unique():
        category_resumes = df[df['Category'] == category]

        for idx, resume_row in category_resumes.iterrows():
            # Positive example: resume matches its own category
            training_data.append({
                'resume_text': resume_row['Resume'],
                'job_category': category,
                'match_label': 1  # Positive match
            })

            # Negative examples: resume vs other categories
            other_categories = df[df['Category'] != category]['Category'].unique()
            for other_cat in np.random.choice(other_categories, size=2, replace=False):
                training_data.append({
                    'resume_text': resume_row['Resume'],
                    'job_category': other_cat,
                    'match_label': 0  # Negative match
                })

    return pd.DataFrame(training_data)

# Create training dataset
training_df = create_training_pairs(df)
print(f"Training data shape: {training_df.shape}")

Training data shape: (2886, 3)


In [17]:
import spacy
import re
from sklearn.feature_extraction.text import TfidfVectorizer

class ResumeFeatureExtractor:
    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        self.tfidf = TfidfVectorizer(max_features=1000, stop_words='english', ngram_range=(1,2))

        # Define skill categories
        self.skills_db = {
            'programming': ['python', 'java', 'javascript', 'c++', 'sql', 'r'],
            'ml_ai': ['machine learning', 'deep learning', 'nlp', 'computer vision', 'tensorflow', 'pytorch'],
            'data': ['pandas', 'numpy', 'matplotlib', 'tableau', 'excel', 'powerbi'],
            'web': ['html', 'css', 'react', 'angular', 'node.js', 'django', 'flask'],
            'cloud': ['aws', 'azure', 'docker', 'kubernetes', 'jenkins'],
            'database': ['mysql', 'postgresql', 'mongodb', 'sqlite', 'elasticsearch']
        }

    def extract_skills(self, text):
        text_lower = text.lower()
        skill_counts = {}

        for category, skills in self.skills_db.items():
            count = sum(1 for skill in skills if skill in text_lower)
            skill_counts[f'{category}_skills'] = count

        return skill_counts

    def extract_experience_indicators(self, text):
        # Look for experience patterns
        experience_patterns = [
            r'(\d+)\+?\s*years?\s*(?:of\s*)?experience',
            r'(\d+)\+?\s*months?\s*(?:of\s*)?experience'
        ]

        years = 0
        for pattern in experience_patterns:
            matches = re.findall(pattern, text.lower())
            if matches:
                years = max(years, int(matches[0]))

        return {'experience_years': years}

    def extract_education_level(self, text):
        education_keywords = {
            'phd': 3, 'doctorate': 3, 'ph.d': 3,
            'master': 2, 'mba': 2, 'ms': 2, 'ma': 2,
            'bachelor': 1, 'bs': 1, 'ba': 1, 'degree': 1
        }

        text_lower = text.lower()
        max_education = 0

        for keyword, level in education_keywords.items():
            if keyword in text_lower:
                max_education = max(max_education, level)

        return {'education_level': max_education}

    def extract_all_features(self, text):
        features = {}

        # Skills
        features.update(self.extract_skills(text))

        # Experience
        features.update(self.extract_experience_indicators(text))

        # Education
        features.update(self.extract_education_level(text))

        # Text statistics
        features['text_length'] = len(text)
        features['word_count'] = len(text.split())

        return features

# Extract features
feature_extractor = ResumeFeatureExtractor()

# Process training data
print("Extracting features...")
training_features = []
for idx, row in training_df.iterrows():
    features = feature_extractor.extract_all_features(row['resume_text'])
    features['job_category'] = row['job_category']
    features['match_label'] = row['match_label']
    training_features.append(features)

features_df = pd.DataFrame(training_features)
print(f"Features extracted: {features_df.shape}")


Extracting features...
Features extracted: (2886, 12)


In [19]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Prepare features and labels
feature_columns = [col for col in features_df.columns if col not in ['job_category', 'match_label']]
X = features_df[feature_columns]
y = features_df['match_label']

# Encode job categories
label_encoder = LabelEncoder()
features_df['job_category_encoded'] = label_encoder.fit_transform(features_df['job_category'])
X['job_category_encoded'] = features_df['job_category_encoded']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train multiple models
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    'SVM': SVC(probability=True, random_state=42)
}

trained_models = {}
model_scores = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    trained_models[name] = model
    model_scores[name] = accuracy

    print(f"{name} Accuracy: {accuracy:.3f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Best model
best_model_name = max(model_scores, key=model_scores.get)
best_model = trained_models[best_model_name]
print(f"\nBest Model: {best_model_name} (Accuracy: {model_scores[best_model_name]:.3f})")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['job_category_encoded'] = features_df['job_category_encoded']



Training Random Forest...
Random Forest Accuracy: 0.950
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.93      0.96       385
           1       0.87      0.99      0.93       193

    accuracy                           0.95       578
   macro avg       0.93      0.96      0.95       578
weighted avg       0.96      0.95      0.95       578


Training Gradient Boosting...
Gradient Boosting Accuracy: 0.792
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.95      0.86       385
           1       0.83      0.47      0.60       193

    accuracy                           0.79       578
   macro avg       0.81      0.71      0.73       578
weighted avg       0.80      0.79      0.77       578


Training Logistic Regression...
Logistic Regression Accuracy: 0.666
Classification Report:
              precision    recall  f1-score   support

           0       0.67      1.

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


SVM Accuracy: 0.666
Classification Report:
              precision    recall  f1-score   support

           0       0.67      1.00      0.80       385
           1       0.00      0.00      0.00       193

    accuracy                           0.67       578
   macro avg       0.33      0.50      0.40       578
weighted avg       0.44      0.67      0.53       578


Best Model: Random Forest (Accuracy: 0.950)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [20]:
from sklearn.metrics.pairwise import cosine_similarity

class TextSimilarityModel:
    def __init__(self):
        self.tfidf = TfidfVectorizer(
            max_features=1000,
            stop_words='english',
            ngram_range=(1, 2),
            min_df=2
        )
        self.fitted = False

    def fit(self, texts):
        self.tfidf.fit(texts)
        self.fitted = True

    def calculate_similarity(self, text1, text2):
        if not self.fitted:
            # Fit on both texts if not fitted
            self.tfidf.fit([text1, text2])

        # Transform texts
        vectors = self.tfidf.transform([text1, text2])

        # Calculate similarity
        similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0]
        return similarity

    def batch_similarity(self, resume_texts, job_descriptions):
        # Fit on all texts
        all_texts = resume_texts + job_descriptions
        self.tfidf.fit(all_texts)

        # Transform
        resume_vectors = self.tfidf.transform(resume_texts)
        job_vectors = self.tfidf.transform(job_descriptions)

        # Calculate similarity matrix
        similarity_matrix = cosine_similarity(resume_vectors, job_vectors)
        return similarity_matrix

# Train text similarity model
text_sim_model = TextSimilarityModel()
all_resume_texts = df['Resume'].tolist()
text_sim_model.fit(all_resume_texts)


In [21]:
class ResumeJobMatcher:
    def __init__(self, classification_model, text_similarity_model, feature_extractor, label_encoder):
        self.classification_model = classification_model
        self.text_similarity_model = text_similarity_model
        self.feature_extractor = feature_extractor
        self.label_encoder = label_encoder

    def match_resume_to_jobs(self, resume_text, job_categories):
        results = []

        for job_category in job_categories:
            # Extract features
            features = self.feature_extractor.extract_all_features(resume_text)

            # Encode job category
            try:
                job_encoded = self.label_encoder.transform([job_category])[0]
            except:
                job_encoded = 0  # Unknown category

            features['job_category_encoded'] = job_encoded

            # Prepare feature vector
            feature_vector = pd.DataFrame([features])
            feature_columns = [col for col in feature_vector.columns if col != 'job_category']
            X = feature_vector[feature_columns]

            # Get match probability
            match_probability = self.classification_model.predict_proba(X)[0][1]

            # Calculate text similarity (using category as proxy for job description)
            text_similarity = self.text_similarity_model.calculate_similarity(
                resume_text,
                job_category  # In real case, this would be job description
            )

            # Combined score
            combined_score = (match_probability * 0.6) + (text_similarity * 0.4)

            results.append({
                'job_category': job_category,
                'match_probability': round(match_probability, 3),
                'text_similarity': round(text_similarity, 3),
                'combined_score': round(combined_score, 3)
            })

        # Sort by combined score
        results.sort(key=lambda x: x['combined_score'], reverse=True)
        return results

# Create matcher
matcher = ResumeJobMatcher(
    best_model,
    text_sim_model,
    feature_extractor,
    label_encoder
)


In [22]:
# Test with a sample resume
sample_resume = df.iloc[0]['Resume']
available_job_categories = df['Category'].unique()[:10]  # Test with top 10 categories

print("Testing Resume Matching...")
print("="*50)

matches = matcher.match_resume_to_jobs(sample_resume, available_job_categories)

print("Top 5 Job Matches:")
for i, match in enumerate(matches[:5], 1):
    print(f"{i}. {match['job_category']}")
    print(f"   Combined Score: {match['combined_score']}")
    print(f"   Match Probability: {match['match_probability']}")
    print(f"   Text Similarity: {match['text_similarity']}")
    print()

# Test accuracy on multiple resumes
print("\nTesting on random resumes...")
test_accuracies = []

for i in range(10):  # Test 10 random resumes
    random_idx = np.random.randint(0, len(df))
    test_resume = df.iloc[random_idx]

    matches = matcher.match_resume_to_jobs(test_resume['Resume'], available_job_categories)

    # Check if correct category is in top 3
    top_3_categories = [match['job_category'] for match in matches[:3]]
    is_correct = test_resume['Category'] in top_3_categories
    test_accuracies.append(is_correct)

    print(f"Resume {i+1}: Correct category in top 3? {is_correct}")

print(f"\nOverall Top-3 Accuracy: {np.mean(test_accuracies):.2f}")


Testing Resume Matching...
Top 5 Job Matches:
1. Data Science
   Combined Score: 0.636
   Match Probability: 0.89
   Text Similarity: 0.254

2. Civil Engineer
   Combined Score: 0.438
   Match Probability: 0.73
   Text Similarity: 0.0

3. Arts
   Combined Score: 0.312
   Match Probability: 0.52
   Text Similarity: 0.0

4. Advocate
   Combined Score: 0.306
   Match Probability: 0.51
   Text Similarity: 0.0

5. Java Developer
   Combined Score: 0.03
   Match Probability: 0.04
   Text Similarity: 0.014


Testing on random resumes...
Resume 1: Correct category in top 3? False
Resume 2: Correct category in top 3? False
Resume 3: Correct category in top 3? False
Resume 4: Correct category in top 3? False
Resume 5: Correct category in top 3? True
Resume 6: Correct category in top 3? True
Resume 7: Correct category in top 3? False
Resume 8: Correct category in top 3? False
Resume 9: Correct category in top 3? False
Resume 10: Correct category in top 3? False

Overall Top-3 Accuracy: 0.20


#new model

In [23]:
import pandas as pd
import numpy as np
import spacy
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from collections import Counter

class EnhancedResumeFeatureExtractor:
    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")

        # Comprehensive skills database
        self.skills_db = {
            'programming': ['python', 'java', 'javascript', 'c++', 'c#', 'sql', 'r', 'scala', 'go', 'rust'],
            'web_dev': ['html', 'css', 'react', 'angular', 'vue', 'node.js', 'django', 'flask', 'express'],
            'data_science': ['pandas', 'numpy', 'matplotlib', 'seaborn', 'plotly', 'tableau', 'powerbi'],
            'ml_ai': ['machine learning', 'deep learning', 'nlp', 'computer vision', 'tensorflow', 'pytorch', 'scikit-learn'],
            'cloud': ['aws', 'azure', 'gcp', 'docker', 'kubernetes', 'jenkins', 'terraform'],
            'databases': ['mysql', 'postgresql', 'mongodb', 'sqlite', 'redis', 'cassandra', 'elasticsearch'],
            'tools': ['git', 'jira', 'confluence', 'slack', 'excel', 'linux', 'windows']
        }

        # Flatten for easy access
        self.all_skills = []
        for category, skills in self.skills_db.items():
            self.all_skills.extend(skills)

    def extract_skills_with_context(self, text):
        """Extract skills with experience context"""
        text_lower = text.lower()
        skill_features = {}

        for category, skills in self.skills_db.items():
            category_count = 0
            total_experience = 0

            for skill in skills:
                if skill in text_lower:
                    category_count += 1

                    # Look for experience patterns near this skill
                    skill_pattern = f".{{0,50}}{skill}.{{0,50}}"
                    skill_context = re.search(skill_pattern, text_lower)

                    if skill_context:
                        context = skill_context.group()
                        exp_match = re.search(r'(\d+)\s*(?:years?|months?)', context)
                        if exp_match:
                            exp_value = int(exp_match.group(1))
                            # Convert months to years
                            if 'month' in context:
                                exp_value = exp_value / 12
                            total_experience += exp_value

            skill_features[f'{category}_count'] = category_count
            skill_features[f'{category}_avg_exp'] = total_experience / max(category_count, 1)

        return skill_features

    def extract_experience_indicators(self, text):
        """Extract total experience and seniority indicators"""
        text_lower = text.lower()

        # Extract years of experience
        experience_patterns = [
            r'(\d+)\+?\s*years?\s*(?:of\s*)?experience',
            r'(\d+)\+?\s*years?\s*(?:in|with)',
            r'experience\s*(?:of\s*)?(\d+)\+?\s*years?'
        ]

        years = []
        for pattern in experience_patterns:
            matches = re.findall(pattern, text_lower)
            years.extend([int(match) for match in matches])

        total_exp = max(years) if years else 0

        # Seniority indicators
        seniority_keywords = {
            'senior': 3, 'lead': 4, 'principal': 5, 'architect': 5,
            'manager': 4, 'director': 5, 'head': 5, 'junior': 1,
            'intern': 0, 'entry': 1, 'associate': 2
        }

        seniority_score = 0
        for keyword, score in seniority_keywords.items():
            if keyword in text_lower:
                seniority_score = max(seniority_score, score)

        return {
            'total_experience': total_exp,
            'seniority_level': seniority_score
        }

    def extract_education_features(self, text):
        """Extract education details"""
        text_lower = text.lower()

        # Education levels
        education_levels = {
            'phd': 4, 'doctorate': 4, 'ph.d': 4,
            'master': 3, 'mba': 3, 'ms': 3, 'ma': 3,
            'bachelor': 2, 'bs': 2, 'ba': 2,
            'associate': 1, 'diploma': 1
        }

        education_level = 0
        for degree, level in education_levels.items():
            if degree in text_lower:
                education_level = max(education_level, level)

        # Technical fields
        tech_fields = ['computer science', 'engineering', 'mathematics', 'statistics', 'data science']
        is_tech_background = any(field in text_lower for field in tech_fields)

        return {
            'education_level': education_level,
            'tech_background': int(is_tech_background)
        }

    def extract_project_indicators(self, text):
        """Extract project and achievement indicators"""
        text_lower = text.lower()

        # Project indicators
        project_keywords = ['project', 'developed', 'built', 'created', 'implemented', 'designed']
        project_count = sum(text_lower.count(keyword) for keyword in project_keywords)

        # Achievement indicators
        achievement_keywords = ['achieved', 'improved', 'increased', 'reduced', 'optimized', 'enhanced']
        achievement_count = sum(text_lower.count(keyword) for keyword in achievement_keywords)

        # Leadership indicators
        leadership_keywords = ['led', 'managed', 'supervised', 'coordinated', 'mentored']
        leadership_count = sum(text_lower.count(keyword) for keyword in leadership_keywords)

        return {
            'project_count': min(project_count, 20),  # Cap at reasonable number
            'achievement_count': min(achievement_count, 10),
            'leadership_count': min(leadership_count, 10)
        }

    def extract_all_features(self, text):
        """Extract comprehensive features"""
        features = {}

        # Skills with context
        features.update(self.extract_skills_with_context(text))

        # Experience
        features.update(self.extract_experience_indicators(text))

        # Education
        features.update(self.extract_education_features(text))

        # Projects and achievements
        features.update(self.extract_project_indicators(text))

        # Text statistics
        features['text_length'] = len(text)
        features['word_count'] = len(text.split())
        features['unique_words'] = len(set(text.lower().split()))

        return features

# Initialize enhanced feature extractor
enhanced_extractor = EnhancedResumeFeatureExtractor()


In [24]:
def create_improved_training_pairs(df, samples_per_resume=3):
    """Create better training pairs with strategic negative sampling"""
    training_data = []

    # Get category similarity mapping
    category_similarity = calculate_category_similarity(df)

    for category in df['Category'].unique():
        category_resumes = df[df['Category'] == category]

        for idx, resume_row in category_resumes.iterrows():
            # Positive example
            training_data.append({
                'resume_text': resume_row['Resume'],
                'job_category': category,
                'match_label': 1
            })

            # Strategic negative examples
            # 1. Most similar category (hard negative)
            similar_categories = category_similarity.get(category, [])
            if similar_categories:
                training_data.append({
                    'resume_text': resume_row['Resume'],
                    'job_category': similar_categories[0],
                    'match_label': 0
                })

            # 2. Random dissimilar category
            other_categories = [cat for cat in df['Category'].unique()
                             if cat != category and cat not in similar_categories[:2]]
            if other_categories:
                random_category = np.random.choice(other_categories)
                training_data.append({
                    'resume_text': resume_row['Resume'],
                    'job_category': random_category,
                    'match_label': 0
                })

    return pd.DataFrame(training_data)

def calculate_category_similarity(df):
    """Calculate which categories are most similar based on resume content"""
    # This is a simplified version - you can make it more sophisticated
    category_keywords = {}

    for category in df['Category'].unique():
        category_resumes = df[df['Category'] == category]['Resume'].str.lower()
        all_text = ' '.join(category_resumes)

        # Extract most common words (simplified similarity)
        words = re.findall(r'\b\w+\b', all_text)
        common_words = [word for word, count in Counter(words).most_common(50)]
        category_keywords[category] = set(common_words)

    # Calculate similarity between categories
    similarity_map = {}
    for cat1 in category_keywords:
        similarities = []
        for cat2 in category_keywords:
            if cat1 != cat2:
                intersection = len(category_keywords[cat1] & category_keywords[cat2])
                union = len(category_keywords[cat1] | category_keywords[cat2])
                similarity = intersection / union if union > 0 else 0
                similarities.append((cat2, similarity))

        # Sort by similarity
        similarities.sort(key=lambda x: x[1], reverse=True)
        similarity_map[cat1] = [cat for cat, sim in similarities]

    return similarity_map


In [26]:
# Create improved training data
print("Creating improved training data...")
improved_training_df = create_improved_training_pairs(df)
print(f"Training data shape: {improved_training_df.shape}")

# Extract enhanced features
print("Extracting enhanced features...")
enhanced_features = []

for idx, row in improved_training_df.iterrows():
    if idx % 100 == 0:
        print(f"Processing {idx}/{len(improved_training_df)}")

    features = enhanced_extractor.extract_all_features(row['resume_text'])
    features['job_category'] = row['job_category']
    features['match_label'] = row['match_label']
    enhanced_features.append(features)

enhanced_features_df = pd.DataFrame(enhanced_features)

# Prepare for training
feature_columns = [col for col in enhanced_features_df.columns
                  if col not in ['job_category', 'match_label']]

# Encode job categories
label_encoder = LabelEncoder()
enhanced_features_df['job_category_encoded'] = label_encoder.fit_transform(enhanced_features_df['job_category'])

X = enhanced_features_df[feature_columns + ['job_category_encoded']]
y = enhanced_features_df['match_label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train improved model
print("Training enhanced model...")
enhanced_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    class_weight='balanced'
)

enhanced_model.fit(X_train, y_train)

# Evaluate
y_pred = enhanced_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Enhanced Model Accuracy: {accuracy:.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance
feature_importance = (
    pd.DataFrame({
        'feature': X.columns,
        'importance': enhanced_model.feature_importances_
    })
    .sort_values('importance', ascending=False)   # use ascending=False
)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))


Creating improved training data...
Training data shape: (2886, 3)
Extracting enhanced features...
Processing 0/2886
Processing 100/2886
Processing 200/2886
Processing 300/2886
Processing 400/2886
Processing 500/2886
Processing 600/2886
Processing 700/2886
Processing 800/2886
Processing 900/2886
Processing 1000/2886
Processing 1100/2886
Processing 1200/2886
Processing 1300/2886
Processing 1400/2886
Processing 1500/2886
Processing 1600/2886
Processing 1700/2886
Processing 1800/2886
Processing 1900/2886
Processing 2000/2886
Processing 2100/2886
Processing 2200/2886
Processing 2300/2886
Processing 2400/2886
Processing 2500/2886
Processing 2600/2886
Processing 2700/2886
Processing 2800/2886
Training enhanced model...
Enhanced Model Accuracy: 0.910

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.88      0.93       385
           1       0.80      0.97      0.88       193

    accuracy                           0.91       578
   ma

In [28]:
def test_enhanced_model(model, feature_extractor, label_encoder, df, n_tests=20):
    correct_predictions = 0
    detailed_results = []

    print("Testing Enhanced Model...")
    print("="*50)

    for i in range(n_tests):
        # Random resume
        random_idx = np.random.randint(0, len(df))
        test_resume = df.iloc[random_idx]

        # Get all unique categories
        all_categories = df['Category'].unique()

        # Calculate scores for all categories
        category_scores = []

        for category in all_categories:
            # Extract features
            features = feature_extractor.extract_all_features(test_resume['Resume'])

            # Encode category
            try:
                cat_encoded = label_encoder.transform([category])[0]
            except ValueError:
                print(f"Warning: Unknown category {category}, skipping...")
                continue

            features['job_category_encoded'] = cat_encoded

            # Prepare feature vector
            feature_vector = pd.DataFrame([features])

            # Handle missing columns by filling with zeros
            missing_cols = set(X.columns) - set(feature_vector.columns)
            for col in missing_cols:
                feature_vector[col] = 0

            # Reorder columns to match training data
            X_test_single = feature_vector[X.columns]

            # Get prediction probability
            try:
                prob = model.predict_proba(X_test_single)[0][1]
                category_scores.append((category, prob))
            except Exception as e:
                print(f"Error predicting for category {category}: {e}")
                continue

        # Sort by score
        category_scores.sort(key=lambda x: x[1], reverse=True)

        # Check if correct category is in top 3
        top_3_categories = [cat for cat, score in category_scores[:3]]
        top_3_scores = [score for cat, score in category_scores[:3]]

        is_correct = test_resume['Category'] in top_3_categories

        if is_correct:
            correct_predictions += 1

        # Store detailed results
        result = {
            'resume_idx': random_idx,
            'true_category': test_resume['Category'],
            'top_3_predictions': list(zip(top_3_categories, top_3_scores)),
            'is_correct': is_correct
        }
        detailed_results.append(result)

        print(f"Resume {i+1}: {test_resume['Category']} -> Top 3: {top_3_categories} | Correct: {is_correct}")

    accuracy = correct_predictions / n_tests
    print(f"\nEnhanced Model Top-3 Accuracy: {accuracy:.2f} ({correct_predictions}/{n_tests})")

    return accuracy, detailed_results

# Now run the test
enhanced_accuracy, results = test_enhanced_model(enhanced_model, enhanced_extractor, label_encoder, df)


Testing Enhanced Model...
Resume 1: Database -> Top 3: ['Database', 'DevOps Engineer', 'ETL Developer'] | Correct: True
Resume 2: SAP Developer -> Top 3: ['SAP Developer', 'Python Developer', 'PMO'] | Correct: True
Resume 3: Web Designing -> Top 3: ['Web Designing', 'Testing', 'Python Developer'] | Correct: True
Resume 4: SAP Developer -> Top 3: ['SAP Developer', 'Sales', 'Python Developer'] | Correct: True
Resume 5: Operations Manager -> Top 3: ['Operations Manager', 'PMO', 'Python Developer'] | Correct: True
Resume 6: Automation Testing -> Top 3: ['Automation Testing', 'Blockchain', 'Arts'] | Correct: True
Resume 7: ETL Developer -> Top 3: ['ETL Developer', 'Electrical Engineering', 'DotNet Developer'] | Correct: True
Resume 8: Blockchain -> Top 3: ['Blockchain', 'Business Analyst', 'Civil Engineer'] | Correct: True
Resume 9: Business Analyst -> Top 3: ['HR', 'Business Analyst', 'Hadoop'] | Correct: True
Resume 10: Python Developer -> Top 3: ['Python Developer', 'SAP Developer', 'PMO

In [29]:
def test_enhanced_model_top1(model, feature_extractor, label_encoder, df, n_tests=20):
    correct_predictions = 0
    detailed_results = []

    print("Testing Enhanced Model (Top-1 Accuracy)...")
    print("="*50)

    for i in range(n_tests):
        # Random resume
        random_idx = np.random.randint(0, len(df))
        test_resume = df.iloc[random_idx]

        # Get all unique categories
        all_categories = df['Category'].unique()

        # Calculate scores for all categories
        category_scores = []

        for category in all_categories:
            # Extract features
            features = feature_extractor.extract_all_features(test_resume['Resume'])

            # Encode category
            try:
                cat_encoded = label_encoder.transform([category])[0]
            except ValueError:
                print(f"Warning: Unknown category {category}, skipping...")
                continue

            features['job_category_encoded'] = cat_encoded

            # Prepare feature vector
            feature_vector = pd.DataFrame([features])

            # Handle missing columns by filling with zeros
            missing_cols = set(X.columns) - set(feature_vector.columns)
            for col in missing_cols:
                feature_vector[col] = 0

            # Reorder columns to match training data
            X_test_single = feature_vector[X.columns]

            # Get prediction probability
            try:
                prob = model.predict_proba(X_test_single)[0][1]
                category_scores.append((category, prob))
            except Exception as e:
                print(f"Error predicting for category {category}: {e}")
                continue

        # Sort by score
        category_scores.sort(key=lambda x: x[1], reverse=True)

        # Check if correct category is TOP 1 (best prediction)
        top_1_category = category_scores[0][0]  # Only the best prediction
        top_1_score = category_scores[0][1]

        is_correct = test_resume['Category'] == top_1_category  # Exact match only

        if is_correct:
            correct_predictions += 1

        # Store detailed results
        result = {
            'resume_idx': random_idx,
            'true_category': test_resume['Category'],
            'predicted_category': top_1_category,
            'prediction_score': top_1_score,
            'is_correct': is_correct
        }
        detailed_results.append(result)

        print(f"Resume {i+1}: True: '{test_resume['Category']}' | Predicted: '{top_1_category}' | Score: {top_1_score:.3f} | Correct: {is_correct}")

    accuracy = correct_predictions / n_tests
    print(f"\nEnhanced Model Top-1 Accuracy: {accuracy:.2f} ({correct_predictions}/{n_tests})")

    # Show failed predictions for analysis
    print("\nFailed Predictions:")
    for result in detailed_results:
        if not result['is_correct']:
            print(f"  True: {result['true_category']} | Predicted: {result['predicted_category']} | Score: {result['prediction_score']:.3f}")

    return accuracy, detailed_results

# Run the top-1 test
top1_accuracy, top1_results = test_enhanced_model_top1(enhanced_model, enhanced_extractor, label_encoder, df)


Testing Enhanced Model (Top-1 Accuracy)...
Resume 1: True: 'Mechanical Engineer' | Predicted: 'Mechanical Engineer' | Score: 0.791 | Correct: True
Resume 2: True: 'SAP Developer' | Predicted: 'SAP Developer' | Score: 0.770 | Correct: True
Resume 3: True: 'Civil Engineer' | Predicted: 'Civil Engineer' | Score: 0.593 | Correct: True
Resume 4: True: 'Automation Testing' | Predicted: 'Automation Testing' | Score: 0.556 | Correct: True
Resume 5: True: 'Business Analyst' | Predicted: 'Business Analyst' | Score: 0.884 | Correct: True
Resume 6: True: 'Data Science' | Predicted: 'Data Science' | Score: 0.701 | Correct: True
Resume 7: True: 'Blockchain' | Predicted: 'Blockchain' | Score: 0.819 | Correct: True
Resume 8: True: 'Hadoop' | Predicted: 'Hadoop' | Score: 0.880 | Correct: True
Resume 9: True: 'Electrical Engineering' | Predicted: 'Electrical Engineering' | Score: 0.773 | Correct: True
Resume 10: True: 'Civil Engineer' | Predicted: 'Civil Engineer' | Score: 0.588 | Correct: True
Resume 1

In [31]:
!pip install PyPDF2


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [32]:
import PyPDF2
import pandas as pd

# Function to extract text from your PDF
def extract_pdf_text(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text

# Test your resume
def test_my_resume(pdf_path):
    # Extract text from your PDF
    my_resume_text = extract_pdf_text(pdf_path)

    print("Extracted Text (first 500 chars):")
    print(my_resume_text[:500])
    print("\n" + "="*50)

    # Use the enhanced model we trained
    all_categories = df['Category'].unique()
    category_scores = []

    for category in all_categories:
        # Extract features using our enhanced feature extractor
        features = enhanced_extractor.extract_all_features(my_resume_text)

        # Encode category
        try:
            cat_encoded = label_encoder.transform([category])[0]
        except ValueError:
            continue

        features['job_category_encoded'] = cat_encoded

        # Prepare feature vector
        feature_vector = pd.DataFrame([features])

        # Handle missing columns
        missing_cols = set(X.columns) - set(feature_vector.columns)
        for col in missing_cols:
            feature_vector[col] = 0

        # Predict
        X_test_single = feature_vector[X.columns]
        prob = enhanced_model.predict_proba(X_test_single)[0][1]
        category_scores.append((category, prob))

    # Sort by score
    category_scores.sort(key=lambda x: x[1], reverse=True)

    print("🎯 TOP 5 RECOMMENDED ROLES FOR YOUR RESUME:")
    print("="*50)

    for i, (category, score) in enumerate(category_scores[:5], 1):
        confidence = "🟢 High" if score > 0.7 else "🟡 Medium" if score > 0.5 else "🔴 Low"
        print(f"{i}. {category}")
        print(f"   Score: {score:.3f} ({score*100:.1f}% match)")
        print(f"   Confidence: {confidence}")
        print()

    # Extract your skills for analysis
    extracted_skills = enhanced_extractor.extract_all_features(my_resume_text)

    print("📋 YOUR EXTRACTED SKILLS:")
    print("="*30)
    for skill_type, count in extracted_skills.items():
        if 'count' in skill_type and count > 0:
            print(f"{skill_type.replace('_count', '').title()}: {count} skills")

    return category_scores, extracted_skills

# Run this with your PDF path
# category_scores, my_skills = test_my_resume("path/to/your/resume.pdf")


In [34]:
# Run this in Google Colab
from google.colab import files
import PyPDF2

# 1. Upload your PDF
print("Click 'Choose Files' and select your resume PDF:")
uploaded = files.upload()

# 2. Get filename and extract text
pdf_filename = list(uploaded.keys())[0]

def extract_pdf_text(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text

# 3. Test your resume
my_resume_text = extract_pdf_text(pdf_filename)
print("Extracted text preview:")
print(my_resume_text[:500])

# 4. Get your job matches
category_scores, extracted_skills = test_my_resume(pdf_filename)


Click 'Choose Files' and select your resume PDF:


Saving info_edge_data_sciecne.pdf to info_edge_data_sciecne.pdf
Extracted text preview:
Saksham Vinod Kurai +91-8275321578
BTech, Mechanical Engineering sakshamkurai17@gmail.com
Indian Institute Of Technology, Jodhpur GitHub |LinkedIn
Education
Degree/Certificate Institute/Board CGPA/Percentage Year
B.Tech. (Mechanical) Indian Institute of Technology, Jodhpur 8.07 (Current) 2023-Present
H.S.C. (CS Major) Maharashtra Board 82.5% 2023
S.S.C. Maharashtra Board 97.6% 2021
Projects
•Cell Detection Using CNN Aug 2024 – Nov 2024
Course Project , Tools: CNN, Python GitHub
–Designed a CNN t
Extracted Text (first 500 chars):
Saksham Vinod Kurai +91-8275321578
BTech, Mechanical Engineering sakshamkurai17@gmail.com
Indian Institute Of Technology, Jodhpur GitHub |LinkedIn
Education
Degree/Certificate Institute/Board CGPA/Percentage Year
B.Tech. (Mechanical) Indian Institute of Technology, Jodhpur 8.07 (Current) 2023-Present
H.S.C. (CS Major) Maharashtra Board 82.5% 2023
S.S.C. Maharashtra Board 97.