In [None]:
# 1) Setup
!pip -q install scikit-learn pandas joblib

In [None]:
# 2) Upload the CSV (choose UpdatedResumeDataSet.csv in the dialog)
# The cell output shows the file being saved under its own name in the working
# directory; the training cell later reads it via the relative path in `CSV`.
from google.colab import files
# `uploaded` keeps whatever files.upload() returns; no later visible cell reads it.
uploaded = files.upload()

Saving UpdatedResumeDataSet.csv to UpdatedResumeDataSet.csv


In [None]:
# Force Colab to use sklearn 1.7.2
# The version is pinned because main() stores sklearn.__version__ next to the
# saved model.
# NOTE(review): if scikit-learn was already imported in this kernel before the
# upgrade, a runtime restart is presumably needed for 1.7.2 to load - confirm.
!pip install --upgrade scikit-learn==1.7.2
# This import also binds the bare `sklearn` name that main() reads for
# `sklearn.__version__`.
import sklearn
print("✅ Using scikit-learn version:", sklearn.__version__)


Collecting scikit-learn==1.7.2
  Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
Successfully installed scikit-learn-1.7.2
✅ Using scikit-learn version: 1.7.2


In [None]:
import re

import joblib
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline

# Configuration
# Everything a reader might tune lives here; the functions below read these
# module-level names instead of hard-coding values.
CSV = "UpdatedResumeDataSet.csv"  # Input file, relative to the working directory
TEXT_COL = "Resume"               # Column holding the resume text
LABEL_COL = "Category"            # Column holding the class label
MIN_SAMPLES_PER_CLASS = 4  # Minimum samples required per class
TEST_SIZE = 0.2                   # Passed as test_size to train_test_split
RANDOM_STATE = 42                 # Seed shared by the split and the classifier

def clean(t):
    """Normalize raw resume text while keeping case and punctuation.

    The value is coerced to ``str``; HTML tags and URLs are each replaced by a
    single space; finally every run of whitespace is collapsed to one space
    and leading/trailing whitespace is removed.
    """
    text = str(t)
    # Blank out markup first, then links; whitespace is tidied up afterwards
    # so the spaces inserted here never survive as double spaces.
    for noise_pattern in (r"<[^>]+>", r"http\S+|www\.\S+"):
        text = re.sub(noise_pattern, " ", text)
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

def preprocess_data(df, text_col, label_col, min_samples=MIN_SAMPLES_PER_CLASS):
    """Clean the text column and drop classes that have too few samples.

    Shapes and class distributions are printed along the way.

    Args:
        df: DataFrame holding at least ``text_col`` and ``label_col``. It is
            not modified; all work happens on a copy.
        text_col: Name of the column with the raw text to clean.
        label_col: Name of the column with the class labels.
        min_samples: Minimum number of rows a class needs in order to be kept.

    Returns:
        A new DataFrame with cleaned text that contains only rows whose label
        occurs at least ``min_samples`` times, with a reset index.
    """
    print("Original data shape:", df.shape)

    # Work on a copy: assigning into the caller's frame would silently change
    # their data, and can trigger SettingWithCopyWarning when `df` is itself a
    # selection from another frame.
    df = df.copy()

    # Clean text
    df[text_col] = df[text_col].astype(str).map(clean)

    # Note: NOT removing duplicates because dataset contains intentional
    # duplicate resumes across categories for data augmentation
    print("After cleaning (duplicates kept):", df.shape)

    # Filter classes with sufficient samples
    class_counts = df[label_col].value_counts()
    print(f"\nOriginal class distribution:\n{class_counts}")

    valid_classes = class_counts[class_counts >= min_samples].index
    df_filtered = df[df[label_col].isin(valid_classes)].reset_index(drop=True)

    # Report what was dropped so a shrinking label set never goes unnoticed.
    removed_classes = set(class_counts.index) - set(valid_classes)
    if removed_classes:
        print(f"\nRemoved classes with < {min_samples} samples: {removed_classes}")

    print(f"\nFinal class distribution:\n{df_filtered[label_col].value_counts()}")
    print("Final data shape:", df_filtered.shape)

    return df_filtered

def create_model_pipeline():
    """Build the (unfitted) TF-IDF + logistic-regression text classifier.

    Returns:
        A ``Pipeline`` with two steps: ``'tfidf'`` followed by ``'classifier'``.
    """
    # Vocabulary: lower-cased unigrams and bigrams without English stop words,
    # capped at 5000 features; terms seen in fewer than 2 documents or in more
    # than 95% of them are ignored.
    vectorizer = TfidfVectorizer(
        lowercase=True,
        stop_words='english',
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.95,
        max_features=5000,
    )

    # Balanced class weights compensate for the uneven class sizes; the seed
    # comes from the shared configuration.
    classifier = LogisticRegression(
        C=1.0,
        class_weight='balanced',
        max_iter=1000,
        random_state=RANDOM_STATE,
    )

    return Pipeline(steps=[('tfidf', vectorizer), ('classifier', classifier)])

def evaluate_model(pipeline, X_train, X_test, y_train, y_test):
    """Fit ``pipeline`` on the training data and print its evaluation.

    Prints the held-out accuracy, a 5-fold cross-validation accuracy computed
    on the training data only, and a per-class classification report for the
    test set.

    Args:
        pipeline: Estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Training inputs and labels.
        X_test, y_test: Held-out inputs and labels.

    Returns:
        Tuple ``(pipeline, accuracy)``: the same pipeline object after fitting
        and its accuracy on the test set.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("TRAINING AND EVALUATION")
    print(banner)

    # Fit on the training portion only.
    print("Training model...")
    pipeline.fit(X_train, y_train)

    # Score the held-out portion.
    predictions = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"\nTest Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

    # Cross-validate on the training data; the spread shown is two standard
    # deviations of the fold scores.
    print("\nPerforming 5-fold cross-validation...")
    fold_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
    cv_mean = fold_scores.mean()
    cv_spread = fold_scores.std() * 2
    print(f"CV Accuracy: {cv_mean:.4f} ± {cv_spread:.4f}")

    # Per-class breakdown for the test set.
    print("\nDetailed Classification Report:")
    print("-" * 60)
    report = classification_report(y_test, predictions, zero_division=0)
    print(report)

    return pipeline, accuracy

def main():
    """Run the whole training workflow: load, preprocess, split, train, save.

    Reads ``CSV``, keeps the ``TEXT_COL``/``LABEL_COL`` columns and drops rows
    with missing values, preprocesses the data, trains and evaluates the
    pipeline, and finally writes ``resume_classifier_v2.pkl``. The saved
    object is a dict with the keys ``"model"`` (the fitted pipeline) and
    ``"sklearn_version"``.

    Returns:
        The fitted pipeline, or ``None`` when the CSV or a required column is
        missing, or when fewer than 20 rows remain after preprocessing.
    """
    print("Resume Classification with Improved Pipeline")
    print("=" * 60)

    # Load data
    try:
        df = pd.read_csv(CSV)[[TEXT_COL, LABEL_COL]].dropna()
        print(f"Loaded {len(df)} samples from {CSV}")
    except FileNotFoundError:
        print(f"Error: Could not find {CSV}")
        return
    except KeyError as e:
        print(f"Error: Column not found - {e}")
        return

    # Preprocess data
    df_processed = preprocess_data(df, TEXT_COL, LABEL_COL)

    if len(df_processed) < 20:
        print("Error: Not enough data after preprocessing")
        return

    # Split data, stratified by label when possible.
    texts = df_processed[TEXT_COL]
    labels = df_processed[LABEL_COL]
    split_options = {"test_size": TEST_SIZE, "random_state": RANDOM_STATE}
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            texts, labels, stratify=labels, **split_options
        )
    except ValueError as e:
        # Stratification was rejected for this label distribution; fall back
        # to a plain random split rather than aborting.
        print(f"Stratification failed: {e}")
        print("Using regular train-test split...")
        X_train, X_test, y_train, y_test = train_test_split(
            texts, labels, **split_options
        )
    # Report the split sizes whichever path produced the split.
    print(f"\nTrain set: {len(X_train)} samples")
    print(f"Test set: {len(X_test)} samples")

    # Create and evaluate model
    pipeline = create_model_pipeline()
    trained_pipeline, final_accuracy = evaluate_model(pipeline, X_train, X_test, y_train, y_test)

    print("\n" + "=" * 60)
    print("FINAL RESULTS")
    print("=" * 60)
    print(f"Final Test Accuracy: {final_accuracy*100:.2f}%")
    print(f"Classes processed: {df_processed[LABEL_COL].nunique()}")
    print(f"Total samples: {len(df_processed)}")

    # Save the trained model together with the scikit-learn version it was
    # fitted with, so a version mismatch can be detected at load time.
    model_filename = "resume_classifier_v2.pkl"
    metadata = {
        "model": trained_pipeline,
        "sklearn_version": sklearn.__version__
    }
    joblib.dump(metadata, model_filename)

    print(f"\n✅ Model saved to: {model_filename} (Scikit-learn {sklearn.__version__})")

    # The file holds the metadata dict, not the bare pipeline, so the usage
    # hint must unwrap the "model" entry before calling predict().
    print("\nTo use in your app:")
    print("  import joblib")
    print(f"  data = joblib.load('{model_filename}')")
    print("  model = data['model']  # the file stores a dict, not the bare pipeline")
    print("  category = model.predict([resume_text])[0]")

    return trained_pipeline

if __name__ == "__main__":
    # Keep the fitted pipeline under the name `model`: the metrics cells
    # further down call `model.predict(...)`. main() returns None when
    # training could not run.
    model = main()

Resume Classification with Improved Pipeline
Loaded 962 samples from UpdatedResumeDataSet.csv
Original data shape: (962, 2)
After cleaning (duplicates kept): (962, 2)

Original class distribution:
Category
Java Developer               84
Testing                      70
DevOps Engineer              55
Python Developer             48
Web Designing                45
HR                           44
Hadoop                       42
Sales                        40
Data Science                 40
Mechanical Engineer          40
ETL Developer                40
Blockchain                   40
Operations Manager           40
Arts                         36
Database                     33
Health and fitness           30
PMO                          30
Electrical Engineering       30
Business Analyst             28
DotNet Developer             28
Automation Testing           26
Network Security Engineer    25
Civil Engineer               24
SAP Developer                24
Advocate                  

In [None]:
# Download the model
# The filename must match the `model_filename` that main() writes.
from google.colab import files
files.download("resume_classifier_v2.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Performance Metrics

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Rebuild the evaluation split the same way main() builds its split: same
# columns, same dropna, same cleaning/class filtering, same test size, seed and
# stratification. Re-reading the raw CSV with hard-coded values would not
# guarantee the same held-out rows, and would feed the model uncleaned text.
# NOTE(review): preprocess_data keeps duplicate resumes, so identical texts may
# land in both train and test; treat the scores below as optimistic until that
# is checked.
df = preprocess_data(
    pd.read_csv(CSV)[[TEXT_COL, LABEL_COL]].dropna(), TEXT_COL, LABEL_COL
)
X_train, X_test, y_train, y_test = train_test_split(
    df[TEXT_COL],
    df[LABEL_COL],
    test_size=TEST_SIZE,
    stratify=df[LABEL_COL],
    random_state=RANDOM_STATE,
)


In [None]:
# Load the pipeline from the artifact written by main() instead of relying on
# a `model` variable that happens to be alive in the kernel. The file stores a
# dict, so the pipeline is unwrapped from its "model" entry.
# joblib files are pickles: only load artifacts produced by this notebook.
model = joblib.load("resume_classifier_v2.pkl")["model"]
y_pred = model.predict(X_test)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# Accuracy needs no averaging; precision, recall and F1 are all computed with
# the same weighted averaging and zero_division=0 settings.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    score_fn(y_test, y_pred, average='weighted', zero_division=0)
    for score_fn in (precision_score, recall_score, f1_score)
)

# Headline numbers, rounded to three decimals for display only.
print("Accuracy:", round(accuracy, 3))
print("Precision:", round(precision, 3))
print("Recall:", round(recall, 3))
print("F1 Score:", round(f1, 3))


Accuracy: 0.995
Precision: 0.996
Recall: 0.995
F1 Score: 0.995


In [None]:
# zero_division=0 matches the other metric calls in this notebook, so a class
# without predictions is reported as 0 instead of raising a warning.
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))
# Rows are true classes and columns are predicted classes, in sorted label order.
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))



Classification Report:
                            precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00         4
                     Arts       1.00      1.00      1.00         7
       Automation Testing       0.83      1.00      0.91         5
               Blockchain       1.00      1.00      1.00         8
         Business Analyst       1.00      1.00      1.00         6
           Civil Engineer       1.00      1.00      1.00         5
             Data Science       1.00      1.00      1.00         8
                 Database       1.00      1.00      1.00         7
          DevOps Engineer       1.00      0.91      0.95        11
         DotNet Developer       1.00      1.00      1.00         5
            ETL Developer       1.00      1.00      1.00         8
   Electrical Engineering       1.00      1.00      1.00         6
                       HR       1.00      1.00      1.00         9
                   Hadoop       1.00