# Email Classification 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import re

# Load your preprocessed data
df = pd.read_csv("aws_training_data.csv")

print("=== RECRUITMENT EMAIL DATASET ===")
print(f"Total emails: {len(df)}")
print(f"Available columns: {list(df.columns)}")

=== RECRUITMENT EMAIL DATASET ===
Total emails: 1575
Available columns: ['job_application', 'Application for Data Analyst – Amit Sharma\r\n\r\nHi HR Team,\r\n\r\nMy name is Amit Sharma, and I am currently working as a Data Analyst at Quantiva Systems. I have over 4 years of experience in data visualization, SQL-based analytics, and building dashboards using Power BI and Tableau.\r\n\r\nI came across your opening for a Data Analyst, and I believe my background aligns strongly with the role. I`ve handled several reporting automation initiatives and worked closely with cross-functional teams to deliver insights that improved operational decision-making.\r\n\r\nPlease find my attached résumé for your review. I would be happy to discuss how I can contribute.\r\n\r\nRegards,\r\nAmit Sharma\r\nEmail: amit.sharma92@gmail.com\r\n\r\n\r\nPhone: +91-98234-10020']


In [2]:
df.describe()


Unnamed: 0,job_application,"Application for Data Analyst – Amit Sharma\r\n\r\nHi HR Team,\r\n\r\nMy name is Amit Sharma, and I am currently working as a Data Analyst at Quantiva Systems. I have over 4 years of experience in data visualization, SQL-based analytics, and building dashboards using Power BI and Tableau.\r\n\r\nI came across your opening for a Data Analyst, and I believe my background aligns strongly with the role. I`ve handled several reporting automation initiatives and worked closely with cross-functional teams to deliver insights that improved operational decision-making.\r\n\r\nPlease find my attached résumé for your review. I would be happy to discuss how I can contribute.\r\n\r\nRegards,\r\nAmit Sharma\r\nEmail: amit.sharma92@gmail.com\r\n\r\n\r\nPhone: +91-98234-10020"
count,1575,1575
unique,5,1526
top,new_requisition,Subject: You’re Selected – UI Designer at Pixe...
freq,516,4


In [3]:
df['category'].value_counts()


KeyError: 'category'

In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pickle
from sklearn.utils.class_weight import compute_class_weight
from collections import Counter

  if not hasattr(np, "object"):


In [5]:
class ATSEmailClassifier:
    def __init__(self, max_features=8000, max_length=400):
        self.max_features = max_features
        self.max_length = max_length
        self.tokenizer = Tokenizer(num_words=max_features, oov_token="<OOV>")
        self.label_encoder = LabelEncoder()
        self.model = None
        self.num_classes = None

In [6]:
    def prepare_data(self, texts, labels):
        """Prepare data for training"""
        # Fit tokenizer
        self.tokenizer.fit_on_texts(texts)

        # Convert texts to sequences
        sequences = self.tokenizer.texts_to_sequences(texts)
        X = pad_sequences(sequences, maxlen=self.max_length)

        # Encode labels
        y = self.label_encoder.fit_transform(labels)
        self.num_classes = len(np.unique(y))

        return X, y

In [7]:
    def build_model(self):
        """Build the neural network model"""
        model = Sequential([
            Embedding(self.max_features, 128, input_length=self.max_length),
            GlobalMaxPooling1D(),
            Dense(128, activation='relu'),
            Dropout(0.5),
            Dense(64, activation='relu'),
            Dropout(0.3),
            Dense(self.num_classes, activation='softmax')
        ])

        model.compile(
            optimizer='adam',
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        self.model = model
        return model

In [8]:
    def train(self, texts, labels, validation_split=0.2, epochs=25, batch_size=16):
        """Train the model"""
        X, y = self.prepare_data(texts, labels)

        # Build model
        self.build_model()

        # Compute class weights to handle imbalance
        class_weights = compute_class_weight(
            'balanced',
            classes=np.unique(y),
            y=y
        )
        class_weight_dict = dict(enumerate(class_weights))

        print(f"Training model with {self.num_classes} classes...")
        print(f"Vocabulary size: {len(self.tokenizer.word_index)}")
        print(f"Sample distribution: {Counter(y)}")

        history = self.model.fit(
            X, y,
            validation_split=validation_split,
            epochs=epochs,
            batch_size=batch_size,
            class_weight=class_weight_dict,
            verbose=1
        )

        return history

In [10]:
    def predict(self, texts):
        """Predict email categories"""
        sequences = self.tokenizer.texts_to_sequences(texts)
        X = pad_sequences(sequences, maxlen=self.max_length)

        predictions = self.model.predict(X)
        predicted_classes = np.argmax(predictions, axis=1)

        return self.label_encoder.inverse_transform(predicted_classes)

    def predict_proba(self, texts):
        """Get prediction probabilities"""
        sequences = self.tokenizer.texts_to_sequences(texts)
        X = pad_sequences(sequences, maxlen=self.max_length)

        predictions = self.model.predict(X)
        return predictions

In [11]:
    def evaluate(self, texts, labels):
        """Evaluate model performance"""
        sequences = self.tokenizer.texts_to_sequences(texts)
        X = pad_sequences(sequences, maxlen=self.max_length)
        y = self.label_encoder.transform(labels)

        predictions = self.model.predict(X)
        predicted_classes = np.argmax(predictions, axis=1)

        print("Classification Report:")
        print(classification_report(y, predicted_classes))

        # Confusion Matrix
        cm = confusion_matrix(y, predicted_classes)
        plt.figure(figsize=(12, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                   xticklabels=self.label_encoder.classes_,
                   yticklabels=self.label_encoder.classes_)
        plt.title('Confusion Matrix')
        plt.ylabel('Actual')
        plt.xlabel('Predicted')
        plt.xticks(rotation=45)
        plt.yticks(rotation=0)
        plt.tight_layout()
        plt.show()

        return classification_report(y, predicted_classes, output_dict=True)

In [12]:
    def save_model(self, filepath):
        """Save the trained model"""
        self.model.save(f"{filepath}_model.h5")
        with open(f"{filepath}_tokenizer.pkl", 'wb') as f:
            pickle.dump(self.tokenizer, f)
        with open(f"{filepath}_label_encoder.pkl", 'wb') as f:
            pickle.dump(self.label_encoder, f)

    def load_model(self, filepath):
        """Load a trained model"""
        self.model = tf.keras.models.load_model(f"{filepath}_model.h5")
        with open(f"{filepath}_tokenizer.pkl", 'rb') as f:
            self.tokenizer = pickle.load(f)
        with open(f"{filepath}_label_encoder.pkl", 'rb') as f:
            self.label_encoder = pickle.load(f)

In [13]:
def categorize_emails(df):
    """Categorize emails based on content patterns with fallback"""
    def classify_email(text):
        if pd.isna(text):
            return 'other'

        text_lower = str(text).lower()

        # Define keyword patterns for each category
        patterns = {
            'candidate_application': ['application', 'applied', 'resume', 'cv', 'portfolio', 'candidate profile'],
            'candidate_availability': ['availability', 'schedule', 'time', 'when', 'flexible', 'available'],
            'candidate_response': ['response', 'reply', 'answering', 'responding', 'acknowledging'],
            'interview_scheduling': ['interview', 'meeting', 'schedule', 'set up', 'arrange', 'book', 'appointment'],
            'interview_feedback': ['feedback', 'review', 'after', 'post', 'interview feedback'],
            'interview_confirmation': ['confirm', 'confirmed', 'yes', 'ok', 'accepted', 'confirmed interview'],
            'offer_sent': ['offer', 'salary', 'package', 'compensation', 'proposed', 'position offer'],
            'offer_response': ['accept', 'accepted', 'decline', 'declined', 'negotiate', 'negotiation'],
            'client_submission': ['submit', 'send', 'forward', 'present', 'candidate submission'],
            'client_feedback': ['feedback', 'review', 'comment', 'client feedback', 'client response'],
            'new_requisition': ['new', 'open', 'requirement', 'job requisition', 'position opening'],
            'onboarding': ['onboarding', 'documents', 'start date', 'joining', 'orientation'],
            'internal_coordination': ['internal', 'team', 'colleagues', 'coordinator', 'internal email'],
            'system_notification': ['system', 'automated', 'notification', 'email notification'],
            'general_inquiry': ['hello', 'hi', 'greeting', 'inquiry', 'question', 'help'],
            'spam': ['spam', 'unsubscribe', 'advertisement', 'promotion', 'marketing'],
        }

        # Check each pattern
        for category, keywords in patterns.items():
            if any(keyword in text_lower for keyword in keywords):
                return category

        # Default to general inquiry if no specific pattern matches
        return 'general_inquiry'

    # Apply classification
    df['category'] = df['full_preprocessed'].apply(classify_email)
    return df

In [15]:
def filter_rare_categories(df, min_samples=5):
    """Filter out categories with very few samples"""
    category_counts = df['category'].value_counts()
    valid_categories = category_counts[category_counts >= min_samples].index
    filtered_df = df[df['category'].isin(valid_categories)].copy()

    print(f"Original categories: {len(category_counts)}")
    print(f"Valid categories (≥{min_samples} samples): {len(valid_categories)}")
    print(f"Samples kept: {len(filtered_df)} out of {len(df)}")
    print("\nCategory distribution after filtering:")
    print(filtered_df['category'].value_counts())

    return filtered_df

In [16]:
def train_classifier(df):
    """Train the ATS email classifier"""
    # Categorize emails
    df = categorize_emails(df)

    # Filter out rare categories
    df_filtered = filter_rare_categories(df, min_samples=3)

    if len(df_filtered) < 10:  # Need minimum samples for training
        print("Not enough data after filtering. Using simplified approach...")
        # Fallback: group rare categories
        category_counts = df['category'].value_counts()
        threshold = 5
        df['category'] = df['category'].apply(
            lambda x: 'other' if category_counts[x] < threshold else x
        )
        df_filtered = df[df['category'] != 'other']  # Remove 'other' for training

        if len(df_filtered) < 10:
            df_filtered = df  # Use all data if still too small

    # Prepare data
    texts = df_filtered['full_preprocessed'].fillna('').tolist()
    labels = df_filtered['category'].tolist()

    # Check if we have enough samples for each class
    unique_labels, counts = np.unique(labels, return_counts=True)
    min_count = min(counts)

    if min_count < 2:
        print("Some classes have only 1 sample. Using stratified split with validation split = 0.1")
        validation_split = 0.1
    else:
        validation_split = 0.2 if len(texts) > 50 else 0.1

    # Split data ensuring minimum samples per class
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            texts, labels, test_size=0.2, random_state=42,
            stratify=labels if min_count >= 2 else None
        )
    except:
        # Fallback if stratification fails
        X_train, X_test, y_train, y_test = train_test_split(
            texts, labels, test_size=0.2, random_state=42
        )

    # Initialize and train classifier
    classifier = ATSEmailClassifier(max_features=8000, max_length=400)

    print(f"\n=== TRAINING ATS EMAIL CLASSIFIER ===")
    print(f"Training samples: {len(X_train)}")
    print(f"Test samples: {len(X_test)}")
    print(f"Unique categories: {len(set(y_train))}")

    history = classifier.train(X_train, y_train, epochs=30, batch_size=16)

    # Evaluate on test set
    print(f"\n=== EVALUATION RESULTS ===")
    classifier.evaluate(X_test, y_test)

    # Plot training history
    plt.figure(figsize=(12, 4))

    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Model Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.tight_layout()
    plt.show()

    return classifier

In [17]:
def create_reply_templates():
    """Create automated reply templates based on categories"""
    reply_templates = {
        'candidate_application': "Thank you for your application. We have received your resume and will review it. We'll get back to you within 3-5 business days.",
        'candidate_availability': "Thank you for providing your availability. We'll schedule your interview soon and send you the details.",
        'candidate_response': "Thank you for your response. We appreciate your interest in this position.",
        'interview_scheduling': "We'd like to schedule your interview. Please confirm your availability for the proposed time.",
        'interview_feedback': "Thank you for attending the interview. We'll provide feedback soon.",
        'interview_confirmation': "Thank you for confirming your interview. We look forward to meeting you.",
        'offer_sent': "Congratulations! We'd like to offer you the position. Please review the offer details and let us know your decision.",
        'offer_response': "Thank you for your response regarding the offer. We'll proceed accordingly.",
        'client_submission': "We have submitted the candidate profile to the client. We'll update you on their feedback.",
        'client_feedback': "Thank you for the client feedback. We'll take it into consideration.",
        'new_requisition': "Thank you for the new job requisition. We'll start sourcing suitable candidates immediately.",
        'onboarding': "Please complete the onboarding documents attached. Let us know if you need any assistance.",
        'internal_coordination': "Internal coordination email processed successfully.",
        'system_notification': "System notification processed.",
        'general_inquiry': "Thank you for your inquiry. We'll address your query and get back to you soon.",
        'spam': "This email has been marked as spam and will be handled accordingly.",
        'other': "Thank you for your message. We'll review it and respond accordingly."
    }
    return reply_templates

In [18]:
def main():
    # Load your data
    df = pd.read_csv('aws_training_data.csv')

    # Rename columns: assume second column is the email text
    df = df.rename(columns={df.columns[1]: 'full_preprocessed'})
    df = df[['full_preprocessed']]  # Keep only the text column

    print("=== DATASET LOADED ===")
    print(f"Total emails: {len(df)}")
    print(f"Columns: {list(df.columns)}")

    # Remove any rows with missing full_preprocessed text
    df = df.dropna(subset=['full_preprocessed'])
    print(f"After cleaning: {len(df)} emails")

    # Train the classifier
    classifier = train_classifier(df)

    # Save the model
    classifier.save_model("ats_email_classifier")

    # Create reply templates
    reply_templates = create_reply_templates()

    # Test with sample emails
    test_emails = [
        "I have attached my resume for the software engineer position. Please let me know if you need any additional information.",
        "I'm available for an interview next week on Tuesday or Wednesday. Please let me know what works for you.",
        "Thank you for the interview opportunity. I would like to accept your offer for the position.",
        "We have reviewed the candidate's profile and would like to schedule an interview next week.",
        "Please find the attached onboarding documents for the new hire."
    ]

    print("\n=== TEST PREDICTIONS AND AUTOMATED REPLIES ===")
    for i, email in enumerate(test_emails, 1):
        print(f"\n--- Test Email {i} ---")
        print(f"Email: {email[:100]}...")
        reply = generate_automated_reply(classifier, email, reply_templates)
        print("-" * 50)

    return classifier, reply_templates

In [20]:
import os
print(os.listdir("."))

['.git', '.venv', 'aws_training_data.csv', 'email_classification.ipynb', 'email_classification1.ipynb', 'email_classification_2.ipynb', 'email_classification_3.ipynb', 'email_classification_bert.ipynb', 'final_model', 'logs', 'README.md', 'results']
