In [None]:
import pandas as pd
import numpy as np


In [None]:
df = pd.read_csv('aws_training_data.csv')

In [None]:
df.info()

In [None]:
df.describe()


In [None]:
df['job_application'].value_counts()


In [None]:
import pandas as pd
import numpy as np
import re
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from collections import Counter

# ----------------------------
# TEXT CLEANING
# ----------------------------
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z ]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# ----------------------------
# LOAD DATA
# ----------------------------
df = pd.read_csv("aws_training_data.csv")
# df["email_text"] = df["email_text"].astype(str).apply(clean_text)

print("\nClass Distribution:")
print(df['category'].value_counts())

# ----------------------------
# ENCODE LABELS
# ----------------------------
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["category"])

# ----------------------------
# TOKENIZATION
# ----------------------------
MAX_WORDS = 8000
MAX_LEN = 300

tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(df["email_text"])

X = tokenizer.texts_to_sequences(df["email_text"])
X = pad_sequences(X, maxlen=MAX_LEN)

# ----------------------------
# TRAIN / TEST SPLIT
# ----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# ----------------------------
# HANDLE CLASS IMBALANCE
# ----------------------------
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train
)

class_weights = dict(enumerate(class_weights))
print("\nClass Weights:", class_weights)

# ----------------------------
# BUILD MODEL
# ----------------------------
model = Sequential([
    Embedding(MAX_WORDS, 128, input_length=MAX_LEN),
    GlobalMaxPooling1D(),
    Dense(128, activation="relu"),
    Dropout(0.5),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(len(label_encoder.classes_), activation="softmax")
])

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()

# ----------------------------
# TRAIN MODEL
# ----------------------------
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=20,
    batch_size=16,
    class_weight=class_weights,
    verbose=1
)

# ----------------------------
# EVALUATION
# ----------------------------
y_pred = np.argmax(model.predict(X_test), axis=1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# ----------------------------
# CONFUSION MATRIX
# ----------------------------
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt="d",
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_,
            cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

# ----------------------------
# SAVE MODEL
# ----------------------------
model.save("email_classifier.h5")
pickle.dump(tokenizer, open("tokenizer.pkl", "wb"))
pickle.dump(label_encoder, open("label_encoder.pkl", "wb"))

print("\nModel saved successfully!")

# ----------------------------
# MANUAL EMAIL TESTING
# ----------------------------
def predict_email(email):
    email = clean_text(email)
    seq = tokenizer.texts_to_sequences([email])
    padded = pad_sequences(seq, maxlen=MAX_LEN)
    pred = model.predict(padded)
    label = label_encoder.inverse_transform([np.argmax(pred)])
    confidence = np.max(pred)
    return label[0], confidence

# ----------------------------
# AUTOMATED REPLIES
# ----------------------------
reply_templates = {
    "new_requisition": "Thank you for the new job requisition. We will begin sourcing candidates.",
    "interview_scheduling": "Your interview will be scheduled shortly. We will share the details.",
    "candidate_selection": "The candidate has been shortlisted. We will proceed further.",
    "job_application": "Thank you for your application. Our team will review it shortly.",
    "spam": "This email has been identified as spam."
}

# ----------------------------
# TEST EMAIL
# ----------------------------
test_email = "I want to apply for data analyst role. Offer offer buy car cheap."

category, confidence = predict_email(test_email)

print("\nManual Test Email:")
print(test_email)
print("Predicted Category:", category)
print("Confidence:", round(confidence, 2))
print("Auto Reply:", reply_templates.get(category))


# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn 
import re

In [None]:
df = pd.read_csv('aws_training_data.csv')

In [None]:
df.info()
# cretate columns for data analysis 
df.describe()
df.value_counts()
df['job_application'].value_counts()



In [None]:
# i want to message for spam email and job application email give me method to do that
df.describe()


In [None]:
df.value_counts().unique()

In [None]:
import pandas as pd

# 1. Load the old file. 
# Since your example shows the label first, we assign names 'label' and 'text'
old_df = pd.read_csv("aws_training_data.csv", names=["label", "text"], header=None)

# 2. Load the new synthetic file (which already has headers)
new_df = pd.read_csv("email_dataset.csv")

# 3. Combine them
combined_df = pd.concat([old_df, new_df], ignore_index=True)

# 4. Shuffle to mix the old and new data together
combined_df = combined_df.sample(frac=1).reset_index(drop=True)

# 5. Save the final version
combined_df.to_csv("final_training_data.csv", index=False)

print(f"Merge complete! Total training samples: {len(combined_df)}")
print(combined_df.head()) # Preview the first 5 rows

In [None]:
df = pd.read_csv('final_training_data.csv')

In [None]:
df.describe()
df.value_counts()
df.info()
df['label'].value_counts()

In [10]:
import pandas as pd
import re
import hashlib

def clean_enron_professional(enron_emails, cleaned_ats_enron):
    print("Loading Enron CSV...")
    df = pd.read_csv(enron_emails)

    def advanced_clean(msg):
        # 1. Split Header from Body
        parts = msg.split('\n\n', 1)
        body = parts[1] if len(parts) > 1 else parts[0]
        
        # 2. Remove Forwarded/Original Message Chains
        # This stops the model from reading the same old text over and over
        body = re.split(r'-----Original Message-----|From:|>|To:', body)[0]
        
        # 3. Basic text cleaning
        body = body.lower()
        body = re.sub(r'\s+', ' ', body) # Remove extra whitespace/newlines
        return body.strip()

    print("Stripping threads and cleaning...")
    df['clean_text'] = df['message'].apply(advanced_clean)

    # 4. De-duplication: Remove exact duplicates
    # Many emails are the same but have different Message-IDs
    initial_count = len(df)
    df = df.drop_duplicates(subset=['clean_text'])
    print(f"Removed {initial_count - len(df)} duplicate emails.")

    # 5. Labeling Logic (Optimized for HR/ATS)
    def assign_label(text):
        # Job Application
        if any(w in text for w in ['resume attached', 'applying for', 'cv', 'cover letter']):
            return 'job_application'
        # Interview Scheduling
        if any(w in text for w in ['interview', 'availability', 'schedule', 'calendar invitation']):
            return 'interview_scheduling'
        # New Requisition
        if any(w in text for w in ['open req', 'requisition', 'headcount', 'position approval']):
            return 'new_requisition'
        # Candidate Selection
        if any(w in text for w in ['finalist', 'shortlist', 'hired', 'selection', 'background check']):
            return 'candidate_selection'
        # Spam
        if any(w in text for w in ['win', 'prize', 'viagra', 'account suspended', 'click here']):
            return 'spam'
        return 'other'

    df['label'] = df['clean_text'].apply(assign_label)
    
    # 6. Final Filter and Sample
    final_df = df[df['label'] != 'other'][['clean_text', 'label']]
    
    # Ensure classes are balanced
    min_count = final_df['label'].value_counts().min()
    balanced_df = final_df.groupby('label').head(min_count)

    balanced_df.to_csv(cleaned_ats_enron, index=False)
    print(f"Successfully saved {len(balanced_df)} balanced, unique rows to {cleaned_ats_enron}")

# To execute:
clean_enron_professional('enron_emails.csv', 'cleaned_ats_enron.csv')

Loading Enron CSV...
Stripping threads and cleaning...
Removed 289923 duplicate emails.
Successfully saved 935 balanced, unique rows to cleaned_ats_enron.csv


In [31]:
df = pd.read_csv('cleaned_ats_enron.csv')
df.head()
# df['label'].value_counts()
# df.shape

Unnamed: 0,clean_text,label
0,"randy, can you send me a schedule of the salar...",interview_scheduling
1,please cc the following distribution list with...,spam
2,---------------------- forwarded by phillip k ...,interview_scheduling
3,"reagan, just wanted to give you an update. i h...",spam
4,nymex expiration is during this time frame. pl...,interview_scheduling


In [29]:
if(df['label'].index==1):
    print("yes")

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [18]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, GlobalMaxPooling1D, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords

# Download stopwords if needed
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# class ImprovedATSEmailClassifier:
#     def __init__(self, max_features=10000, max_length=500):
#         self.max_features = max_features
#         self.max_length = max_length
#         self.tokenizer = Tokenizer(num_words=max_features, oov_token="<OOV>")
#         self.label_encoder = LabelEncoder()
#         self.model = None
#         self.num_classes = None

#     def preprocess_text(self, text):
#         """Clean text: lowercase, remove punctuation, stop words."""
#         text = text.lower()
#         text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
#         text = ' '.join([word for word in text.split() if word not in stop_words])
#         return text

#     def prepare_data(self, texts, labels):
#         """Prepare data with preprocessing."""
#         texts = [self.preprocess_text(t) for t in texts]
#         # self.tokenizer.fit_on_texts(texts)
#         if is_training:
#             self.tokenizer.fit_on_texts(texts)
#         sequences = self.tokenizer.texts_to_sequences(texts)
#         X = pad_sequences(sequences, maxlen=self.max_length)
#         y = self.label_encoder.fit_transform(labels)
#         self.num_classes = len(np.unique(y))
#         return X, y

#     def build_model(self):
#         """Improved model: Bidirectional LSTM for better sequence handling."""
#         model = Sequential([
#             Embedding(self.max_features, 128, input_length=self.max_length),
#             Bidirectional(LSTM(64, return_sequences=True)),  # Better for context
#             GlobalMaxPooling1D(),
#             Dense(128, activation='relu'),
#             Dropout(0.5),
#             Dense(64, activation='relu'),
#             Dropout(0.5),
#             Dense(self.num_classes, activation='softmax')
#         ])
#         model.compile(
#             optimizer='adam',
#             loss='sparse_categorical_crossentropy',
#             metrics=['accuracy']
#         )
#         self.model = model
#         return model

#     def train(self, texts, labels, validation_split=0.2, epochs=10, batch_size=32):
#         X, y = self.prepare_data(texts, labels)
#         self.build_model()
        
#         # Class weights for imbalance
#         class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
#         class_weight_dict = dict(enumerate(class_weights))
        
#         # Early stopping to prevent overfitting
#         early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
        
#         history = self.model.fit(
#             X, y,
#             validation_split=validation_split,
#             epochs=10,
#             batch_size=batch_size,
#             class_weight=class_weight_dict,
#             callbacks=[early_stop],
#             verbose=1
#         )
#         return history

#     def predict(self, texts):
#         texts = [self.preprocess_text(t) for t in texts]
#         sequences = self.tokenizer.texts_to_sequences(texts)
#         X = pad_sequences(sequences, maxlen=self.max_length)
#         predictions = self.model.predict(X)
#         predicted_classes = np.argmax(predictions, axis=1)
#         return self.label_encoder.inverse_transform(predicted_classes)

#     def evaluate(self, texts, labels):
#         texts = [self.preprocess_text(t) for t in texts]
#         sequences = self.tokenizer.texts_to_sequences(texts)
#         X = pad_sequences(sequences, maxlen=self.max_length)
#         y = self.label_encoder.transform(labels)
#         predictions = self.model.predict(X)
#         predicted_classes = np.argmax(predictions, axis=1)
        
#         print("Classification Report:")
#         print(classification_report(y, predicted_classes, target_names=self.label_encoder.classes_))
        
#         cm = confusion_matrix(y, predicted_classes)
#         plt.figure(figsize=(12, 8))
#         sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
#                    xticklabels=self.label_encoder.classes_,
#                    yticklabels=self.label_encoder.classes_)
#         plt.title('Confusion Matrix')
#         plt.ylabel('Actual')
#         plt.xlabel('Predicted')
#         plt.xticks(rotation=45)
#         plt.yticks(rotation=0)
#         plt.tight_layout()
#         plt.show()
class ImprovedATSEmailClassifier:
    def __init__(self, max_features=10000, max_length=500):
        self.max_features = max_features
        self.max_length = max_length
        self.tokenizer = Tokenizer(num_words=max_features, oov_token="<OOV>")
        self.label_encoder = LabelEncoder()
        self.model = None
        self.num_classes = None

    def preprocess_text(self, text):
        """Clean text: lowercase, remove punctuation, stop words."""
        text = str(text).lower()
        text = re.sub(r'[^\w\s]', '', text)
        text = ' '.join([word for word in text.split() if word not in stop_words])
        return text

    def prepare_data(self, texts, labels=None, is_training=True):
        """Prepare data with preprocessing. Added is_training logic."""
        processed_texts = [self.preprocess_text(t) for t in texts]
        
        # FIX: Only fit the dictionary during the training phase
        if is_training:
            self.tokenizer.fit_on_texts(processed_texts)
        
        sequences = self.tokenizer.texts_to_sequences(processed_texts)
        X = pad_sequences(sequences, maxlen=self.max_length)
        
        if labels is not None:
            if is_training:
                y = self.label_encoder.fit_transform(labels)
                self.num_classes = len(np.unique(y))
            else:
                y = self.label_encoder.transform(labels)
            return X, y
        return X

    def build_model(self):
        """Improved model architecture with higher Dropout to fight overfitting."""
        model = Sequential([
            Embedding(self.max_features, 128, input_length=self.max_length),
            Bidirectional(LSTM(64, return_sequences=True)),
            GlobalMaxPooling1D(),
            Dense(128, activation='relu'),
            Dropout(0.5), # Forces model to ignore some keywords
            Dense(64, activation='relu'),
            Dropout(0.5), # Increased to prevent memorization
            Dense(self.num_classes, activation='softmax')
        ])
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        self.model = model
        return model

    def train(self, texts, labels, validation_split=0.2, epochs=10, batch_size=32):
        # Pass is_training=True here
        X, y = self.prepare_data(texts, labels, is_training=True)
        self.build_model()
        
        class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
        class_weight_dict = dict(enumerate(class_weights))
        
        # Early stopping: Stops if the model stops actually learning
        early_stop = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', 
            patience=3, 
            restore_best_weights=True
        )
        
        history = self.model.fit(
            X, y,
            validation_split=validation_split,
            epochs=epochs,
            batch_size=batch_size,
            class_weight=class_weight_dict,
            callbacks=[early_stop],
            verbose=1
        )
        return history

    def predict(self, texts):
        # Pass is_training=False to keep the tokenizer dictionary locked
        X = self.prepare_data(texts, labels=None, is_training=False)
        predictions = self.model.predict(X)
        predicted_classes = np.argmax(predictions, axis=1)
        return self.label_encoder.inverse_transform(predicted_classes)

    def evaluate(self, texts, labels):
        # Pass is_training=False for evaluation
        X, y = self.prepare_data(texts, labels, is_training=False)
        predictions = self.model.predict(X)
        predicted_classes = np.argmax(predictions, axis=1)
        print(classification_report(y, predicted_classes, target_names=self.label_encoder.classes_))

# Load data from CSV (assuming columns are 'category' and the email text column; adjust if needed)
df = pd.read_csv('cleaned_ats_enron.csv')
df.columns = ['label', 'clean_text	']  # Rename if necessary
df = df.dropna(subset=['clean_text'])
texts = df['clean_text'].tolist()
labels = df['label'].tolist()

# Split data
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42, stratify=labels)

# Train model
classifier = ImprovedATSEmailClassifier()
history = classifier.train(X_train, y_train, epochs=10)

# Evaluate
classifier.evaluate(X_test, y_test)

# Plot training history
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Test prediction
test_email = "I am applying for the data analyst position. Attached is my resume."
print(f"Predicted Category: {classifier.predict([test_email])[0]}")
test_email = "I am applying for the offer offer buy this car get a gift free. Attached is my resume."
print(f"Predicted Category: {classifier.predict([test_email])[0]}") 

  if not hasattr(np, "object"):
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tanma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


KeyError: ['clean_text']

# Merging the enron cleaned data and the final_ats_data
