In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import re
import joblib

# Load dataset
try:
    df = pd.read_json(r"\Users\DELL\Downloads\email_dataset2000.json")
except Exception as e:
    print(f"Error loading dataset: {e}")
    exit()

# Define regex rules
regex_rules = {
    "full_name": r"\b([A-Z][a-z]+(?:\s[A-Z][a-z]+){1,2})\b",
    "email": r"\b[\w\.-]+@[\w\.-]+\.\w+\b",
    "phone_number": r"\b(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{3,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{4}(?:x\d+)?\b",
    "dob": r"\b(?:19|20)\d{2}[-/]\d{2}[-/]\d{2}\b",
    "aadhar_num": r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4}\b",
    "credit_debit_no": r"\b(?:\d{4}[-\s]?){3}\d{4}\b",
    "cvv_no": r"\b(?<!\d)\d{3}(?!\d)\b",
    "expiry_no": r"\b(0[1-9]|1[0-2])\/?([0-9]{2})\b"
}

# Masking function with entity tracking
def apply_regex_masking_with_entities(text):
    if not isinstance(text, str):
        return "", []
    masked = text
    entities = []
    for entity_type, pattern in regex_rules.items():
        matches = [(m.start(), m.end(), m.group(), entity_type) for m in re.finditer(pattern, text)]
        for start, end, value, etype in matches:
            entities.append({"position": [start, end], "classification": etype, "entity": value})
            masked = masked[:start] + f"[{etype}]" + masked[end:]
    return masked, entities

# Apply masking
df['masked_email_text'], df['entities'] = zip(*df['email_text'].apply(apply_regex_masking_with_entities))
df['masked_email_text'] = df['masked_email_text'].fillna("")
df = df[df['masked_email_text'].str.strip() != ""]
if df.empty:
    print("Error: All masked_email_text entries are empty.")
    exit()

# Data splitting
X = df['masked_email_text']
y = df['category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000, min_df=2)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train SVM
svm_model = SVC(kernel='linear', probability=True)
svm_model.fit(X_train_tfidf, y_train)

# Evaluate
y_pred = svm_model.predict(X_test_tfidf)
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Save model and vectorizer
joblib.dump(svm_model, 'svm_email_classifier.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
print("Model and vectorizer saved to 'svm_email_classifier.pkl' and 'tfidf_vectorizer.pkl'.")

Test Accuracy: 0.3100
Classification Report:
                    precision    recall  f1-score   support

Account Management       0.37      0.45      0.40       139
    Billing Issues       0.34      0.26      0.29       134
 Technical Support       0.21      0.21      0.21       127

          accuracy                           0.31       400
         macro avg       0.31      0.31      0.30       400
      weighted avg       0.31      0.31      0.31       400

Model and vectorizer saved to 'svm_email_classifier.pkl' and 'tfidf_vectorizer.pkl'.


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest, chi2
import re
import joblib
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords')

# Load dataset
try:
    df = pd.read_json(r"\Users\DELL\Downloads\email_dataset2000.json")
except Exception as e:
    print(f"Error loading dataset: {e}")
    exit()

# Define regex rules
regex_rules = {
    "full_name": r"\b([A-Z][a-z]+(?:\s[A-Z][a-z]+){1,2})\b",
    "email": r"\b[\w\.-]+@[\w\.-]+\.\w+\b",
    "phone_number": r"\b(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{3,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{4}(?:x\d+)?\b",
    "dob": r"\b(?:19|20)\d{2}[-/]\d{2}[-/]\d{2}\b",
    "aadhar_num": r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4}\b",
    "credit_debit_no": r"\b(?:\d{4}[-\s]?){3}\d{4}\b",
    "cvv_no": r"\b(?<!\d)\d{3}(?!\d)\b",
    "expiry_no": r"\b(0[1-9]|1[0-2])\/?([0-9]{2})\b"
}

# Masking function with entity tracking
def apply_regex_masking_with_entities(text):
    if not isinstance(text, str):
        return "", []
    masked = text
    entities = []
    for entity_type, pattern in regex_rules.items():
        matches = [(m.start(), m.end(), m.group(), entity_type) for m in re.finditer(pattern, text)]
        for start, end, value, etype in matches:
            entities.append({"position": [start, end], "classification": etype, "entity": value})
            masked = masked[:start] + f"[{etype}]" + masked[end:]
    return masked, entities

# Preprocessing with lemmatization
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    words = text.lower().split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Apply masking and preprocessing
df['masked_email_text'], df['entities'] = zip(*df['email_text'].apply(apply_regex_masking_with_entities))
df['masked_email_text'] = df['masked_email_text'].fillna("")
df['masked_email_text'] = df['masked_email_text'].apply(preprocess_text)
df = df[df['masked_email_text'].str.strip() != ""]
if df.empty:
    print("Error: All masked_email_text entries are empty.")
    exit()

# Data splitting
X = df['masked_email_text']
y = df['category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorization with improved parameters
vectorizer = TfidfVectorizer(max_features=2000, min_df=2, max_df=0.9, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Feature selection
selector = SelectKBest(chi2, k=1000)
X_train_selected = selector.fit_transform(X_train_tfidf, y_train)
X_test_selected = selector.transform(X_test_tfidf)

# Hyperparameter tuning with GridSearchCV
# Define kernel options first
kernels = ['linear', 'rbf']
# Define param_grid without self-referencing
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': kernels,
    'gamma': ['scale', 'auto', 0.1, 1]  # gamma applies to 'rbf'; GridSearchCV will ignore it for 'linear'
}
svm_model = SVC(class_weight='balanced', probability=True)
grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_selected, y_train)

# Best model
best_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")

# Evaluate on test set
y_pred = best_model.predict(X_test_selected)
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Save model, vectorizer, and selector
joblib.dump(best_model, 'svm_email_classifier.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(selector, 'feature_selector.pkl')
print("Model, vectorizer, and selector saved to 'svm_email_classifier.pkl', 'tfidf_vectorizer.pkl', and 'feature_selector.pkl'.")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
Best Cross-Validation Accuracy: 0.4119
Test Accuracy: 0.3225
Classification Report:
                    precision    recall  f1-score   support

Account Management       0.38      0.44      0.41       139
    Billing Issues       0.33      0.29      0.31       134
 Technical Support       0.24      0.23      0.23       127

          accuracy                           0.32       400
         macro avg       0.32      0.32      0.32       400
      weighted avg       0.32      0.32      0.32       400

Model, vectorizer, and selector saved to 'svm_email_classifier.pkl', 'tfidf_vectorizer.pkl', and 'feature_selector.pkl'.
