In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load your dataset
df = pd.read_csv('/Users/vighneshms/Downloads/Email_classifier/models/masked_emails_with_types.csv')  # Assuming CSV format

# Explore the data
print(df.head())
print(df['type'].value_counts())  # Check class distribution

                                        masked_email      type
0  Subject: unforeseen crash of the data analysis...  Incident
1  Subject: Customer Support Inquiry\n\nSeeking i...   Request
2  Subject: Data Analytics for Investment\n\nI am...   Request
3  Subject: Hospital service problem\n\nMedia dat...  Incident
4  Subject: Security\n\nDear Customer Support, I ...   Request
type
Incident    9586
Request     6860
Problem     5037
Change      2517
Name: count, dtype: int64


In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re

nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    # Lowercase
    text = text.lower()
    
    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Tokenize
    words = text.split()
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    return ' '.join(words)

df['processed_text'] = df['masked_email'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vighneshms/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vighneshms/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1,2) 
) # Include bigrams
X = tfidf.fit_transform(df['processed_text'])
y = df['type']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
pip install imblearn

Defaulting to user installation because normal site-packages is not writeable
Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.12.4 imblearn-0.0
Note: you may need to restart the kernel to use updated packages.


In [5]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [7]:
rf = RandomForestClassifier(
    class_weight='balanced',  # Add class weights
    n_estimators=200,
    max_depth=None,
    min_samples_split=10,
    random_state=42
)
rf.fit(X_res, y_res)
rf_pred = rf.predict(X_test)
print("Random Forest:\n", classification_report(y_test, rf_pred))

Random Forest:
               precision    recall  f1-score   support

      Change       0.92      0.82      0.87       479
    Incident       0.72      0.92      0.81      1920
     Problem       0.71      0.35      0.47      1009
     Request       0.92      0.93      0.93      1392

    accuracy                           0.79      4800
   macro avg       0.82      0.76      0.77      4800
weighted avg       0.79      0.79      0.78      4800



In [8]:
import pickle

# Save the best model and vectorizer
with open('email_classifier.pkl', 'wb') as f:
    pickle.dump(rf, f)  # Assuming RF was best

with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

In [7]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Define base models
estimators = [
    ('svm', SVC(kernel='linear', probability=True)),
    ('nb', MultinomialNB()),
    ('rf', RandomForestClassifier())
]

# Create stack with logistic regression as final estimator
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(class_weight='balanced'),
    cv=5  # Use cross-validated predictions
)

# Train and evaluate
stack.fit(X_res, y_res)
y_pred = stack.predict(X_test)
print("Stacking:\n", classification_report(y_test, y_pred))

Stacking:
               precision    recall  f1-score   support

      Change       0.91      0.83      0.87       479
    Incident       0.71      0.86      0.78      1920
     Problem       0.61      0.40      0.48      1009
     Request       0.92      0.92      0.92      1392

    accuracy                           0.78      4800
   macro avg       0.79      0.75      0.76      4800
weighted avg       0.77      0.78      0.77      4800



In [2]:
pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.4-py3-none-macosx_12_0_arm64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.1.4
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

# Load your dataset
df = pd.read_csv('/Users/vighneshms/Downloads/Email_classifier/models/masked_emails_with_types.csv')  # Replace with your file path
print(df.head())

# Preprocess text data
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    # Basic preprocessing - expand as needed
    text = text.lower().replace('\n', ' ').replace('\r', '')
    return text

df['processed_text'] = df['masked_email'].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['type'])

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_text'], 
    df['label_encoded'], 
    test_size=0.2, 
    random_state=42
)

# Feature extraction
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),  # Use unigrams and bigrams
    stop_words='english'
)

X_train_tfidf = tfidf.fit_transform(X_train).tocsr()
X_test_tfidf = tfidf.transform(X_test).tocsr()

# Ensure matrices are writeable
X_train_tfidf.sort_indices()
X_test_tfidf.sort_indices()

# ... [rest of your code remains the same until the estimators] ...

# Define base models for stacking - with modified SVM parameters
estimators = [
    ('svm', SVC(kernel='linear', probability=True, class_weight='balanced', random_state=42)),
    ('nb', MultinomialNB()),
    ('rf', RandomForestClassifier(class_weight='balanced', n_estimators=200, random_state=42)),
    ('xgb', XGBClassifier(objective='multi:softmax', 
                         num_class=len(label_encoder.classes_),
                         eval_metric='mlogloss',
                         random_state=42))
]

# Create stacking classifier with modified settings
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
    cv=5,
    n_jobs=1,  # Reduce to 1 to avoid parallel processing issues
    passthrough=False  # Don't pass original features to meta-learner
)

# Train the stacking model
try:
    stack.fit(X_train_tfidf.copy(), y_train)  # Explicit copy to ensure writeable
except ValueError as e:
    print(f"Error during fitting: {e}")
    # Fallback to non-sparse if needed
    stack.fit(X_train_tfidf.toarray(), y_train)

# Evaluate
y_pred = stack.predict(X_test_tfidf)
print("Stacking Classifier:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Save the model and preprocessing components
joblib.dump(tfidf, '/Users/vighneshms/Downloads/Email_classifier/models/tfidf_vectorizer_stack.pkl')
joblib.dump(label_encoder, '/Users/vighneshms/Downloads/Email_classifier/models/label_encoder_stack.pkl')
joblib.dump(stack, '/Users/vighneshms/Downloads/Email_classifier/models/intent_classifier_stack.pkl')

# Example prediction function
def predict_intent(email_text):
    # Load saved components
    tfidf = joblib.load('/Users/vighneshms/Downloads/Email_classifier/models/tfidf_vectorizer_stack.pkl')
    model = joblib.load('/Users/vighneshms/Downloads/Email_classifier/models/intent_classifier_stack.pkl')
    le = joblib.load('/Users/vighneshms/Downloads/Email_classifier/models/label_encoder_stack.pkl')
    
    # Preprocess and predict
    processed = preprocess_text(email_text)
    vectorized = tfidf.transform([processed])
    prediction = model.predict(vectorized)
    return le.inverse_transform(prediction)[0]

# Test prediction
sample_email = "We need to update the server configuration"
print(f"\nPredicted intent: {predict_intent(sample_email)}")

                                        masked_email      type
0  Subject: unforeseen crash of the data analysis...  Incident
1  Subject: Customer Support Inquiry\n\nSeeking i...   Request
2  Subject: Data Analytics for Investment\n\nI am...   Request
3  Subject: Hospital service problem\n\nMedia dat...  Incident
4  Subject: Security\n\nDear Customer Support, I ...   Request
Stacking Classifier:
               precision    recall  f1-score   support

      Change       0.89      0.93      0.91       479
    Incident       0.83      0.74      0.78      1920
     Problem       0.59      0.71      0.64      1009
     Request       0.95      0.93      0.94      1392

    accuracy                           0.81      4800
   macro avg       0.81      0.83      0.82      4800
weighted avg       0.82      0.81      0.81      4800


Predicted intent: Change


In [7]:
import pandas as pd
df = pd.read_csv('/Users/vighneshms/Downloads/Email_classifier/models/emails_subject_body_type3.csv')  

In [8]:
from sklearn.model_selection import train_test_split

# Subject-only classification
X_subject = df['subject']
y_subject = df['type']

# Body-only classification
X_body = df['body']
y_body = df['type']

# Split data (70% train, 30% test)
X_subj_train, X_subj_test, y_subj_train, y_subj_test = train_test_split(
    X_subject, y_subject, test_size=0.3, random_state=42
)

X_body_train, X_body_test, y_body_train, y_body_test = train_test_split(
    X_body, y_body, test_size=0.3, random_state=42
)

In [10]:

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Shared preprocessing steps
preprocessor = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=5000,
        ngram_range=(1, 2),  # Includes unigrams and bigrams
    ))
])

# Subject classifier
subject_clf = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', SVC(
        kernel='linear',
        class_weight='balanced',
        probability=True
    ))
])

# Body classifier
body_clf = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', SVC(
        kernel='linear',
        class_weight='balanced',
        probability=True
    ))
])

In [None]:
# Train subject classifier
# subject_clf.fit(X_subj_train, y_subj_train)
# y_subj_pred = subject_clf.predict(X_subj_test)

# Train body classifier
# body_clf.fit(X_body_train, y_body_train)
# y_body_pred = body_clf.predict(X_body_test)

# Compare performance
# print("=== Subject-Only Performance ===")
# print(classification_report(y_subj_test, y_subj_pred))

# print("\n=== Body-Only Performance ===")
# print(classification_report(y_body_test, y_body_pred))


=== Body-Only Performance ===
              precision    recall  f1-score   support

      Change       0.79      0.86      0.82       762
    Incident       0.74      0.61      0.67      2869
     Problem       0.44      0.59      0.50      1484
     Request       0.93      0.89      0.91      2085

    accuracy                           0.71      7200
   macro avg       0.72      0.74      0.72      7200
weighted avg       0.74      0.71      0.72      7200



In [11]:
# Train subject classifier
subject_clf.fit(X_subj_train, y_subj_train)
y_subj_pred = subject_clf.predict(X_subj_test)

print("=== Subject-Only Performance ===")
print(classification_report(y_subj_test, y_subj_pred))

=== Subject-Only Performance ===
              precision    recall  f1-score   support

      Change       0.43      0.78      0.56       762
    Incident       0.71      0.53      0.60      2869
     Problem       0.41      0.48      0.44      1484
     Request       0.83      0.76      0.80      2085

    accuracy                           0.61      7200
   macro avg       0.59      0.64      0.60      7200
weighted avg       0.65      0.61      0.62      7200

