### Importing required libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Folder containing Excel files
file_path = r'data\labeldata.parquet'

In [3]:
# Reading the data
df = pd.read_parquet(file_path)

print(f'Total Record Count: {df.shape}')
df.head()

Total Record Count: (5853, 4)


Unnamed: 0,id,subject,emailtext,label
0,738a2c78-e9b7-4886-8a1b-7af4c08f3906,RE: Yates Residence - Mechanical Contractor Me...,"Zac Stevenson, PEPRINCIPAL | BUILDING MEPC . ....",0
1,23e4febf-364f-464a-9a3a-80f9d89d0e6b,Re: 6771 - VeLa - Level 4 Slab Penetration Plans,You tooRespectfullyMichael BentleyGeneral Fore...,0
2,f0789ea4-0a5f-46c3-8adb-083766b63740,RE: FW: 13027A - 20240603 - Prl - Columbia + A...,You can update the two 12 ducts and adjust on ...,0
3,09474b0b-bc7f-4a1f-ab95-0ed2c497468a,RE: West Zephyrhills Elementary - Pasco County...,"You can start with Area A. Proceeded by B,C,D ...",0
4,b7438359-ce6c-42f7-bcfd-5690e875dcda,RE: MLW_0001_CRS_24410_ Connacht Stadium - Exi...,You can hold off on doing this for now.,0


In [4]:
df.label.value_counts()

label
0    5798
1      55
Name: count, dtype: int64

### Data Cleaning

In [5]:
import re 
# Coverting the email text to lower case and removing special characters 
df['cleantext'] = df['emailtext'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))
# Remove punctuation
df['cleantext'] = df['cleantext'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
# Remove numbers
df['cleantext'] = df['cleantext'].apply(lambda x: re.sub(r'\d+', '', x.lower()))

In [6]:
df.head()

Unnamed: 0,id,subject,emailtext,label,cleantext
0,738a2c78-e9b7-4886-8a1b-7af4c08f3906,RE: Yates Residence - Mechanical Contractor Me...,"Zac Stevenson, PEPRINCIPAL | BUILDING MEPC . ....",0,zac stevenson peprincipal building mepc o s...
1,23e4febf-364f-464a-9a3a-80f9d89d0e6b,Re: 6771 - VeLa - Level 4 Slab Penetration Plans,You tooRespectfullyMichael BentleyGeneral Fore...,0,you toorespectfullymichael bentleygeneral fore...
2,f0789ea4-0a5f-46c3-8adb-083766b63740,RE: FW: 13027A - 20240603 - Prl - Columbia + A...,You can update the two 12 ducts and adjust on ...,0,you can update the two ducts and adjust on p ...
3,09474b0b-bc7f-4a1f-ab95-0ed2c497468a,RE: West Zephyrhills Elementary - Pasco County...,"You can start with Area A. Proceeded by B,C,D ...",0,you can start with area a proceeded by bcd and...
4,b7438359-ce6c-42f7-bcfd-5690e875dcda,RE: MLW_0001_CRS_24410_ Connacht Stadium - Exi...,You can hold off on doing this for now.,0,you can hold off on doing this for now


#### Removing the stop words

In [7]:
# Removing stop words 
import nltk
from nltk.corpus import stopwords

In [8]:
# # Download stopwords if you haven't already
# nltk.download('stopwords')

In [9]:
# Define the list of stopwords
stop_words = set(stopwords.words('english'))

In [10]:
# Keeping the negative words 
negative_words = {"no", "not", "weren't", "couldn't", "needn't", "didn't", "wouldn't", "shouldn't"}

# Remove negative words from stop_words
filtered_stop_words = stop_words - negative_words

In [11]:
# # Save the stop words to a text file
# with open("data\stopwords.txt", "w") as file:
#     for word in stop_words:
#         file.write(word + "\n")

In [12]:
# Removing stop words
df['cleantext'] = df['cleantext'].apply(
    lambda x: ' '.join([word for word in x.lower().split() if word not in filtered_stop_words])
)

#### Applying Lemmetization to restore the words to its root

In [13]:
import spacy

In [14]:
# Load SpaCy language model
nlp = spacy.load('en_core_web_sm')

In [15]:
# Apply lemmatization using SpaCy
df['cleantext'] = df['cleantext'].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x)]))

In [16]:
df.head()

Unnamed: 0,id,subject,emailtext,label,cleantext
0,738a2c78-e9b7-4886-8a1b-7af4c08f3906,RE: Yates Residence - Mechanical Contractor Me...,"Zac Stevenson, PEPRINCIPAL | BUILDING MEPC . ....",0,zac stevenson peprincipal building mepc san an...
1,23e4febf-364f-464a-9a3a-80f9d89d0e6b,Re: 6771 - VeLa - Level 4 Slab Penetration Plans,You tooRespectfullyMichael BentleyGeneral Fore...,0,toorespectfullymichael bentleygeneral foremans...
2,f0789ea4-0a5f-46c3-8adb-083766b63740,RE: FW: 13027A - 20240603 - Prl - Columbia + A...,You can update the two 12 ducts and adjust on ...,0,update two duct adjust p need shop drawing do ...
3,09474b0b-bc7f-4a1f-ab95-0ed2c497468a,RE: West Zephyrhills Elementary - Pasco County...,"You can start with Area A. Proceeded by B,C,D ...",0,start area proceed bcd per late schedule field...
4,b7438359-ce6c-42f7-bcfd-5690e875dcda,RE: MLW_0001_CRS_24410_ Connacht Stadium - Exi...,You can hold off on doing this for now.,0,hold


#### Depedent vs Independent features

In [17]:
# Feature and label
X = df['cleantext']
y = df['label']

#### Feature Extraction
##### Using TFIDF

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
# Selecting 1000 most frequent words only
vectorizer = TfidfVectorizer(max_features=1500)

In [20]:
# Fit and transform the cleantext data into a TF-IDF feature matrix
X_tfidf = vectorizer.fit_transform(X)

print(X_tfidf.shape)

(5853, 1500)


In [21]:
# Get the feature names (words in the vocabulary)
feature_names = vectorizer.get_feature_names_out()

# # Display the words
# print(feature_names)

#### Dimension Reduction using PCA

In [41]:
# from sklearn.decomposition import PCA

In [42]:
# pca = PCA(n_components=100)  # Reduce to 100 principal components
# X_pca = pca.fit_transform(X_tfidf.toarray())

#### Spliting the data into train and test

In [43]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42, stratify=y)

# Check the shape of the training and test sets
print(X_train.shape, X_test.shape)
print(f'\nTrain label shape: {y_train.value_counts()}')
print(f'\nTest label shape: {y_test.value_counts()}')

(4097, 1500) (1756, 1500)

Train label shape: label
0    4059
1      38
Name: count, dtype: int64

Test label shape: label
0    1739
1      17
Name: count, dtype: int64


#### Handling the data imbalance using SMOTE

In [44]:
from imblearn.over_sampling import SMOTE

# SMOT Params
smot_params = {
    'sampling_strategy': 0.3,
    'k_neighbors': 10,
    'random_state': 42
}

# Unpack the smot_params dictionary using ** when initializing SMOTE
smote = SMOTE(**smot_params)

# Sampled the data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print(f'y_train_resampled {y_train_resampled.value_counts()}')

y_train_resampled label
0    4059
1    1217
Name: count, dtype: int64


In [45]:
# Calculate class imbalance ratio for scale_pos_weight
class_0_count, class_1_count = y_train.value_counts()
scale_pos_weight = ( class_0_count / class_1_count)
print(f"Scale pos weight: {scale_pos_weight}")

Scale pos weight: 106.8157894736842


In [61]:
# Importing XgBoost
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, fbeta_score, recall_score

In [106]:
class_weight = {0: 1.0, 1: 2000.0}

In [107]:
param_distributions = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'kernel': ['linear', 'rbf', 'poly'],  # Kernel types
    'gamma': ['scale', 'auto'],  # Kernel coefficient for 'rbf', 'poly', and 'sigmoid'
    'degree': [2, 3, 4],  # Degree for the 'poly' kernel
    'class_weight': [class_weight, 'balanced', None]  # 'balanced' mode automatically adjusts class weights inversely proportional to class frequencies
}

In [108]:
# Initialize the SVM model
svm_model = SVC()

# Custom scoring metric using AUC-PR
scorer = make_scorer(recall_score)

# Create the RandomizedSearchCV object
random_search = RandomizedSearchCV(
    estimator=svm_model,
    param_distributions=param_distributions,
    n_iter=15,  # You can adjust this to control how many random combinations to try
    scoring=scorer,  # Use your desired scoring metric
    n_jobs=-1,  # Use all available cores
    cv=3,  # Number of cross-validation folds
    verbose=1,  # Controls verbosity: 0 (silent), 1 (progress), 2 (full output)
    random_state=42  # Ensures reproducibility
)

In [109]:
# Fit the grid search
random_search.fit(X_train_resampled, y_train_resampled)

Fitting 3 folds for each of 15 candidates, totalling 45 fits


In [110]:
# Output the best parameters
print("Best parameters found: ", random_search.best_params_)

Best parameters found:  {'kernel': 'linear', 'gamma': 'auto', 'degree': 2, 'class_weight': None, 'C': 100}


In [111]:
cv_results_df = pd.DataFrame(random_search.cv_results_)
cv_results_df = cv_results_df.sort_values(by='rank_test_score', ascending=True)
cv_results_df.head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,param_gamma,param_degree,param_class_weight,param_C,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.504124,0.071018,0.162414,0.012135,linear,auto,2,,100.0,"{'kernel': 'linear', 'gamma': 'auto', 'degree'...",1.0,1.0,1.0,1.0,0.0,1
1,0.510273,0.047036,0.165195,0.006015,linear,auto,4,,100.0,"{'kernel': 'linear', 'gamma': 'auto', 'degree'...",1.0,1.0,1.0,1.0,0.0,1
2,0.509454,0.039373,0.179576,0.013838,linear,scale,4,balanced,10.0,"{'kernel': 'linear', 'gamma': 'scale', 'degree...",1.0,1.0,1.0,1.0,0.0,1
3,0.531028,0.023996,0.174246,0.012649,linear,auto,4,"{0: 1.0, 1: 2000.0}",100.0,"{'kernel': 'linear', 'gamma': 'auto', 'degree'...",1.0,1.0,1.0,1.0,0.0,1
4,1.200449,0.056565,0.506493,0.05988,linear,auto,4,"{0: 1.0, 1: 2000.0}",0.1,"{'kernel': 'linear', 'gamma': 'auto', 'degree'...",1.0,1.0,1.0,1.0,0.0,1


In [112]:
# Use the best estimator to predict on the test data
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

In [113]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate performance
conf_matrix = confusion_matrix(y_test, y_pred)

In [114]:
# Evaluate the model (optional step)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1739
           1       0.06      0.06      0.06        17

    accuracy                           0.98      1756
   macro avg       0.53      0.53      0.53      1756
weighted avg       0.98      0.98      0.98      1756

[[1724   15]
 [  16    1]]
