In [18]:
import pandas as pd
import re
import pickle
import nltk
import time
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score


In [19]:

# Download required NLP resources
nltk.download("stopwords")
nltk.download("punkt")
nltk.download('punkt_tab')
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RAJESH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RAJESH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\RAJESH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\RAJESH\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [20]:

# Load dataset
dataset_path = "../datasets/mess_complaints.csv"
df = pd.read_csv(dataset_path)

# Remove NaN values
df.dropna(subset=["Description of the Issue", "Type of Complaint"], inplace=True)

In [21]:
# Encode complaint categories
label_encoder = LabelEncoder()
df["Complaint Category"] = label_encoder.fit_transform(df["Type of Complaint"])

In [22]:

# Initialize NLP tools
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [23]:
# Text preprocessing function
def preprocess_text(text):
    text = text.lower()  # Lowercase conversion
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove special characters
    words = word_tokenize(text)  # Tokenization
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Lemmatization & Stopword removal
    return " ".join(words)


In [24]:
# Apply preprocessing
df["Cleaned Description"] = df["Description of the Issue"].apply(preprocess_text)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df["Cleaned Description"], df["Complaint Category"], test_size=0.2, random_state=42
)

In [25]:

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(
    max_features=7000, ngram_range=(1, 2), sublinear_tf=True, stop_words="english"
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [26]:
# Define models
models = {
    "Logistic Regression": LogisticRegression(solver="liblinear", C=1.0),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel="linear", C=1.0),
    "Naive Bayes": MultinomialNB(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss")
}

# Train & Evaluate models
best_model = None
best_accuracy = 0
model_performance = {}

for model_name, model in models.items():
    print(f"Training {model_name}...")
    start_time = time.time()
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    end_time = time.time()
    
    model_performance[model_name] = {
        "accuracy": accuracy,
        "training_time": end_time - start_time
    }
    
    print(f"{model_name} Accuracy: {accuracy:.4f} | Training Time: {end_time - start_time:.2f} seconds")  
    
    # Save the best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

Training Logistic Regression...
Logistic Regression Accuracy: 0.9950 | Training Time: 0.01 seconds
Training Random Forest...
Random Forest Accuracy: 0.9950 | Training Time: 0.16 seconds
Training SVM...
SVM Accuracy: 0.9950 | Training Time: 0.03 seconds
Training Naive Bayes...
Naive Bayes Accuracy: 0.9950 | Training Time: 0.00 seconds
Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Accuracy: 0.9950 | Training Time: 0.20 seconds


In [27]:
# Save the best-performing model
with open("model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)

with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf_vectorizer, f)

print("\n‚úÖ Model training complete! Best model saved.")
print(f"üèÜ Best Model: {best_model} with Accuracy: {best_accuracy:.4f}")

# Print all model performances
print("\nüîç Model Performance Summary:")
for model_name, metrics in model_performance.items():
    print(f"{model_name}: Accuracy = {metrics['accuracy']:.4f}, Training Time = {metrics['training_time']:.2f} sec")



‚úÖ Model training complete! Best model saved.
üèÜ Best Model: LogisticRegression(solver='liblinear') with Accuracy: 0.9950

üîç Model Performance Summary:
Logistic Regression: Accuracy = 0.9950, Training Time = 0.01 sec
Random Forest: Accuracy = 0.9950, Training Time = 0.16 sec
SVM: Accuracy = 0.9950, Training Time = 0.03 sec
Naive Bayes: Accuracy = 0.9950, Training Time = 0.00 sec
XGBoost: Accuracy = 0.9950, Training Time = 0.20 sec
