In [None]:
import pandas as pd

# Load the dataset
file_path = '/content/customer_inquiries.csv'
data = pd.read_csv(file_path)

# Inspect the structure. `info()` prints its summary and returns None, so it is
# called as a statement; `head()` is left as the cell's last expression so the
# frame gets the notebook's rich display instead of being wrapped in a tuple
# together with a stray `None`.
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1113 entries, 0 to 1112
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Complaint  1113 non-null   object
 1   Category   1113 non-null   object
dtypes: object(2)
memory usage: 17.5+ KB


(                                           Complaint Category
 0  I was overcharged on my last bill; the amount ...  Billing
 1  The discount I applied didn’t reflect on my in...  Billing
 2  Why am I being charged twice for the same prod...  Billing
 3  My billing statement has hidden fees I didn’t ...  Billing
 4  I need an explanation for the unexpected incre...  Billing,
 None)

In [None]:
# Drop rows with missing values
data_cleaned = data.dropna()

# Check the cleaned data: `info()` prints its summary (and returns None), then
# `head()` is displayed as the cell's result rather than inside a
# `(None, frame)` tuple.
data_cleaned.info()
data_cleaned.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1113 entries, 0 to 1112
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Complaint  1113 non-null   object
 1   Category   1113 non-null   object
dtypes: object(2)
memory usage: 17.5+ KB


(None,
                                            Complaint Category
 0  I was overcharged on my last bill; the amount ...  Billing
 1  The discount I applied didn’t reflect on my in...  Billing
 2  Why am I being charged twice for the same prod...  Billing
 3  My billing statement has hidden fees I didn’t ...  Billing
 4  I need an explanation for the unexpected incre...  Billing)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import nltk

# Fetch the NLTK corpora needed below (stopword list and WordNet)
for _corpus in ('stopwords', 'wordnet'):
    nltk.download(_corpus)

# Module-level helpers used by preprocess_text: the lemmatizer and a set of
# English stopwords for membership tests
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize one complaint string into a space-separated list of lemmas.

    Steps, in order:
      1. lowercase the text;
      2. delete every character that is not an ASCII letter or whitespace;
      3. collapse whitespace runs to a single space and trim the ends;
      4. expand a few apostrophe-less contractions (im, wont, dont, cant, doesnt);
      5. collapse any character repeated three or more times to one occurrence;
      6. drop tokens found in the module-level `stop_words` set and tokens of
         two characters or fewer, lemmatizing the rest with the module-level
         `lemmatizer`.

    Args:
        text: the raw string to clean (must support `.lower()`).

    Returns:
        The surviving lemmas joined by single spaces.
    """
    # Steps 1-3: case-fold, strip non-letters, tidy whitespace
    cleaned = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    # Step 4: contractions lose their apostrophe in step 2, so they are matched
    # in their apostrophe-less form. Order matches the sequence of rewrites.
    contraction_rules = (
        (r'\bim\b', 'i am'),
        (r'\bwont\b', 'will not'),
        (r'\bdont\b', 'do not'),
        (r'\bcant\b', 'cannot'),
        (r'\bdoesnt\b', 'does not'),
    )
    for pattern, expansion in contraction_rules:
        cleaned = re.sub(pattern, expansion, cleaned)

    # Step 5: e.g. "loooove" -> "love"
    cleaned = re.sub(r'(.)\1{2,}', r'\1', cleaned)

    # Step 6: filter on the raw token first, then lemmatize what is kept
    lemmas = []
    for token in cleaned.split():
        if token in stop_words or len(token) <= 2:
            continue
        lemmas.append(lemmatizer.lemmatize(token))

    return ' '.join(lemmas)

from sklearn.model_selection import GridSearchCV

# Clean every complaint with preprocess_text. `assign` builds a new frame and
# rebinds `data_cleaned`, rather than writing a column into the result of
# `dropna()`, which can trigger pandas' SettingWithCopyWarning.
data_cleaned = data_cleaned.assign(
    Complaint=data_cleaned['Complaint'].apply(preprocess_text)
)

# Split the data into features (X) and target (y)
X = data_cleaned['Complaint']
y = data_cleaned['Category']

# Hold out 25% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Pipeline: TF-IDF over word 1- to 3-grams feeding a class-weighted logistic regression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=15000, ngram_range=(1, 3), min_df=2, max_df=0.9,
                              smooth_idf=True, sublinear_tf=True)),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced')),  # raised max_iter
])

# Hyper-parameters explored by the grid search (3 x 3 = 9 candidates)
param_grid = {
    'tfidf__max_features': [10000, 15000, 20000],
    'clf__C': [0.1, 1, 10]
}

# 3-fold cross-validated search on the training split only
grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# GridSearchCV refits the best parameter combination on all of X_train by
# default (refit=True). Keep that fitted pipeline as `best_model`: it is the
# object the later cells pickle and use for prediction, and it is the one that
# must be evaluated here (the search fits clones, so `pipeline` itself is
# left unfitted).
best_model = grid_search.best_estimator_

# Predict on the held-out test set with the tuned model
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print rather than echoing a tuple so the multi-line report renders as a table
print("Best parameters:", grid_search.best_params_)
print(f"Test accuracy: {accuracy:.4f}")
print(report)


Fitting 3 folds for each of 9 candidates, totalling 27 fits


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


(0.8709677419354839,
 '                   precision    recall  f1-score   support\n\n   Account Issues       0.89      0.94      0.91        33\n          Billing       1.00      0.89      0.94        36\n  Delivery Issues       0.87      0.81      0.84        32\n    Miscellaneous       0.93      0.65      0.76        20\n     Order Issues       0.75      0.90      0.82        30\n   Product Issues       0.74      0.71      0.73        28\n          Quality       0.86      0.86      0.86        28\n           Refund       0.85      1.00      0.92        17\nTechnical Support       0.90      0.93      0.92        30\n         Warranty       0.96      1.00      0.98        25\n\n         accuracy                           0.87       279\n        macro avg       0.87      0.87      0.87       279\n     weighted avg       0.88      0.87      0.87       279\n')

In [None]:
import pickle
# Serialize the object bound to `best_model` into best_model.pkl
with open('best_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [None]:
# Read best_model.pkl back into `model`; this should complete without errors.
# Unpickling can execute arbitrary code, so only load pickle files you created
# yourself or otherwise trust.
with open('best_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
def predict_category(inquiry, model=best_model):
    """Return the category the model predicts for a single inquiry.

    Args:
        inquiry: raw inquiry text; it is cleaned with `preprocess_text`
            before being handed to the model.
        model: object exposing `predict` on a list of strings. Defaults to
            `best_model` as it is bound when this definition is executed.

    Returns:
        The first (and only) element of `model.predict` for the cleaned text.
    """
    cleaned = preprocess_text(inquiry)
    # predict() receives a one-item list, so take the single prediction out
    return model.predict([cleaned])[0]


# Interactive demo: read one inquiry from the user, classify it with
# predict_category (default model), and print the resulting label.
user_inquiry = input("Please enter your inquiry: ")
predicted_category = predict_category(user_inquiry)
print("Predicted Category:", predicted_category)

Please enter your inquiry: my subscription isues
Predicted Category: Billing
