In [1]:
import pandas as pd

# Load the dataset
file_path = '/content/complaints_category_dataset.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
data.head(), data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 492 entries, 0 to 491
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Complaints  483 non-null    object
 1   Categories  483 non-null    object
dtypes: object(2)
memory usage: 7.8+ KB


(                                          Complaints      Categories
 0                                                NaN             NaN
 1  I need help fixing the software issue on my de...  Account Issues
 2          The app keeps crashing, I need assistance  Account Issues
 3  Can you help me resolve the connectivity issue...  Account Issues
 4   I'm unable to log into my account, please assist  Account Issues,
 None)

In [2]:
# Drop rows with missing values
data_cleaned = data.dropna()

# Check the cleaned data
data_cleaned.info(), data_cleaned.head()

<class 'pandas.core.frame.DataFrame'>
Index: 483 entries, 1 to 491
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Complaints  483 non-null    object
 1   Categories  483 non-null    object
dtypes: object(2)
memory usage: 11.3+ KB


(None,
                                           Complaints      Categories
 1  I need help fixing the software issue on my de...  Account Issues
 2          The app keeps crashing, I need assistance  Account Issues
 3  Can you help me resolve the connectivity issue...  Account Issues
 4   I'm unable to log into my account, please assist  Account Issues
 5  The website is not loading properly, can you i...  Account Issues)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
import re

# Text preprocessing: Remove special characters and tokenize
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    return text

# Apply preprocessing to the 'Complaints' column
data_cleaned['Complaints'] = data_cleaned['Complaints'].apply(preprocess_text)

# Split the data into features (X) and target (y)
X = data_cleaned['Complaints']
y = data_cleaned['Categories']

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build a pipeline with TF-IDF and Logistic Regression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),  # TF-IDF Vectorizer
    ('clf', LogisticRegression())  # Logistic Regression Classifier
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

accuracy, report

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned['Complaints'] = data_cleaned['Complaints'].apply(preprocess_text)


(0.7835051546391752,
 '                 precision    recall  f1-score   support\n\n Account Issues       0.71      1.00      0.83        12\n        Billing       0.85      1.00      0.92        22\nDelivery Issues       0.86      1.00      0.92         6\n  Miscellaneous       1.00      0.17      0.29        12\n   Order Issues       0.33      0.20      0.25         5\n Product Issues       0.76      0.89      0.82        18\n        Quality       0.67      0.40      0.50         5\n         Refund       0.73      0.80      0.76        10\n       Warranty       1.00      1.00      1.00         7\n\n       accuracy                           0.78        97\n      macro avg       0.77      0.72      0.70        97\n   weighted avg       0.80      0.78      0.74        97\n')

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define a pipeline with TF-IDF and Random Forest Classifier
pipeline_rf = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', RandomForestClassifier(random_state=42))
])

# Set up the parameter grid for hyperparameter tuning
param_grid = {
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth': [None, 10, 20, 30],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4]
}

# Set up the GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(pipeline_rf, param_grid, cv=3, n_jobs=-1, verbose=2)

# Train the model with grid search
grid_search.fit(X_train, y_train)

# Get the best model after tuning
best_model = grid_search.best_estimator_

# Predict on the test set using the best model
y_pred_rf = best_model.predict(X_test)

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
report_rf = classification_report(y_test, y_pred_rf)

accuracy_rf, report_rf, grid_search.best_params_


Fitting 3 folds for each of 108 candidates, totalling 324 fits


(0.8247422680412371,
 '                 precision    recall  f1-score   support\n\n Account Issues       0.80      1.00      0.89        12\n        Billing       0.84      0.95      0.89        22\nDelivery Issues       0.71      0.83      0.77         6\n  Miscellaneous       1.00      0.42      0.59        12\n   Order Issues       0.80      0.80      0.80         5\n Product Issues       0.89      0.89      0.89        18\n        Quality       0.67      0.40      0.50         5\n         Refund       0.67      0.80      0.73        10\n       Warranty       1.00      1.00      1.00         7\n\n       accuracy                           0.82        97\n      macro avg       0.82      0.79      0.78        97\n   weighted avg       0.84      0.82      0.81        97\n',
 {'clf__max_depth': None,
  'clf__min_samples_leaf': 1,
  'clf__min_samples_split': 2,
  'clf__n_estimators': 100})

In [7]:
import pickle
# Assuming 'model' is your trained model
with open('best_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

In [8]:
with open('best_model.pkl', 'rb') as file:
    model = pickle.load(file)  # Ensure no errors here

In [5]:
def predict_category(inquiry, model=best_model):
  """Predicts the category of a user's inquiry using the trained model."""
  preprocessed_inquiry = preprocess_text(inquiry)
  predicted_category = model.predict([preprocessed_inquiry])[0]
  return predicted_category


# Get user input
user_inquiry = input("Please enter your inquiry: ")

# Predict the category
predicted_category = predict_category(user_inquiry)

# Output the predicted category
print("Predicted Category:", predicted_category)

Please enter your inquiry: I want to cancel my subscription, how do I do that?
Predicted Category: Miscellaneous
