In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import joblib
from datetime import datetime
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Fetch the NLTK resources used further down: the punkt tokenizer data and
# the stop-word corpus.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Use seaborn's "whitegrid" theme for the figures drawn by this script.
sns.set(style="whitegrid")

# Read the raw customer-support export.
df = pd.read_csv('Customer_support_data.csv')

# Data Preprocessing
# Both timestamp columns are parsed with the same day-month-year format so
# they can be subtracted from each other.
for timestamp_col in ('Issue_reported at', 'issue_responded'):
    df[timestamp_col] = pd.to_datetime(df[timestamp_col], format='%d-%m-%Y %H:%M')

# Minutes elapsed between the issue being reported and the response.
response_delay = df['issue_responded'] - df['Issue_reported at']
df['response_time_minutes'] = response_delay.dt.total_seconds() / 60

# Rows without a customer remark get a fixed placeholder string.
df['Customer Remarks'] = df['Customer Remarks'].fillna('No comment')

# Remove columns considered irrelevant or too high-cardinality to model.
columns_to_drop = [
    'Unique id', 'Order_id', 'order_date_time', 'Survey_response_Date',
    'Customer_City', 'Product_category', 'Item_price', 'connected_handling_time',
    'Supervisor', 'Manager',
]
df = df.drop(columns_to_drop, axis=1)

# English stop words, held in a set for fast membership checks in clean_text.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize a free-text remark before it is vectorized.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is removed, the result is tokenized with
    ``word_tokenize`` and tokens present in the module-level ``stop_words``
    set are discarded.

    Args:
        text: The raw remark. Any value that is not a ``str`` is treated
            as an empty remark.

    Returns:
        The remaining tokens joined by single spaces, or ``''`` when
        ``text`` is not a string.
    """
    if isinstance(text, str):
        without_punct = re.sub(r'[^\w\s]', '', text.lower())
        kept = [tok for tok in word_tokenize(without_punct) if tok not in stop_words]
        return ' '.join(kept)
    return ''

# Run every remark through clean_text before it reaches the vectorizer.
df['Customer Remarks'] = df['Customer Remarks'].apply(clean_text)

# Bar chart of how many rows fall into each CSAT score, written to disk.
dist_fig, dist_ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='CSAT Score', ax=dist_ax)
dist_ax.set_title('Class Distribution of CSAT Scores')
dist_ax.set_xlabel('CSAT Score')
dist_ax.set_ylabel('Count')
dist_fig.savefig('class_distribution.png')
plt.close(dist_fig)

# Target is the CSAT score; every other remaining column is a candidate feature.
y = df['CSAT Score']
X = df.drop(columns=['CSAT Score'])

# Hold out 20% of the rows for evaluation, stratified on the target so both
# splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Column groups consumed by the preprocessing steps below.
categorical_cols = ['channel_name', 'category', 'Sub-category', 'Agent_name', 'Tenure Bucket', 'Agent Shift']
numerical_cols = ['response_time_minutes']
text_col = 'Customer Remarks'

# Categorical columns: fill gaps with a 'missing' category, then one-hot
# encode; categories unseen during fit are ignored at predict time.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Columns not listed in any transformer are not passed to the classifier.
# The text column is given as a plain string (not a list) so the vectorizer
# receives a 1-D sequence of documents.
preprocessor = ColumnTransformer(transformers=[
    ('num', SimpleImputer(strategy='mean'), numerical_cols),
    ('cat', categorical_pipeline, categorical_cols),
    ('text', TfidfVectorizer(max_features=500, stop_words='english'), text_col),
])

# Full model: preprocessing followed by a random forest.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Train the model
model.fit(X_train, y_train)

# Evaluate the model on the held-out split
y_pred = model.predict(X_test)

# Build the confusion matrix once, with the label order fixed to the classes
# the classifier was trained on. This keeps the matrix rows/columns and the
# heatmap tick labels aligned even if a class is missing from the test split
# or is never predicted.
class_labels = model.classes_.tolist()
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(cm)

# Plot confusion matrix heatmap (rows = true score, columns = predicted score)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted CSAT Score')
plt.ylabel('True CSAT Score')
plt.savefig('confusion_matrix.png')
plt.close()

# Plot classification report metrics
report = classification_report(y_test, y_pred, output_dict=True)

# With output_dict=True the per-class entries are keyed by the label as a string.
csat_levels = [1, 2, 3, 4, 5]
metrics_df = pd.DataFrame(
    {
        column: [report[str(level)][metric] for level in csat_levels]
        for column, metric in (
            ('Precision', 'precision'),
            ('Recall', 'recall'),
            ('F1-Score', 'f1-score'),
        )
    },
    index=csat_levels,
)

# DataFrame.plot creates a new figure of its own unless it is handed an Axes.
# Passing ours makes the 10x6 size apply to the saved chart and avoids
# leaving an empty, never-closed figure behind.
metrics_fig, metrics_ax = plt.subplots(figsize=(10, 6))
metrics_df.plot(kind='bar', ax=metrics_ax)
metrics_ax.set_title('Classification Metrics by CSAT Score')
metrics_ax.set_xlabel('CSAT Score')
metrics_ax.set_ylabel('Score')
metrics_ax.legend(title='Metric')
metrics_fig.savefig('classification_metrics.png')
plt.close(metrics_fig)

# Plot feature importance
fitted_preprocessor = model.named_steps['preprocessor']
onehot_encoder = fitted_preprocessor.named_transformers_['cat'].named_steps['onehot']
tfidf_vectorizer = fitted_preprocessor.named_transformers_['text']

feature_importances = model.named_steps['classifier'].feature_importances_

# Names are listed in the same order the transformers are declared in the
# ColumnTransformer (num, cat, text) so they pair up with the importances.
feature_names = list(numerical_cols)
feature_names.extend(onehot_encoder.get_feature_names_out(categorical_cols).tolist())
feature_names.extend(tfidf_vectorizer.get_feature_names_out().tolist())

# Keep only the ten features with the highest importance.
importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
    .sort_values(by='importance', ascending=False)
    .head(10)
)

# Horizontal bar chart of the top features, written to disk.
importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=importance_df, x='importance', y='feature', ax=importance_ax)
importance_ax.set_title('Top 10 Feature Importances')
importance_ax.set_xlabel('Importance')
importance_ax.set_ylabel('Feature')
importance_fig.savefig('feature_importance.png')
plt.close(importance_fig)

# Persist the fitted pipeline (preprocessing + classifier) to disk.
joblib.dump(model, 'csat_prediction_model.pkl')

# Example: score a single hand-written ticket.
example_ticket = {
    'channel_name': 'Inbound',
    'category': 'Returns',
    'Sub-category': 'Reverse Pickup Enquiry',
    'Customer Remarks': 'Good service, quick response',
    'Agent_name': 'John Smith',
    'Tenure Bucket': '>90',
    'Agent Shift': 'Morning',
    'Issue_reported at': '2023-08-01 10:00',
    'issue_responded': '2023-08-01 10:15',
}
new_data = pd.DataFrame([example_ticket])

# The pipeline does not derive the response time or clean the remark itself,
# so repeat those preparation steps here before predicting.
for ts_col in ('Issue_reported at', 'issue_responded'):
    new_data[ts_col] = pd.to_datetime(new_data[ts_col])
elapsed = new_data['issue_responded'] - new_data['Issue_reported at']
new_data['response_time_minutes'] = elapsed.dt.total_seconds() / 60
new_data['Customer Remarks'] = new_data['Customer Remarks'].apply(clean_text)

# Predict
prediction = model.predict(new_data)
print("\nPredicted CSAT Score for new data:", prediction[0])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Classification Report:
              precision    recall  f1-score   support

           1       0.51      0.30      0.38      2246
           2       0.05      0.01      0.02       256
           3       0.05      0.01      0.02       512
           4       0.16      0.05      0.07      2244
           5       0.73      0.92      0.81     11924

    accuracy                           0.68     17182
   macro avg       0.30      0.26      0.26     17182
weighted avg       0.60      0.68      0.62     17182


Confusion Matrix:
[[  685     8    11    72  1470]
 [   32     3     4     9   208]
 [   59     5     7    21   420]
 [  119    10    25   102  1988]
 [  443    31    81   433 10936]]

Predicted CSAT Score for new data: 5


<Figure size 1000x600 with 0 Axes>