In [5]:
# 02_preprocessing.ipynb
# Data Preprocessing and Feature Engineering for Customer Support Ticket Dataset

import os
import pickle

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Read the raw customer-support ticket export into a DataFrame
raw_csv_path = '../data/customer_support_tickets.csv'
df = pd.read_csv(raw_csv_path)
print("Original Shape:", df.shape)

# Keep only rows that carry BOTH a resolution time and a satisfaction rating.
# Rows missing either value are treated as open tickets and are discarded.
df = df.dropna(
    subset=['Time to Resolution', 'Customer Satisfaction Rating'],
    how='any',
)
print("Shape after dropping missing resolution/satisfaction:", df.shape)

# Parse both timestamp columns so they support datetime arithmetic
for ts_col in ('First Response Time', 'Date of Purchase'):
    df[ts_col] = pd.to_datetime(df[ts_col])

# Hours elapsed between the purchase date and the first response.
# Rows where either timestamp is missing produce NaN and are filled with the
# median of the observed gaps. Only this derived column is imputed; the
# 'First Response Time' column itself is left as parsed.
response_gap = df['First Response Time'] - df['Date of Purchase']
response_gap_hours = response_gap.dt.total_seconds() / 3600  # seconds -> hours
median_response_time = response_gap_hours.median()
df['First Response Time Delta'] = response_gap_hours.fillna(median_response_time)
print("Median First Response Time (hours):", median_response_time)

# Substitute placeholder text for missing 'Resolution' entries so the column
# is fully populated for later NLP use
df['Resolution'] = df['Resolution'].fillna('No resolution provided')

# Integer-encode the categorical columns. Each column gets its own fitted
# LabelEncoder, bound to a dedicated name so it can be persisted afterwards.
le_type, le_channel, le_product, le_priority = (LabelEncoder() for _ in range(4))

# (source column, encoded column, encoder) -- processed in this order so the
# new columns are appended to df in the same sequence as before
encoding_plan = (
    ('Ticket Type', 'Ticket Type Enc', le_type),
    ('Ticket Channel', 'Ticket Channel Enc', le_channel),
    ('Product Purchased', 'Product Enc', le_product),
    ('Ticket Priority', 'Ticket Priority Enc', le_priority),
)
for source_col, encoded_col, encoder in encoding_plan:
    df[encoded_col] = encoder.fit_transform(df[source_col])

# Derive calendar features from the purchase timestamp:
# day of week and month number
purchase_dt = df['Date of Purchase'].dt
df['Purchase Day'] = purchase_dt.dayofweek
df['Purchase Month'] = purchase_dt.month

# NLP features for Ticket Description
# TF-IDF over the ticket text, with the vocabulary capped at 100 terms.
# Missing descriptions are replaced with an empty string first, because
# TfidfVectorizer raises on NaN documents; this mirrors the placeholder fill
# already applied to 'Resolution'. With no missing values the result is
# unchanged.
ticket_text = df['Ticket Description'].fillna('')
vectorizer = TfidfVectorizer(max_features=100)
nlp_features = vectorizer.fit_transform(ticket_text).toarray()

# Column 'text_<i>' holds the i-th column of the TF-IDF matrix
nlp_cols = [f'text_{i}' for i in range(nlp_features.shape[1])]

# Reuse df's index so the concat aligns row-for-row (df's index is no longer
# contiguous after the earlier dropna)
df_nlp = pd.DataFrame(nlp_features, columns=nlp_cols, index=df.index)
df = pd.concat([df, df_nlp], axis=1)

# Save processed data and encoders
processed_csv_path = '../data/processed/processed_tickets.csv'
models_dir = '../models'

# Neither DataFrame.to_csv nor open(..., 'wb') creates missing parent
# directories, so make sure both output locations exist before writing.
os.makedirs(os.path.dirname(processed_csv_path), exist_ok=True)
os.makedirs(models_dir, exist_ok=True)

# Write the fully processed table without the index column
df.to_csv(processed_csv_path, index=False)

# Persist every fitted transformer under its own file name
fitted_artifacts = {
    'label_encoder_type.pkl': le_type,
    'label_encoder_channel.pkl': le_channel,
    'label_encoder_product.pkl': le_product,
    'label_encoder_priority.pkl': le_priority,
    'tfidf_vectorizer.pkl': vectorizer,
}
for artifact_name, artifact in fitted_artifacts.items():
    with open(f'{models_dir}/{artifact_name}', 'wb') as f:
        pickle.dump(artifact, f)

print(f"Preprocessing complete. Processed data saved to '{processed_csv_path}'.")

Original Shape: (8469, 17)
Shape after dropping missing resolution/satisfaction: (2769, 17)
Median First Response Time (hours): 21303.618055555555
Preprocessing complete. Processed data saved to '../data/processed/processed_tickets.csv'.
