In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('../Data/customer_support_tickets.csv')

In [3]:
df.head()

Unnamed: 0,Ticket ID,Customer Name,Customer Email,Customer Age,Customer Gender,Product Purchased,Date of Purchase,Ticket Type,Ticket Subject,Ticket Description,Ticket Status,Resolution,Ticket Priority,Ticket Channel,First Response Time,Time to Resolution,Customer Satisfaction Rating
0,1,Marisa Obrien,carrollallison@example.com,32,Other,GoPro Hero,2021-03-22,Technical issue,Product setup,I'm having an issue with the {product_purchase...,Pending Customer Response,,Critical,Social media,2023-06-01 12:15:36,,
1,2,Jessica Rios,clarkeashley@example.com,42,Female,LG Smart TV,2021-05-22,Technical issue,Peripheral compatibility,I'm having an issue with the {product_purchase...,Pending Customer Response,,Critical,Chat,2023-06-01 16:45:38,,
2,3,Christopher Robbins,gonzalestracy@example.com,48,Other,Dell XPS,2020-07-14,Technical issue,Network problem,I'm facing a problem with my {product_purchase...,Closed,Case maybe show recently my computer follow.,Low,Social media,2023-06-01 11:14:38,2023-06-01 18:05:38,3.0
3,4,Christina Dillon,bradleyolson@example.org,27,Female,Microsoft Office,2020-11-13,Billing inquiry,Account access,I'm having an issue with the {product_purchase...,Closed,Try capital clearly never color toward story.,Low,Social media,2023-06-01 07:29:40,2023-06-01 01:57:40,3.0
4,5,Alexander Carroll,bradleymark@example.com,67,Female,Autodesk AutoCAD,2020-02-04,Billing inquiry,Data loss,I'm having an issue with the {product_purchase...,Closed,West decision evidence bit.,Low,Email,2023-06-01 00:12:42,2023-06-01 19:53:42,1.0


# Data Understanding & Cleaning

In [4]:
df.shape

(8469, 17)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8469 entries, 0 to 8468
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Ticket ID                     8469 non-null   int64  
 1   Customer Name                 8469 non-null   object 
 2   Customer Email                8469 non-null   object 
 3   Customer Age                  8469 non-null   int64  
 4   Customer Gender               8469 non-null   object 
 5   Product Purchased             8469 non-null   object 
 6   Date of Purchase              8469 non-null   object 
 7   Ticket Type                   8469 non-null   object 
 8   Ticket Subject                8469 non-null   object 
 9   Ticket Description            8469 non-null   object 
 10  Ticket Status                 8469 non-null   object 
 11  Resolution                    2769 non-null   object 
 12  Ticket Priority               8469 non-null   object 
 13  Tic

In [6]:
df.describe()

Unnamed: 0,Ticket ID,Customer Age,Customer Satisfaction Rating
count,8469.0,8469.0,2769.0
mean,4235.0,44.026804,2.991333
std,2444.934048,15.296112,1.407016
min,1.0,18.0,1.0
25%,2118.0,31.0,2.0
50%,4235.0,44.0,3.0
75%,6352.0,57.0,4.0
max,8469.0,70.0,5.0


In [7]:
# Drop 2 columns because he has containe approx 70% null values and drop unnecessary columns
df = df.drop(['Ticket ID', 'Customer Name', 'Customer Email', 'Ticket Description', 'Resolution', 'Time to Resolution', 'Ticket Priority','Ticket Type','Ticket Channel', 'Ticket Status'], axis = 1)

In [8]:
# Drop all null rows in Customer Satisfaction Rating
df = df[df['Customer Satisfaction Rating'].notnull()]

In [9]:
df['Date of Purchase'] = pd.to_datetime(df['Date of Purchase'])
df['First Response Time'] = pd.to_datetime(df['First Response Time'])

In [10]:
df

Unnamed: 0,Customer Age,Customer Gender,Product Purchased,Date of Purchase,Ticket Subject,First Response Time,Customer Satisfaction Rating
2,48,Other,Dell XPS,2020-07-14,Network problem,2023-06-01 11:14:38,3.0
3,27,Female,Microsoft Office,2020-11-13,Account access,2023-06-01 07:29:40,3.0
4,67,Female,Autodesk AutoCAD,2020-02-04,Data loss,2023-06-01 00:12:42,1.0
10,48,Male,Nintendo Switch,2021-01-19,Data loss,2023-06-01 17:46:49,1.0
11,51,Male,Microsoft Xbox Controller,2021-10-24,Software bug,2023-06-01 12:05:51,1.0
...,...,...,...,...,...,...,...
8452,62,Female,MacBook Pro,2020-03-29,Display issue,2023-06-01 14:14:05,3.0
8453,61,Other,iPhone,2020-06-20,Peripheral compatibility,2023-06-01 12:56:06,5.0
8455,54,Other,Dyson Vacuum Cleaner,2021-02-02,Refund request,2023-06-01 14:15:07,1.0
8466,57,Female,GoPro Action Camera,2021-08-17,Account access,2023-06-01 09:44:22,3.0


# Feature Engineering

In [11]:
# Feature Engineering – Days Since Purchase
today = pd.Timestamp.today()

df['Days_Since_Purchase'] = (today - df['Date of Purchase']).dt.days

# Drop original date columns
df.drop(columns=['Date of Purchase', 'First Response Time'], inplace=True)

In [12]:
# Categorize satisfaction rating
def categorize(rating):
    if rating <= 2:
        return 'Low'
    elif rating == 3:
        return 'Medium'
    else:
        return 'High'


# Apply to data
df['Satisfaction_Level'] = df['Customer Satisfaction Rating'].apply(categorize)


In [13]:
df

Unnamed: 0,Customer Age,Customer Gender,Product Purchased,Ticket Subject,Customer Satisfaction Rating,Days_Since_Purchase,Satisfaction_Level
2,48,Other,Dell XPS,Network problem,3.0,1849,Medium
3,27,Female,Microsoft Office,Account access,3.0,1727,Medium
4,67,Female,Autodesk AutoCAD,Data loss,1.0,2010,Low
10,48,Male,Nintendo Switch,Data loss,1.0,1660,Low
11,51,Male,Microsoft Xbox Controller,Software bug,1.0,1382,Low
...,...,...,...,...,...,...,...
8452,62,Female,MacBook Pro,Display issue,3.0,1956,Medium
8453,61,Other,iPhone,Peripheral compatibility,5.0,1873,High
8455,54,Other,Dyson Vacuum Cleaner,Refund request,1.0,1646,Low
8466,57,Female,GoPro Action Camera,Account access,3.0,1450,Medium


In [14]:
df.to_csv('pipline.csv')

# scaling

In [15]:
num_col = df.select_dtypes(['int', 'float'])
num_col = num_col.drop(['Customer Satisfaction Rating'], axis = 1)
target_col = df['Customer Satisfaction Rating']

In [16]:
# scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
num_col = scaler.fit_transform(num_col)
num_df = pd.DataFrame(num_col, columns = ['Customer Age', 'Days_Since_Purchase'])

# Encoding

In [17]:
from sklearn.preprocessing import LabelEncoder
label_encoders = pd.DataFrame()
cat_col = df.select_dtypes('object')
cat_col = df.drop(['Customer Age', 'Days_Since_Purchase', 'Customer Satisfaction Rating'], axis = 1)

for col in cat_col:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = df[col]

# Reset index

In [18]:
num_df =  num_df.reset_index(drop = True)
label_encoders =   label_encoders.reset_index(drop = True)
target_col =-   target_col.reset_index(drop = True)

# Concate all data frames

In [19]:
final_df = pd.concat([num_df, label_encoders, target_col], axis = 1)

In [20]:
final_df

Unnamed: 0,Customer Age,Days_Since_Purchase,Customer Gender,Product Purchased,Ticket Subject,Satisfaction_Level,Customer Satisfaction Rating
0,0.241319,0.793096,2,10,8,2,-3.0
1,-1.140919,0.216145,0,25,0,2,-3.0
2,1.491916,1.554482,0,5,3,1,-1.0
3,0.241319,-0.100705,1,30,3,1,-1.0
4,0.438782,-1.415396,1,27,15,1,-1.0
...,...,...,...,...,...,...,...
2764,1.162812,1.299110,0,24,5,2,-3.0
2765,1.096991,0.906594,2,41,10,0,-5.0
2766,0.636245,-0.166912,2,11,14,1,-1.0
2767,0.833707,-1.093817,0,15,0,2,-3.0


In [21]:
# Save the data set 

final_df.to_csv('clean_data.csv')