In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

In [30]:
calls = pd.read_csv('/content/drive/MyDrive/UnitedData/callsf0d4f5a.csv',  engine='python',on_bad_lines='skip')
reason = pd.read_csv('/content/drive/MyDrive/UnitedData/reason18315ff.csv')
sentiment = pd.read_csv('/content/drive/MyDrive/UnitedData/sentiment_statisticscc1e57a.csv',  engine='python',on_bad_lines='skip')
customers = pd.read_csv('/content/drive/MyDrive/UnitedData/customers2afd6ea.csv',  engine='python',on_bad_lines='skip')
test = pd.read_csv('/content/drive/MyDrive/UnitedData/testbc7185d.csv')

#  **Data Preprocessing**

In [None]:
## Coverting strings to Date time format

calls['call_start_datetime'] = pd.to_datetime(calls['call_start_datetime'], format='%m/%d/%Y %H:%M')
calls['agent_assigned_datetime'] = pd.to_datetime(calls['agent_assigned_datetime'], format='%m/%d/%Y %H:%M')
calls['call_end_datetime'] = pd.to_datetime(calls['call_end_datetime'], format='%m/%d/%Y %H:%M')

calls['ST'] = calls['agent_assigned_datetime'] - calls['call_start_datetime']
calls['HT'] = calls['call_end_datetime'] - calls['agent_assigned_datetime']

calls.head(10)

Unnamed: 0,call_id,customer_id,agent_id,call_start_datetime,agent_assigned_datetime,call_end_datetime,call_transcript,ST,HT
0,4667960400,2033123310,963118,2024-07-31 23:56:00,2024-08-01 00:03:00,2024-08-01 00:34:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:07:00,0 days 00:31:00
1,1122072124,8186702651,519057,2024-08-01 00:03:00,2024-08-01 00:06:00,2024-08-01 00:18:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:03:00,0 days 00:12:00
2,6834291559,2416856629,158319,2024-07-31 23:59:00,2024-08-01 00:07:00,2024-08-01 00:26:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:08:00,0 days 00:19:00
3,2266439882,1154544516,488324,2024-08-01 00:05:00,2024-08-01 00:10:00,2024-08-01 00:17:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:05:00,0 days 00:07:00
4,1211603231,5214456437,721730,2024-08-01 00:04:00,2024-08-01 00:14:00,2024-08-01 00:23:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:10:00,0 days 00:09:00
5,5297766997,5590154991,817160,2024-08-01 00:11:00,2024-08-01 00:16:00,2024-08-01 00:40:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:05:00,0 days 00:24:00
6,324593040,6774865122,519057,2024-08-01 00:08:00,2024-08-01 00:21:00,2024-08-01 00:34:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:13:00,0 days 00:13:00
7,8902603117,7974326984,488324,2024-08-01 00:13:00,2024-08-01 00:21:00,2024-08-01 00:29:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:08:00,0 days 00:08:00
8,7222687732,8023417234,957331,2024-08-01 00:14:00,2024-08-01 00:22:00,2024-08-01 00:35:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:08:00,0 days 00:13:00
9,4113684017,1528835057,158319,2024-08-01 00:20:00,2024-08-01 00:28:00,2024-08-01 00:37:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:08:00,0 days 00:09:00


# Combining all tables

In [None]:
# calls.head(10)
# reason.head(10)
# sentiment.head(10)
customers.head(10)
combined = pd.merge(calls, reason, on='call_id', how='left')
combined = pd.merge(combined, sentiment, on='call_id', how='left')
combined = pd.merge(combined, customers, on='customer_id', how='left')
combined.head(10)

Unnamed: 0,call_id,customer_id,agent_id_x,call_start_datetime,agent_assigned_datetime,call_end_datetime,call_transcript,ST,HT,primary_call_reason,agent_id_y,agent_tone,customer_tone,average_sentiment,silence_percent_average,customer_name,elite_level_code
0,4667960400,2033123310,963118,2024-07-31 23:56:00,2024-08-01 00:03:00,2024-08-01 00:34:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:07:00,0 days 00:31:00,Voluntary Cancel,963118,neutral,angry,-0.04,0.39,Matthew Foster,4.0
1,1122072124,8186702651,519057,2024-08-01 00:03:00,2024-08-01 00:06:00,2024-08-01 00:18:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:03:00,0 days 00:12:00,Booking,519057,calm,neutral,0.02,0.35,Tammy Walters,
2,6834291559,2416856629,158319,2024-07-31 23:59:00,2024-08-01 00:07:00,2024-08-01 00:26:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:08:00,0 days 00:19:00,IRROPS,158319,neutral,polite,-0.13,0.32,Jeffery Dixon,
3,2266439882,1154544516,488324,2024-08-01 00:05:00,2024-08-01 00:10:00,2024-08-01 00:17:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:05:00,0 days 00:07:00,Upgrade,488324,neutral,frustrated,-0.2,0.2,David Wilkins,2.0
4,1211603231,5214456437,721730,2024-08-01 00:04:00,2024-08-01 00:14:00,2024-08-01 00:23:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:10:00,0 days 00:09:00,Seating,721730,neutral,polite,-0.05,0.35,Elizabeth Daniels,0.0
5,5297766997,5590154991,817160,2024-08-01 00:11:00,2024-08-01 00:16:00,2024-08-01 00:40:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:05:00,0 days 00:24:00,Mileage Plus,817160,calm,calm,0.1,0.11,Emily Alexander,5.0
6,324593040,6774865122,519057,2024-08-01 00:08:00,2024-08-01 00:21:00,2024-08-01 00:34:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:13:00,0 days 00:13:00,Checkout,519057,neutral,frustrated,0.0,0.25,James Lawrence,
7,8902603117,7974326984,488324,2024-08-01 00:13:00,2024-08-01 00:21:00,2024-08-01 00:29:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:08:00,0 days 00:08:00,Mileage Plus,488324,calm,angry,0.11,0.35,Megan Craig,2.0
8,7222687732,8023417234,957331,2024-08-01 00:14:00,2024-08-01 00:22:00,2024-08-01 00:35:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:08:00,0 days 00:13:00,Mileage Plus,957331,calm,neutral,0.06,0.54,Alexa Martinez,1.0
9,4113684017,1528835057,158319,2024-08-01 00:20:00,2024-08-01 00:28:00,2024-08-01 00:37:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:08:00,0 days 00:09:00,IRROPS,158319,neutral,angry,-0.19,0.29,Jonathan Parker,


In [None]:
relevant_vars = ['call_start_datetime','call_transcript','ST','HT','agent_tone','customer_tone','average_sentiment','silence_percent_average','elite_level_code']
regression_data = combined[relevant_vars]
regression_data['HT_seconds'] = regression_data['HT'].dt.total_seconds()
regression_data['ST_seconds'] = regression_data['ST'].dt.total_seconds()
regression_data['customer_tone'] = regression_data['customer_tone'].map({'angry': -2,'frustrated': -1, 'neutral': 0, 'calm': 1, 'polite': 2})
regression_data['agent_tone'] = regression_data['agent_tone'].map({'angry': -2,'frustrated': -1, 'neutral': 0, 'calm': 1, 'polite': 2})
regression_data.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regression_data['HT_seconds'] = regression_data['HT'].dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regression_data['ST_seconds'] = regression_data['ST'].dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regression_data['customer_tone'] = regression_data['custome

Unnamed: 0,call_start_datetime,call_transcript,ST,HT,agent_tone,customer_tone,average_sentiment,silence_percent_average,elite_level_code,HT_seconds,ST_seconds
0,2024-07-31 23:56:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:07:00,0 days 00:31:00,0.0,-2,-0.04,0.39,4.0,1860.0,420.0
1,2024-08-01 00:03:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:03:00,0 days 00:12:00,1.0,0,0.02,0.35,,720.0,180.0
2,2024-07-31 23:59:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:08:00,0 days 00:19:00,0.0,2,-0.13,0.32,,1140.0,480.0
3,2024-08-01 00:05:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:05:00,0 days 00:07:00,0.0,-1,-0.2,0.2,2.0,420.0,300.0
4,2024-08-01 00:04:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:10:00,0 days 00:09:00,0.0,2,-0.05,0.35,0.0,540.0,600.0
5,2024-08-01 00:11:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:05:00,0 days 00:24:00,1.0,1,0.1,0.11,5.0,1440.0,300.0
6,2024-08-01 00:08:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:13:00,0 days 00:13:00,0.0,-1,0.0,0.25,,780.0,780.0
7,2024-08-01 00:13:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:08:00,0 days 00:08:00,1.0,-2,0.11,0.35,2.0,480.0,480.0
8,2024-08-01 00:14:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:08:00,0 days 00:13:00,1.0,0,0.06,0.54,1.0,780.0,480.0
9,2024-08-01 00:20:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:08:00,0 days 00:09:00,0.0,-2,-0.19,0.29,,540.0,480.0


# **Defining Transformers**

**TimeFeaturesExtractor**: This transformer extracts time-based features from a column containing call start datetimes (call_start_datetime). It creates:



*  call_day: The day of the month when the call started.
*  call_hour: The hour of the day when the call started.
*   call_day_of_week: The day of the week (0 = Monday, 6 = Sunday).
*   is_weekend: A flag (1 if the call was made on the weekend, 0 otherwise).

**RatioFeaturesExtractor**: This transformer creates a ratio feature for HT_seconds (handle time) and ST_seconds (service time). It handles division by zero and infinite values by replacing them with NaN and then filling NaN with 0.

ht_st_ratio: The ratio of handle time (HT_seconds) to service time (ST_seconds).

**PrimaryCallReasonCleaner**: This transformer cleans and standardizes the primary_call_reason column. It:

* Strips leading/trailing whitespaces.
* Replaces multiple spaces with a single space.
* Applies custom replacements for specific strings (e.g., replaces & with and, maps np.nan to "Unknown").

**ToneEncoder**: This transformer encodes the emotional tone of both the customer and the agent. It maps tone categories to numerical values for model usage. The tone is represented by the following:

* angry: -2
* frustrated: -1
* neutral: 0
* calm: 1
* polite: 2
* NaN values are mapped to 0.

customer_tone: Encoded tone of the customer.
agent_tone: Encoded tone of the agent.



In [None]:
class TimeFeaturesExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # Extract time-based features from 'call_start_datetime'
        X_copy = X.copy()
        X_copy['call_start_datetime'] = pd.to_datetime(X_copy['call_start_datetime'], errors='coerce')
        X_copy['call_day'] = X_copy['call_start_datetime'].dt.day.fillna(-1).astype(int)
        X_copy['call_hour'] = X_copy['call_start_datetime'].dt.hour.fillna(-1).astype(int)
        X_copy['call_day_of_week'] = X_copy['call_start_datetime'].dt.dayofweek.fillna(-1).astype(int)
        X_copy['is_weekend'] = X_copy['call_day_of_week'].apply(lambda x: 1 if x >= 5 else 0)
        return X_copy[['call_day', 'call_hour', 'call_day_of_week', 'is_weekend']]
class RatioFeaturesExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # Create the ratio feature for HT_seconds/ST_seconds
        X_copy = X.copy()
        X_copy['ht_st_ratio'] = (X_copy['HT_seconds'] / X_copy['ST_seconds']).replace([np.inf, -np.inf], np.nan)
        X_copy['ht_st_ratio'] =  X_copy['ht_st_ratio'].fillna(0)
        return X_copy[['ht_st_ratio']]
class PrimaryCallReasonCleaner(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.replace_dict = {
            'Mileage Plus': 'Mileage Plus',
            'Voluntary Change': 'Voluntary Change',
            'Voluntary Cancel': 'Voluntary Cancel',
            'Check In': 'Check-In',
            'Post Flight': 'Post-Flight',
            'Other Topics': 'Other Topics',
            'Products & Services': 'Products and Services',
            'Digital Support': 'Digital Support',
            'Traveler Updates': 'Traveler Updates',
            '&': 'and',
            np.nan: "Unknown"

        }

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_copy = X.copy()
        # Stripping whitespace and replacing multiple spaces with single space
        X_copy['primary_call_reason'] = X_copy['primary_call_reason'].str.strip()
        X_copy['primary_call_reason'] = X_copy['primary_call_reason'].replace(r'\s+', ' ', regex=True)
        # Replacing based on the replace_dict
        X_copy['primary_call_reason'] = X_copy['primary_call_reason'].replace(self.replace_dict)
        return X_copy[['primary_call_reason']]
class ToneEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.tone_mapping = {
            'angry': -2,
            'frustrated': -1,
            'neutral': 0,
            'calm': 1,
            'polite': 2,
            np.nan: 0
        }

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_copy = X.copy()
        X_copy['customer_tone'] = X_copy['customer_tone'].map(self.tone_mapping)
        X_copy['agent_tone'] = X_copy['agent_tone'].map(self.tone_mapping)
        return X_copy[['customer_tone', 'agent_tone']]

In [None]:
combined['HT_seconds'] = combined['HT'].dt.total_seconds()
combined['ST_seconds'] = combined['ST'].dt.total_seconds()
reason_cleaner = PrimaryCallReasonCleaner()
X = reason_cleaner.fit_transform(combined[['primary_call_reason']])# Target
y = X['primary_call_reason']
X = combined.drop(columns=['primary_call_reason'])  # Feature set



In [None]:
len(X)

71810

In [None]:
X_copy = combined.copy()
X_copy[['primary_call_reason']] = reason_cleaner.fit_transform(combined[['primary_call_reason']])# Target
X_copy.head(10)

Unnamed: 0,call_id,customer_id,agent_id_x,call_start_datetime,agent_assigned_datetime,call_end_datetime,call_transcript,ST,HT,primary_call_reason,agent_id_y,agent_tone,customer_tone,average_sentiment,silence_percent_average,customer_name,elite_level_code,HT_seconds,ST_seconds
0,4667960400,2033123310,963118,2024-07-31 23:56:00,2024-08-01 00:03:00,2024-08-01 00:34:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:07:00,0 days 00:31:00,Voluntary Cancel,963118,neutral,angry,-0.04,0.39,Matthew Foster,4.0,1860.0,420.0
1,1122072124,8186702651,519057,2024-08-01 00:03:00,2024-08-01 00:06:00,2024-08-01 00:18:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:03:00,0 days 00:12:00,Booking,519057,calm,neutral,0.02,0.35,Tammy Walters,,720.0,180.0
2,6834291559,2416856629,158319,2024-07-31 23:59:00,2024-08-01 00:07:00,2024-08-01 00:26:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:08:00,0 days 00:19:00,IRROPS,158319,neutral,polite,-0.13,0.32,Jeffery Dixon,,1140.0,480.0
3,2266439882,1154544516,488324,2024-08-01 00:05:00,2024-08-01 00:10:00,2024-08-01 00:17:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:05:00,0 days 00:07:00,Upgrade,488324,neutral,frustrated,-0.2,0.2,David Wilkins,2.0,420.0,300.0
4,1211603231,5214456437,721730,2024-08-01 00:04:00,2024-08-01 00:14:00,2024-08-01 00:23:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:10:00,0 days 00:09:00,Seating,721730,neutral,polite,-0.05,0.35,Elizabeth Daniels,0.0,540.0,600.0
5,5297766997,5590154991,817160,2024-08-01 00:11:00,2024-08-01 00:16:00,2024-08-01 00:40:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:05:00,0 days 00:24:00,Mileage Plus,817160,calm,calm,0.1,0.11,Emily Alexander,5.0,1440.0,300.0
6,324593040,6774865122,519057,2024-08-01 00:08:00,2024-08-01 00:21:00,2024-08-01 00:34:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:13:00,0 days 00:13:00,Checkout,519057,neutral,frustrated,0.0,0.25,James Lawrence,,780.0,780.0
7,8902603117,7974326984,488324,2024-08-01 00:13:00,2024-08-01 00:21:00,2024-08-01 00:29:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:08:00,0 days 00:08:00,Mileage Plus,488324,calm,angry,0.11,0.35,Megan Craig,2.0,480.0,480.0
8,7222687732,8023417234,957331,2024-08-01 00:14:00,2024-08-01 00:22:00,2024-08-01 00:35:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:08:00,0 days 00:13:00,Mileage Plus,957331,calm,neutral,0.06,0.54,Alexa Martinez,1.0,780.0,480.0
9,4113684017,1528835057,158319,2024-08-01 00:20:00,2024-08-01 00:28:00,2024-08-01 00:37:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:08:00,0 days 00:09:00,IRROPS,158319,neutral,angry,-0.19,0.29,Jonathan Parker,,540.0,480.0


In [None]:
X.head(10)

Unnamed: 0,call_id,customer_id,agent_id_x,call_start_datetime,agent_assigned_datetime,call_end_datetime,call_transcript,ST,HT,agent_id_y,agent_tone,customer_tone,average_sentiment,silence_percent_average,customer_name,elite_level_code,HT_seconds,ST_seconds
0,4667960400,2033123310,963118,2024-07-31 23:56:00,2024-08-01 00:03:00,2024-08-01 00:34:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:07:00,0 days 00:31:00,963118,neutral,angry,-0.04,0.39,Matthew Foster,4.0,1860.0,420.0
1,1122072124,8186702651,519057,2024-08-01 00:03:00,2024-08-01 00:06:00,2024-08-01 00:18:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:03:00,0 days 00:12:00,519057,calm,neutral,0.02,0.35,Tammy Walters,,720.0,180.0
2,6834291559,2416856629,158319,2024-07-31 23:59:00,2024-08-01 00:07:00,2024-08-01 00:26:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:08:00,0 days 00:19:00,158319,neutral,polite,-0.13,0.32,Jeffery Dixon,,1140.0,480.0
3,2266439882,1154544516,488324,2024-08-01 00:05:00,2024-08-01 00:10:00,2024-08-01 00:17:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:05:00,0 days 00:07:00,488324,neutral,frustrated,-0.2,0.2,David Wilkins,2.0,420.0,300.0
4,1211603231,5214456437,721730,2024-08-01 00:04:00,2024-08-01 00:14:00,2024-08-01 00:23:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:10:00,0 days 00:09:00,721730,neutral,polite,-0.05,0.35,Elizabeth Daniels,0.0,540.0,600.0
5,5297766997,5590154991,817160,2024-08-01 00:11:00,2024-08-01 00:16:00,2024-08-01 00:40:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:05:00,0 days 00:24:00,817160,calm,calm,0.1,0.11,Emily Alexander,5.0,1440.0,300.0
6,324593040,6774865122,519057,2024-08-01 00:08:00,2024-08-01 00:21:00,2024-08-01 00:34:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:13:00,0 days 00:13:00,519057,neutral,frustrated,0.0,0.25,James Lawrence,,780.0,780.0
7,8902603117,7974326984,488324,2024-08-01 00:13:00,2024-08-01 00:21:00,2024-08-01 00:29:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:08:00,0 days 00:08:00,488324,calm,angry,0.11,0.35,Megan Craig,2.0,480.0,480.0
8,7222687732,8023417234,957331,2024-08-01 00:14:00,2024-08-01 00:22:00,2024-08-01 00:35:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:08:00,0 days 00:13:00,957331,calm,neutral,0.06,0.54,Alexa Martinez,1.0,780.0,480.0
9,4113684017,1528835057,158319,2024-08-01 00:20:00,2024-08-01 00:28:00,2024-08-01 00:37:00,\n\nAgent: Thank you for calling United Airlin...,0 days 00:08:00,0 days 00:09:00,158319,neutral,angry,-0.19,0.29,Jonathan Parker,,540.0,480.0


In [None]:
X['elite_level_code'] = X['elite_level_code'].fillna(-1)
print(X['elite_level_code'].unique())
required = ['call_start_datetime','HT_seconds','ST_seconds','customer_tone','agent_tone','average_sentiment','silence_percent_average','elite_level_code']
X = X[required]

[ 4. -1.  2.  0.  5.  1.  3.]


# **Splitting Test and train data**

In [None]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [None]:
# X_train.head(10)
print(X_train['agent_tone'].isnull().sum())
print(X_train['customer_tone'].isnull().sum())
print(X_train['average_sentiment'].isnull().sum())
print(X_train['silence_percent_average'].isnull().sum())

166
0
84
0


In [None]:
numerical_features = ['average_sentiment', 'silence_percent_average', 'HT_seconds', 'ST_seconds','elite_level_code']
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [None]:
# Preprocess categorical features
categorical_features = ['agent_tone', 'customer_tone']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# Combine preprocessing steps

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
        ('time_features', TimeFeaturesExtractor(), ['call_start_datetime']),
        ('ratio_features', RatioFeaturesExtractor(), ['HT_seconds', 'ST_seconds']),

        ('tone_encoder', ToneEncoder(), ['customer_tone', 'agent_tone'])
    ])

In [None]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [None]:
X_train.head(10)

Unnamed: 0,call_start_datetime,HT_seconds,ST_seconds,customer_tone,agent_tone,average_sentiment,silence_percent_average,elite_level_code
15425,2024-08-07 11:57:00,660.0,540.0,calm,neutral,-0.03,0.55,3.0
45701,2024-08-19 18:02:00,360.0,600.0,angry,neutral,-0.16,0.56,-1.0
45470,2024-08-19 16:03:00,420.0,300.0,frustrated,neutral,-0.09,0.28,-1.0
702,2024-08-01 13:42:00,1800.0,420.0,calm,neutral,-0.03,0.62,2.0
13294,2024-08-05 17:14:00,840.0,300.0,angry,neutral,-0.05,0.26,4.0
26262,2024-08-11 13:36:00,420.0,600.0,calm,calm,0.2,0.32,0.0
47713,2024-08-21 11:43:00,660.0,360.0,calm,calm,0.06,0.18,1.0
71332,2024-08-31 18:26:00,240.0,540.0,frustrated,neutral,0.0,0.17,2.0
42947,2024-08-18 15:25:00,120.0,420.0,angry,neutral,-0.04,0.0,-1.0
24202,2024-08-11 07:19:00,120.0,480.0,angry,neutral,0.0,0.06,0.0


In [None]:
# Train the pipeline
pipeline.fit(X_train, y_train)



# **DeBugging Transformers**

In [None]:
# Debugging steps:
X_train_transformed = preprocessor.fit_transform(X_train)

# Check for NaNs in the transformed data
print("NaNs in transformed X_train:", np.isnan(X_train_transformed).sum())

# Check for NaNs in y_train
print("NaNs in y_train:", y_train.isnull().sum()) # Convert to boolean

# If NaNs are found in X_train_transformed:
# 1. Identify the columns with NaNs:
y_train
nan_cols = np.where(np.isnan(X_train_transformed).any(axis=0))[0]
print("Columns with NaNs:", nan_cols)
X_train_transformed

NaNs in transformed X_train: 0
NaNs in y_train: primary_call_reason    0
dtype: int64
Columns with NaNs: []


array([[ 0.0206329 ,  1.37706716, -0.04667909, ...,  1.22222222,
         1.        ,  0.        ],
       [-0.88239055,  1.42918236, -0.43606142, ...,  0.6       ,
        -2.        ,  0.        ],
       [-0.39614716, -0.03004346, -0.35818496, ...,  1.4       ,
        -1.        ,  0.        ],
       ...,
       [ 0.22902292, -0.49908033, -0.59181435, ...,  0.36363636,
         2.        ,  0.        ],
       [ 0.71526632, -0.23850429, -0.59181435, ...,  1.        ,
         2.        ,  1.        ],
       [-0.67400053, -1.48926928, -0.82544375, ...,  0.25      ,
         2.        ,  0.        ]])

In [None]:
X_train_transformed[0]


array([ 0.0206329 ,  1.37706716, -0.04667909,  0.68044736,  1.59693757,
        0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        7.        , 11.        ,  2.        ,  0.        ,  1.22222222,
        1.        ,  0.        ])

In [None]:
# Test the pipeline
y_pred = pipeline.predict(X_test)

# Display the predictions
print("Test Predictions:", y_pred)

Test Predictions: ['Seating' 'IRROPS' 'Checkout' ... 'IRROPS' 'IRROPS' 'IRROPS']


In [None]:
y_ = pipeline.predict(X_test)

In [None]:
print(print('pred',len(X_train)))
print('pred',len(y_t))
print(len(y_test))

pred 50267
None
pred 50267
21543


In [None]:
# Compare with y_test
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")



Accuracy: 0.36
Precision: 0.30
Recall: 0.36
F1 Score: 0.31


In [None]:
pipelineKN = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors=5))  # Starting with k=5
])

In [None]:
 #Train the pipeline
pipelineKN.fit(X_train, y_train)

 #Make predictions
y_pred = pipelineKN.predict(X_test)

#  Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Use 'weighted' for multi-class
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

#  Display the results
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.20
Precision: 0.18
Recall: 0.20
F1 Score: 0.18


In [None]:
# Define the parameter grid for KNN
param_grid = {'classifier__n_neighbors': range(20,70)}  # Using 'classifier__' to specify the parameter for the KNN step

# Perform GridSearch to find the best k
grid_search = GridSearchCV(pipelineKN, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)


In [26]:
grid_search.best_params_

{'classifier__n_neighbors': 61}

In [27]:
pipelineKN = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors=61))  # Testing with Grid search params n = 61
])

In [28]:
#Train the pipeline
pipelineKN.fit(X_train, y_train)

 #Make predictions
y_pred = pipelineKN.predict(X_test)

#  Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Use 'weighted' for multi-class
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

#  Display the results
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.27
Precision: 0.23
Recall: 0.27
F1 Score: 0.18


In [None]:
X['elite_level_code'] = X['elite_level_code'].fillna(-1)
print(X['elite_level_code'].unique())
required = ['call_start_datetime','HT_seconds','ST_seconds','customer_tone','agent_tone','average_sentiment','silence_percent_average','elite_level_code']
X = X[required]

In [38]:
combined.head(10)
input = pd.merge(test, combined, on='call_id', how='inner')
len(input)
input['elite_level_code'] = input['elite_level_code'].fillna(-1)
required = ['call_start_datetime','HT_seconds','ST_seconds','customer_tone','agent_tone','average_sentiment','silence_percent_average','elite_level_code']
input = input[required]
len(input)
# test.head(10)

5157

In [39]:
input.head(10)

Unnamed: 0,call_start_datetime,HT_seconds,ST_seconds,customer_tone,agent_tone,average_sentiment,silence_percent_average,elite_level_code
0,2024-08-01 00:23:00,3480.0,660.0,neutral,neutral,-0.06,0.58,-1.0
1,2024-08-01 01:33:00,780.0,480.0,angry,calm,0.01,0.4,0.0
2,2024-08-01 02:17:00,120.0,600.0,calm,neutral,0.0,0.49,-1.0
3,2024-08-01 02:49:00,240.0,240.0,angry,calm,0.02,0.28,-1.0
4,2024-08-01 02:49:00,240.0,360.0,frustrated,neutral,-0.12,0.12,3.0
5,2024-08-01 03:19:00,300.0,600.0,frustrated,calm,0.11,0.37,3.0
6,2024-08-01 04:17:00,240.0,240.0,calm,neutral,-0.09,0.34,-1.0
7,2024-08-01 04:35:00,300.0,420.0,neutral,neutral,0.0,0.08,2.0
8,2024-08-01 05:03:00,1140.0,300.0,angry,calm,0.01,0.4,3.0
9,2024-08-01 05:52:00,60.0,300.0,angry,neutral,0.0,0.18,-1.0


After Testing multiple classification models we realise RandomForestClassifier offers highest accuracy

In [40]:
y_ = pipeline.predict(input)

In [41]:
test['predictions'] = y_

In [42]:
test.head(10)

Unnamed: 0,call_id,predictions
0,7732610078,Unknown
1,2400299738,Voluntary Change
2,6533095063,Unknown
3,7774450920,Unknown
4,9214147168,Unknown
5,2931134074,Unknown
6,2010588624,Unknown
7,6611494442,Mileage Plus
8,8655666048,Unknown
9,3564872843,Schedule Change


In [43]:
test.to_csv('test.csv', index=False)