In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
# Load the cleaned master call table; the path is relative to the notebook's working directory.
final_master_df = pd.read_csv("final_master_cleaned.csv")

In [3]:
# Preview the first rows to confirm the columns loaded as expected.
final_master_df.head()

Unnamed: 0,call_id,customer_id,agent_id,call_start_datetime,agent_assigned_datetime,call_end_datetime,call_transcript,customer_name,elite_level_code,primary_call_reason,agent_tone,customer_tone,average_sentiment,silence_percent_average,handle_time,speed_to_answer
0,4667960400,2033123310,963118,2024-07-31 23:56:00,2024-08-01 00:03:00,2024-08-01 00:34:00,agent: thank you for calling united airlines c...,matthew foster,4.0,voluntary cancel,neutral,angry,-0.04,0.39,1860.0,420.0
1,1122072124,8186702651,519057,2024-08-01 00:03:00,2024-08-01 00:06:00,2024-08-01 00:18:00,"agent: thank you for calling united airlines, ...",tammy walters,1.0,booking,calm,neutral,0.02,0.35,720.0,180.0
2,6834291559,2416856629,158319,2024-07-31 23:59:00,2024-08-01 00:07:00,2024-08-01 00:26:00,agent: thank you for calling united airlines c...,jeffery dixon,1.0,irrops,neutral,polite,-0.13,0.32,1140.0,480.0
3,2266439882,1154544516,488324,2024-08-01 00:05:00,2024-08-01 00:10:00,2024-08-01 00:17:00,agent: thank you for calling united airlines c...,david wilkins,2.0,upgrade,neutral,frustrated,-0.2,0.2,420.0,300.0
4,1211603231,5214456437,721730,2024-08-01 00:04:00,2024-08-01 00:14:00,2024-08-01 00:23:00,agent: thank you for calling united airlines c...,elizabeth daniels,0.0,seating,neutral,polite,-0.05,0.35,540.0,600.0


In [4]:
# Cleaning function
def clean_text(text):
    """Normalise a transcript to lowercase letters and whitespace only.

    Every character that is not an ASCII letter or whitespace (digits,
    punctuation, symbols) is removed, then the result is lowercased.

    Parameters
    ----------
    text : str or missing value
        Raw transcript. ``None``, ``NaN`` and ``pd.NA`` are treated as an
        empty transcript; any other non-string scalar is converted with ``str``.

    Returns
    -------
    str
        The cleaned text; ``''`` for missing input.
    """
    if not isinstance(text, str):
        # Missing transcripts show up as None/NaN after a CSV load or a left
        # merge; re.sub would raise TypeError on them.
        if pd.isna(text):
            return ''
        text = str(text)
    # Characters are deleted rather than replaced by a space, so "check-in"
    # becomes "checkin".
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    return letters_only.lower()

In [5]:
# Derive the model input column by cleaning every raw transcript element-wise.
final_master_df['cleaned_transcript'] = final_master_df['call_transcript'].map(clean_text)

In [6]:
# Preview again to check the new cleaned_transcript column next to the raw transcript.
final_master_df.head()

Unnamed: 0,call_id,customer_id,agent_id,call_start_datetime,agent_assigned_datetime,call_end_datetime,call_transcript,customer_name,elite_level_code,primary_call_reason,agent_tone,customer_tone,average_sentiment,silence_percent_average,handle_time,speed_to_answer,cleaned_transcript
0,4667960400,2033123310,963118,2024-07-31 23:56:00,2024-08-01 00:03:00,2024-08-01 00:34:00,agent: thank you for calling united airlines c...,matthew foster,4.0,voluntary cancel,neutral,angry,-0.04,0.39,1860.0,420.0,agent thank you for calling united airlines cu...
1,1122072124,8186702651,519057,2024-08-01 00:03:00,2024-08-01 00:06:00,2024-08-01 00:18:00,"agent: thank you for calling united airlines, ...",tammy walters,1.0,booking,calm,neutral,0.02,0.35,720.0,180.0,agent thank you for calling united airlines my...
2,6834291559,2416856629,158319,2024-07-31 23:59:00,2024-08-01 00:07:00,2024-08-01 00:26:00,agent: thank you for calling united airlines c...,jeffery dixon,1.0,irrops,neutral,polite,-0.13,0.32,1140.0,480.0,agent thank you for calling united airlines cu...
3,2266439882,1154544516,488324,2024-08-01 00:05:00,2024-08-01 00:10:00,2024-08-01 00:17:00,agent: thank you for calling united airlines c...,david wilkins,2.0,upgrade,neutral,frustrated,-0.2,0.2,420.0,300.0,agent thank you for calling united airlines cu...
4,1211603231,5214456437,721730,2024-08-01 00:04:00,2024-08-01 00:14:00,2024-08-01 00:23:00,agent: thank you for calling united airlines c...,elizabeth daniels,0.0,seating,neutral,polite,-0.05,0.35,540.0,600.0,agent thank you for calling united airlines cu...


In [7]:
# Define additional stop words
additional_stopwords = [
    'thank', 'calling', 'united', 'airlines', 'customer', 'service', 'name',
    'help', 'today', 'welcome', 'appreciate', 'business', 'flying', 'hi',
    'yeah', 'okay', 'alright', 'ugh', 'sigh', 'pause', 'typing', 'noises',
    'really', 'hopefully', 'um', 'hmm', 'understand', 'look', 'check',
    'see', 'take', 'make', 'get', 'give', 'switch', 'hear', 'think',
    'frustrating', 'ridiculous', 'unfortunately', 'problem', 'hoping',
    'ideal', 'able'
]
extra_stopwords = ['calling', 'typing', 'okay', 'thanks', 'youre', 'yeah', 'really', 'work', 'day', 'need']

# The NLTK stop-word corpus must exist locally; fetch it once on a fresh
# environment so the notebook survives Restart & Run All.
try:
    english_stopwords = stopwords.words('english')
except LookupError:
    import nltk
    nltk.download('stopwords', quiet=True)
    english_stopwords = stopwords.words('english')

# Combine all stop words into a single list.
# The transcripts have had punctuation stripped by clean_text, so a stop word
# such as "don't" can never match the corpus token "dont". Passing every stop
# word through the same cleaning keeps the list consistent with the text being
# vectorized; the set removes the duplicates shared by the lists.
total_stopwords = sorted({
    cleaned_word
    for cleaned_word in map(
        clean_text,
        additional_stopwords + extra_stopwords + list(english_stopwords),
    )
    if cleaned_word
})

# Vectorizing the cleaned transcripts with combined stop words
vectorizer = TfidfVectorizer(stop_words=total_stopwords, max_features=5000)  # Limit to top 5000 features
X = vectorizer.fit_transform(final_master_df['cleaned_transcript'])

In [8]:
# Target: the primary call reason label of each call (string categories such as 'booking', 'irrops').
y = final_master_df['primary_call_reason']

In [9]:
# Step 3: Train-Test Split
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so the split is repeatable.
# NOTE(review): no stratify= is passed, so rare call reasons may be unevenly
# represented across the two splits — confirm whether stratifying on y is feasible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Install xgboost into the running kernel's environment; it is imported in the next cell.
# NOTE(review): the version is unpinned — consider pinning it for reproducibility.
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [11]:
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance classes.
# Only the training split is oversampled, so the held-out rows stay untouched.
# NOTE(review): k_neighbors=1 is presumably needed because some call reasons
# have very few training rows — confirm against the class counts.
smote = SMOTE(random_state=41, k_neighbors=1)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Convert string labels to numeric using LabelEncoder (XGBoost expects integer class ids).
# transform() raises ValueError if y_test contains a label absent from the training split.
label_encoder = LabelEncoder()
y_resampled_encoded = label_encoder.fit_transform(y_resampled)
y_test_encoded = label_encoder.transform(y_test)

# Create XGBoost classifier
xgb_clf = xgb.XGBClassifier(
    objective='multi:softmax',  # Multiclass classification
    eval_metric='mlogloss',     # Multiclass log-loss as the evaluation metric
    random_state=42,
    n_estimators=50
)

# Train XGBoost classifier on resampled data
xgb_clf.fit(X_resampled, y_resampled_encoded)

# Predict on test data
y_pred = xgb_clf.predict(X_test)

# Convert predictions back to original labels
y_pred_labels = label_encoder.inverse_transform(y_pred)

# Evaluate on the held-out split. Previously the predictions were computed but
# never compared with y_test, so model quality was unknown. zero_division=0
# silences the warning for classes the model never predicts.
print(classification_report(y_test, y_pred_labels, zero_division=0))


In [12]:
# Test file: holds only the call_id of each call to be scored.
test_data = pd.read_csv("testbc7185d.csv")

# Master table: carries the transcript and other columns for every call_id.
full_data = pd.read_csv("final_master_cleaned.csv")

# Left join so every test call_id is kept, picking up its columns from the master table.
test_features = pd.merge(test_data, full_data, how="left", on="call_id")

In [13]:
# Optional: Predict on test dataset
test_df = pd.read_csv('testbc7185d.csv')  # Load the test data

# Look transcripts up by call_id instead of relying on test_features having the
# same row order and length as test_df. A duplicated call_id in the master
# table would add rows to the merge and silently shift every transcript.
transcript_by_call_id = (
    test_features
    .drop_duplicates(subset='call_id')
    .set_index('call_id')['call_transcript']
)

# Call ids with no transcript come back as NaN; treat them as empty text so
# clean_text does not fail on a non-string value.
test_df['cleaned_transcript'] = (
    test_df['call_id']
    .map(transcript_by_call_id)
    .fillna('')
    .apply(clean_text)
)

# Reuse the vectorizer fitted on the training transcripts (transform only, no refit).
test_X = vectorizer.transform(test_df['cleaned_transcript'])

In [14]:
# Score the test matrix and map the integer class ids straight back to the
# original call-reason labels before storing them.
test_df['predicted_call_reason'] = label_encoder.inverse_transform(xgb_clf.predict(test_X))


In [15]:
# Display the scored test set: call_id, cleaned transcript and predicted call reason.
test_df

Unnamed: 0,call_id,cleaned_transcript,predicted_call_reason
0,7732610078,agent thank you for calling united airlines cu...,irrops
1,2400299738,agent thank you for calling united airlines my...,irrops
2,6533095063,agent thank you for calling united airlines cu...,irrops
3,7774450920,agent thank you for calling united airlines th...,irrops
4,9214147168,agent thank you for calling united airlines cu...,irrops
...,...,...,...
5152,5300201106,agent thank you for calling united airlines cu...,irrops
5153,727694488,agent thank you for calling united airlines my...,irrops
5154,147487837,agent thank you for calling united airlines cu...,irrops
5155,5330794838,agent thank you for calling united airlines my...,irrops


In [16]:
# Save predictions to CSV
# index=False leaves the DataFrame's row index out of the file.
test_df.to_csv('test_vansh_singh.csv', index=False)