In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Steps 1-2: Read each raw CSV export and, in the same statement, discard
# rows that contain any null value. `customers` is kept as loaded (no rows
# are dropped from it).
calls = pd.read_csv('calls.csv').dropna()
customers = pd.read_csv('customers.csv')
reasons = pd.read_csv('reason.csv').dropna()
sentiment_statistics = pd.read_csv('sentiment_statistics.csv').dropna()

In [3]:
import pandas as pd
import re

# Function to clean spaces
def clean_spaces(text):
    """Trim a string and squeeze every internal whitespace run to one space.

    Values that are not `str` instances (NaN, numbers, None, ...) are
    returned unchanged.
    """
    if not isinstance(text, str):
        return text
    # Strip the ends first, then collapse whatever whitespace runs remain.
    return re.sub(r'\s+', ' ', text.strip())

# Spelling variants of the same call reason -> the canonical label to keep
mapping = {
    'Check In': 'Check-In',
    'Post Flight': 'Post-Flight',
    'Products & Services': 'Products and Services',
}

# Tidy the whitespace of every reason first, then fold the variants listed
# in `mapping` onto their canonical label (exact, whole-value matches only).
reasons['primary_call_reason'] = (
    reasons['primary_call_reason']
    .apply(clean_spaces)
    .replace(mapping)
)

# Show the cleaned table
print("Cleaned Primary Call Reasons:")
print(reasons)

Cleaned Primary Call Reasons:
          call_id primary_call_reason
0      4667960400    Voluntary Cancel
1      1122072124             Booking
2      6834291559              IRROPS
3      2266439882             Upgrade
4      1211603231             Seating
...           ...                 ...
66648  7569738090        Mileage Plus
66649  1563273072         Post-Flight
66650  8865997781             Upgrade
66651  8019240181             Upgrade
66652  8210720833     Digital Support

[66653 rows x 2 columns]


In [None]:
# Steps 3-5: Build the modelling table in one chain.
#   * inner-join calls -> reasons -> sentiment on 'call_id', then customers
#     on 'customer_id' (rows without a match in any table are dropped);
#   * drop rows lacking a target label or a transcript;
#   * parse both timestamps and derive the call length (seconds) and the
#     hour of day at which the call started.
merged_data = (
    calls
    .merge(reasons, on='call_id')
    .merge(sentiment_statistics, on='call_id')
    .merge(customers, on='customer_id')
    .dropna(subset=['primary_call_reason', 'call_transcript'])
    .assign(
        call_start_datetime=lambda df: pd.to_datetime(df['call_start_datetime']),
        call_end_datetime=lambda df: pd.to_datetime(df['call_end_datetime']),
        # The lambdas below see the already-parsed timestamp columns because
        # `assign` evaluates its keyword arguments in order.
        call_duration=lambda df: (
            df['call_end_datetime'] - df['call_start_datetime']
        ).dt.total_seconds(),
        call_hour=lambda df: df['call_start_datetime'].dt.hour,
    )
)

def get_time_of_day(hour):
    """Bucket an hour-of-day value into a named part of the day.

    Half-open ranges: [5, 12) -> 'morning', [12, 17) -> 'afternoon',
    [17, 21) -> 'evening'. Any value that falls in none of them
    (including NaN, for which every comparison is False) -> 'night'.
    """
    day_parts = (
        (5, 12, 'morning'),
        (12, 17, 'afternoon'),
        (17, 21, 'evening'),
    )
    for start, stop, label in day_parts:
        if start <= hour < stop:
            return label
    return 'night'

merged_data['time_of_day'] = merged_data['call_hour'].apply(get_time_of_day)

# Step 6: Label Encoding for categorical features
# A separate encoder is fitted per column and kept in `label_encoders`, so the
# label -> integer mapping used for training is not lost when the next column
# is encoded (refitting one shared encoder discards the earlier mappings).
label_encoders = {}
for column in ('customer_tone', 'agent_tone', 'time_of_day'):
    label_encoder = LabelEncoder()
    merged_data[column + '_encoded'] = label_encoder.fit_transform(merged_data[column])
    label_encoders[column] = label_encoder
# `label_encoder` is left bound to the 'time_of_day' encoder, as before.

# Step 7: Text Analysis (using TF-IDF for call transcripts)
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split below, so the held-out rows influence the vocabulary and IDF weights.
tfidf = TfidfVectorizer(max_features=100, stop_words='english')
transcript_tfidf = tfidf.fit_transform(merged_data['call_transcript'])

# Reuse merged_data's index: pd.concat(axis=1) aligns on index labels, so a
# default 0..n-1 index would misalign rows (and create all-NaN rows) whenever
# merged_data's index has gaps, e.g. after rows were dropped.
transcript_df = pd.DataFrame(
    transcript_tfidf.toarray(),
    columns=tfidf.get_feature_names_out(),
    index=merged_data.index,
)
merged_data = pd.concat([merged_data, transcript_df], axis=1)

# Step 8: Model Training and Predictive Analysis
features = [
    'call_duration',
    'silence_percent_average',
    'customer_tone_encoded',
    'agent_tone_encoded',
    'elite_level_code',
    'average_sentiment',
] + list(transcript_df.columns)

# Fill any missing values in the feature columns with 0 (as a fallback step for model compatibility)
merged_data[features] = merged_data[features].fillna(0)

X = merged_data[features]
y = merged_data['primary_call_reason']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest model; the seed is fixed so that the reported metrics
# (and any predictions made later with `model`) are reproducible across runs.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Step 9: Model Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# Step 1: Read the lookup tables that carry the features for each call
calls_data = pd.read_csv('calls.csv')  # Contains call-related features
customers_data = pd.read_csv('customers.csv')  # Contains customer-related features
reasons_data = pd.read_csv('reason.csv')  # Contains primary_call_reason
sentiment_data = pd.read_csv('sentiment_statistics.csv')  # Sentiment related features

# Steps 2-3: Start from test.csv (per the original note it holds only call_id),
# left-join the lookup tables so every test row is kept even when a lookup has
# no match, then derive the same datetime features as for the training data.
test_data = (
    pd.read_csv('test.csv')
    .merge(calls_data, on='call_id', how='left')
    .merge(customers_data, on='customer_id', how='left')
    .merge(sentiment_data, on='call_id', how='left')
    .assign(
        call_start_datetime=lambda df: pd.to_datetime(df['call_start_datetime']),
        call_end_datetime=lambda df: pd.to_datetime(df['call_end_datetime']),
        # Evaluated after the two parses above, so these operate on datetimes.
        call_duration=lambda df: (
            df['call_end_datetime'] - df['call_start_datetime']
        ).dt.total_seconds(),
        call_hour=lambda df: df['call_start_datetime'].dt.hour,
    )
)

def get_time_of_day(hour):
    """Return the part of the day ('morning', 'afternoon', 'evening' or
    'night') that an hour-of-day value belongs to.

    Daytime spans [5, 21): morning ends at 12, afternoon at 17. Everything
    outside that span is 'night'; NaN also lands there because all of its
    comparisons evaluate to False.
    """
    if not (5 <= hour < 21):
        return 'night'
    if hour < 12:
        return 'morning'
    return 'afternoon' if hour < 17 else 'evening'

test_data['time_of_day'] = test_data['call_hour'].apply(get_time_of_day)

# Step 4: Encode the categorical columns with the training label -> integer mapping.
# Fitting the encoder on the test rows would assign codes from whatever labels
# happen to occur there (plus a placeholder for missing values), so the same
# tone could get a different integer than the one the model was trained on.
# Instead the mapping is rebuilt from the training column in `merged_data`;
# labels that are missing or were never seen in training are encoded as -1.
for column in ('customer_tone', 'agent_tone', 'time_of_day'):
    label_encoder = LabelEncoder().fit(merged_data[column])
    code_by_label = {label: code for code, label in enumerate(label_encoder.classes_)}
    test_data[column + '_encoded'] = (
        test_data[column].map(code_by_label).fillna(-1).astype(int)
    )

# Step 5: TF-IDF transformation for the call transcript
# Reuse the `tfidf` vectorizer already fitted in the training cell, so the
# vocabulary and IDF weights are exactly the ones the model was trained with.
test_transcript_tfidf = tfidf.transform(test_data['call_transcript'].fillna(''))
test_transcript_df = pd.DataFrame(
    test_transcript_tfidf.toarray(),
    columns=tfidf.get_feature_names_out(),
    index=test_data.index,  # keep rows aligned for the label-based concat below
)

# Step 6: Concatenate the test TF-IDF features with the original test DataFrame
test_data = pd.concat([test_data, test_transcript_df], axis=1)

# Step 7: Select the same feature columns, in the same order, as the training set
features = [
    'call_duration',
    'silence_percent_average',
    'customer_tone_encoded',
    'agent_tone_encoded',
    'elite_level_code',
    'average_sentiment',
] + list(transcript_df.columns)

# Test rows without a match in the joined tables have NaN features; fall back to 0
test_data[features] = test_data[features].fillna(0)

# Step 8: Predict the primary call reason for test data
test_predictions = model.predict(test_data[features])

# Step 9: Save the predictions in the required format
output = pd.DataFrame({'call_id': test_data['call_id'], 'primary_call_reason': test_predictions})

# Save to CSV
output.to_csv('test_predictions.csv', index=False)
