In [1]:
import os
import logging
import pandas as pd

# Configure file-based logging for the whole pipeline.
# exist_ok=True creates the directory only when it is missing, so no separate
# existence check is needed and there is no window between check and create.
os.makedirs("logs", exist_ok=True)

# Note: logging.basicConfig() does nothing when the root logger already has
# handlers, so re-running this cell keeps the first configuration.
logging.basicConfig(
    filename='logs/pipeline.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

In [2]:
# Record the step in the pipeline log before the helper module is loaded.
logging.info("Creating object for DPDClass")
# Import the project-local DPDClass module.
# NOTE(review): this import sits after the logging setup cell rather than with
# the other imports; presumably so it runs once logging is configured - confirm.
import DPDClass
# Shared helper instance: later cells call its ingest_validate_data,
# merge_dataframes, feature_engg and slack_summary_message methods.
dpd = DPDClass.DPDClass()

1. Data Ingestion and Validation

In [3]:
# Run dpd.ingest_validate_data on call_logs.csv, passing the key columns it
# should use for this file.
# NOTE(review): this file has 500 rows while the later per-agent summary has
# only 20, so agent_id/org_id/call_date do not look unique per call; call_id
# looks like the row identifier - confirm how primary_key_cols is used.
call_logs = dpd.ingest_validate_data(
    data_path="datas/call_logs.csv",
    primary_key_cols=["agent_id", "org_id", "call_date"],
)

# Print column dtypes and non-null counts of the loaded frame.
call_logs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   call_id         500 non-null    object 
 1   agent_id        500 non-null    object 
 2   org_id          500 non-null    object 
 3   installment_id  500 non-null    object 
 4   status          500 non-null    object 
 5   duration        500 non-null    float64
 6   created_ts      500 non-null    object 
 7   call_date       500 non-null    object 
dtypes: float64(1), object(7)
memory usage: 31.4+ KB


In [4]:
# Run dpd.ingest_validate_data on agent_roster.csv, keyed on agent and
# organisation.
agent_roster = dpd.ingest_validate_data(
    data_path="datas/agent_roster.csv",
    primary_key_cols=["agent_id", "org_id"],
)

# Print column dtypes and non-null counts of the loaded frame.
agent_roster.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   agent_id               20 non-null     object
 1   users_first_name       20 non-null     object
 2   users_last_name        20 non-null     object
 3   users_office_location  20 non-null     object
 4   org_id                 20 non-null     object
dtypes: object(5)
memory usage: 932.0+ bytes


In [5]:
# Run dpd.ingest_validate_data on disposition_summary.csv, keyed on agent,
# organisation and call date.
disposition_summary = dpd.ingest_validate_data(
    data_path="datas/disposition_summary.csv",
    primary_key_cols=["agent_id", "org_id", "call_date"],
)

# Print column dtypes and non-null counts of the loaded frame.
disposition_summary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   agent_id    20 non-null     object
 1   org_id      20 non-null     object
 2   call_date   20 non-null     object
 3   login_time  17 non-null     object
dtypes: object(4)
memory usage: 772.0+ bytes


2. Join Logic

In [6]:
# Enrich every call record with roster details, then with the per-agent,
# per-day disposition data. Both merges are requested with mtype='left'
# (presumably a pandas-style left join - confirm in DPDClass).

logging.info("Merging dataframes call_logs, agent_roster")
merged_data = dpd.merge_dataframes(call_logs,agent_roster,on_cols=['agent_id', 'org_id'], mtype='left')

logging.info("Merging dataframes previously merged with disposition_summary")
merged_data = dpd.merge_dataframes(merged_data,disposition_summary,on_cols=['agent_id', 'org_id', 'call_date'], mtype='left')

# Guard against join fan-out: if either right-hand table had duplicate keys,
# call rows would be duplicated and the per-agent counts computed from
# merged_data would be inflated without any error.
assert len(merged_data) == len(call_logs), (
    f"Join changed the row count: {len(call_logs)} call rows in, "
    f"{len(merged_data)} rows out - check for duplicate keys in "
    "agent_roster / disposition_summary"
)
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   call_id                500 non-null    object 
 1   agent_id               500 non-null    object 
 2   org_id                 500 non-null    object 
 3   installment_id         500 non-null    object 
 4   status                 500 non-null    object 
 5   duration               500 non-null    float64
 6   created_ts             500 non-null    object 
 7   call_date              500 non-null    object 
 8   users_first_name       500 non-null    object 
 9   users_last_name        500 non-null    object 
 10  users_office_location  500 non-null    object 
 11  login_time             416 non-null    object 
dtypes: float64(1), object(11)
memory usage: 47.0+ KB


3. Feature Engineering

In [7]:
# Build the per-agent performance summary from the merged call data.

agent_performance_summary = dpd.feature_engg(merged_data)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the cell output.
agent_performance_summary.info()
# Last expression of the cell: rendered as the preview of the first rows.
agent_performance_summary.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   agent_id          20 non-null     object 
 1   users_first_name  20 non-null     object 
 2   users_last_name   20 non-null     object 
 3   call_date         20 non-null     object 
 4   total_calls       20 non-null     int64  
 5   unique_loans      20 non-null     int64  
 6   completed_calls   20 non-null     int64  
 7   avg_duration_min  20 non-null     float64
 8   presence          20 non-null     int32  
 9   connect_rate      20 non-null     float64
dtypes: float64(2), int32(1), int64(3), object(4)
memory usage: 1.6+ KB
None


Unnamed: 0,agent_id,users_first_name,users_last_name,call_date,total_calls,unique_loans,completed_calls,avg_duration_min,presence,connect_rate
0,A001,AgentFirst1,AgentLast1,2025-04-28,20,20,2,0.11,1,0.1
1,A002,AgentFirst2,AgentLast2,2025-04-28,23,23,3,0.13,1,0.13
2,A003,AgentFirst3,AgentLast3,2025-04-28,21,21,8,0.12,1,0.38
3,A004,AgentFirst4,AgentLast4,2025-04-28,27,27,4,0.13,1,0.15
4,A005,AgentFirst5,AgentLast5,2025-04-28,29,28,4,0.12,0,0.14


4. Output

In [8]:
# Write the per-agent summary to a CSV report and record that in the log.

# Single definition of the output path, so the file written and the path
# reported in the log cannot drift apart.
report_path = 'result/agent_performance_summary.csv'

# exist_ok=True creates the directory only when missing, replacing the
# separate existence check.
os.makedirs(os.path.dirname(report_path), exist_ok=True)
agent_performance_summary.to_csv(report_path, index=False)
# Lazy %-style logging arguments; the emitted message text is unchanged.
logging.info("Saved csv report to '%s'", report_path)

In [12]:
# Build the Slack summary text for the largest call_date in the report and
# print it.
# NOTE(review): call_date is an object (string) column in the summary, so
# max() compares text; that matches the latest date only while dates stay in
# ISO YYYY-MM-DD form, as in the sample rows - confirm upstream format.
summary_message = dpd.slack_summary_message(
    agent_performance_summary,
    agent_performance_summary["call_date"].max(),
)
print(summary_message)

Agent Summary for 2025-04-28
Top Performer AgentFirst3 AgentLast3 (38% connect rate)
Total Active Agents: 17
Average Duration: 0.12
