In [11]:
# Standard library
import json
import re
import string
from datetime import datetime
from json import JSONDecodeError
from pathlib import Path

# Third-party
import numpy as np
import pandas as pd
from mutagen.mp3 import MP3

# Local helpers (star import kept as-is; the names it provides are not visible from this notebook)
from utils import *

In [20]:
from pathlib import Path
from mutagen.mp3 import MP3

# Root folder that is searched recursively for transcript (*.txt) files.
DATA_FOLDER = Path('../../../DATA/2022_1_4/round_2/processed_files/')

# One (recording stem, cleaned snippet, recording length) tuple per '06'/'07' occurrence.
transcripts_with_num = []
for file in DATA_FOLDER.rglob('*.txt'):
    # The recording sits next to the transcript; its name is the transcript
    # stem without the last "_"-separated token.
    recording_stem = '_'.join(file.stem.split('_')[:-1])
    audio_file_name = file.parent / f"{recording_stem}.mp3"
    audio = MP3(audio_file_name)
    audio_length = audio.info.length

    with open(file, "r", encoding="utf-8") as transcript_file:
        transcript = transcript_file.read()

    for prefix in ('06', '07'):
        index = transcript.find(prefix)
        while index != -1:
            # Window: 50 characters before the prefix, the 2-character prefix
            # itself, and 50 characters after it (clamped to the text bounds).
            window = transcript[max(0, index - 50):min(len(transcript), index + 52)]
            # Strip spaces and punctuation from the window.
            snippet = window.replace(" ", "")
            snippet = snippet.translate(str.maketrans('', '', string.punctuation))
            transcripts_with_num.append((str(audio_file_name.stem), snippet, audio_length))
            # Resume the search just past the prefix that was found.
            index = transcript.find(prefix, index + 2)

In [21]:
# Extract the first '06'/'07'-prefixed 10-digit number from a snippet.
def extract_phone_number(text):
    """Return the first ``06``/``07`` prefix followed by exactly 8 digits.

    Args:
        text: String to search.

    Returns:
        The 10-character match (prefix included), or the sentinel
        ``'NOT_TEL_NUMBER'`` when ``text`` contains no such sequence.
    """
    found = re.search(r'(06|07)(\d{8})', text)
    if found is None:
        return 'NOT_TEL_NUMBER'
    # group() with no argument is the whole match, i.e. prefix + 8 digits.
    return found.group()
# Swap each snippet for the phone number found in it (or the
# 'NOT_TEL_NUMBER' sentinel), keeping the recording stem and length.
processed_data_06_prefix = [
    (recording_stem, extract_phone_number(snippet), recording_length)
    for recording_stem, snippet, recording_length in transcripts_with_num
]

# Split an underscore-separated stem into a datetime and a channel label.
def convert_timestamp(timestamp_str):
    """Parse ``timestamp_str`` into ``(datetime, channel)``.

    The string is split on ``'_'``. The first six fields are read as
    year, month, day, hour, minute and second; the last field is returned
    unchanged as the channel.

    Raises:
        ValueError: if fewer than six fields are present or one of the
            first six is not an integer.
    """
    fields = timestamp_str.split('_')
    # Unpacking into exactly six names enforces the field count.
    year, month, day, hour, minute, second = (int(field) for field in fields[:6])
    moment = datetime(year, month, day, hour, minute, second)
    return moment, fields[-1]

# Expand each recording stem into (datetime, channel) and keep the extracted
# number and recording length alongside. convert_timestamp is called once per
# entry and its 2-tuple result is unpacked in place.
phone_numbers_list = [
    (*convert_timestamp(ts), num, dur)
    for ts, num, dur in processed_data_06_prefix
]
# Drop duplicate tuples while keeping first-seen order. A set would also
# de-duplicate, but its iteration order for string-containing tuples changes
# between interpreter runs (hash randomisation), which made downstream row
# order non-reproducible.
phone_numbers_list = list(dict.fromkeys(phone_numbers_list))

In [22]:
def format_phone_number(number):
    """Render a numeric value as a zero-padded string of at least 10 digits.

    Args:
        number: Numeric value, or a missing value as recognised by ``pd.isna``.

    Returns:
        ``np.nan`` for missing input; otherwise the number formatted with no
        decimals and left-padded with zeros to a minimum width of 10.
    """
    if not pd.isna(number):
        return '{:010.0f}'.format(number)
    return np.nan

# Telecom log export: read as a semicolon-separated file with no header row.
# NOTE(review): absolute, user-specific path; consider making this configurable.
telecom_log_file = "C:/Users/Jean-BaptistePERNEY/Documents/InboundVoiceCalls.csv"
headers = [
    "CallTime",
    "CallTimeHour",
    "CallID",
    "CallServiceID",
    "CallCLID",
    "CallDNIS",
    "CallDisconnectionStatus",
    "AgentID",
    "AgentName",
    "CallIVRDuration",
    "CallWaitingDuration",
    "CallAgentCommunicationDuration",
    "CallPCPDuration",
    "CallTransferringServerName",
    "CallDestinationServerName",
    "CallNetworkId",
    "CallToTransfer",
    "CallUserUserInfo",
    "CallDisconnectingParty",
    "Rejected",
    "CallerFirstName",
    "CallerLastName",
    "CallType",
    "ReturningPartyFirstName",
    "ReturningPartyLastName",
    "CallDNISAlias",
]
telecom_log_df = pd.read_csv(telecom_log_file, sep=";", header=None, names=headers)

# Normalise caller IDs to zero-padded strings and parse day-first call times.
telecom_log_df["CallCLID"] = telecom_log_df["CallCLID"].apply(format_phone_number)
telecom_log_df['CallTime'] = pd.to_datetime(telecom_log_df['CallTime'], dayfirst=True)

# Keep calls with a non-zero agent communication duration that started
# within [start_date, end_date).
start_date = datetime(2022, 1, 4)
end_date = datetime(2022, 1, 5)
has_agent_time = telecom_log_df['CallAgentCommunicationDuration'] != 0
in_window = (telecom_log_df['CallTime'] >= start_date) & (telecom_log_df['CallTime'] < end_date)
telecom_log_df = telecom_log_df[has_agent_time & in_window]

In [23]:
# Numbers that were actually recognised (everything except the sentinel).
valid_phone_numbers = [
    number
    for _timestamp, _channel, number, _duration in phone_numbers_list
    if number != 'NOT_TEL_NUMBER'
]

# Keep only the log rows whose caller ID is one of those numbers, restricted
# to the columns used for the matching step.
caller_is_known = telecom_log_df["CallCLID"].isin(valid_phone_numbers)
telecom_log_df = telecom_log_df.loc[
    caller_is_known,
    ['CallCLID', 'CallTime', 'AgentID', 'AgentName', 'CallAgentCommunicationDuration'],
]

In [24]:
# Pair every transcript entry with the telecom-log calls from the same number.
results = []

for ts, channel, num, dur in phone_numbers_list:
    calls_from_number = telecom_log_df[telecom_log_df["CallCLID"] == num]

    # One result tuple per matching log row:
    #   time difference     = workstation timestamp - log call time (seconds)
    #   duration difference = |log agent communication duration - recording length|
    results.extend(
        (
            num,
            channel,
            call['AgentID'],
            call['CallTime'],
            ts,
            (ts - call['CallTime']).total_seconds(),
            abs(call['CallAgentCommunicationDuration'] - dur),
        )
        for _, call in calls_from_number.iterrows()
    )

# Assemble the collected tuples into a DataFrame.
result_columns = [
    'Phone Number',
    'Workstastion channel',
    'AgentID',
    'Telecom Log Timestamp',
    'Workstation Timestamp',
    'Time Difference (s)',
    'Duration Difference (s)',
]
results_df = pd.DataFrame(results, columns=result_columns)

# Render a signed number of seconds as "[-]<m>min <s>s" or "[-]<s>s".
def format_time(seconds):
    """Format a signed duration in seconds as a short readable string.

    Args:
        seconds: Duration in seconds; may be negative or fractional.

    Returns:
        ``"<m>min <s>s"`` when at least one whole minute is present,
        otherwise ``"<s>s"``. Negative inputs get a leading ``"-"``.
        Fractions of a second are truncated.
    """
    sign = "-" if seconds < 0 else ""
    magnitude = -seconds if seconds < 0 else seconds

    whole_minutes = int(magnitude // 60)
    leftover_seconds = int(magnitude % 60)

    if whole_minutes <= 0:
        return f"{sign}{leftover_seconds}s"
    return f"{sign}{whole_minutes}min {leftover_seconds}s"

# Turn both difference columns into readable strings.
def _format_duration_gap(gap_seconds):
    """Render a gap in seconds as "<m>min <s>s", or "<s>s" when under a minute."""
    if gap_seconds >= 60:
        return f"{int(gap_seconds // 60)}min {int(gap_seconds % 60)}s"
    return f"{int(gap_seconds)}s"


results_df['Time Difference (s)'] = results_df['Time Difference (s)'].apply(format_time)
results_df['Duration Difference (s)'] = results_df['Duration Difference (s)'].apply(_format_duration_gap)

# Discard rows whose formatted time difference is negative (leading '-'),
# i.e. the workstation timestamp precedes the telecom-log call time.
is_not_negative = results_df['Time Difference (s)'].apply(lambda text: not text.startswith('-'))
results_df = results_df[is_not_negative]

results_df.head(50)

Unnamed: 0,Phone Number,Workstastion channel,AgentID,Telecom Log Timestamp,Workstation Timestamp,Time Difference (s),Duration Difference (s)
0,624480237,ch28,dubois1899,2022-01-04 12:01:09,2022-01-04 12:05:46,4min 37s,4s
1,663268302,ch30,torregrossa1897,2022-01-04 11:53:16,2022-01-04 11:55:46,2min 30s,3s
2,688257075,ch15,kosno1889,2022-01-04 10:39:40,2022-01-04 10:59:55,20min 15s,3s
5,613836086,ch11,harend1879,2022-01-04 11:52:39,2022-01-04 11:55:46,3min 7s,14s
6,658344086,ch28,dubois1899,2022-01-04 11:58:17,2022-01-04 12:02:13,3min 56s,3s
7,668221683,ch11,harend1879,2022-01-04 09:00:54,2022-01-04 09:18:52,17min 58s,3s
8,758071330,ch28,dubois1899,2022-01-04 14:41:54,2022-01-04 14:42:26,32s,14s
9,680662969,ch25,caudroy1758,2022-01-04 12:30:43,2022-01-04 12:43:30,12min 47s,4s
10,664875254,ch28,dubois1899,2022-01-04 13:32:30,2022-01-04 13:42:09,9min 39s,4s
12,695831933,ch30,torregrossa1897,2022-01-04 09:06:56,2022-01-04 09:30:11,23min 15s,56s


## Data insights
- Fichiers avec une différence de durée supérieure à quelques secondes ==> fichiers fusionnés
- Une ligne télécom log correspond à une conversation complète et pas à un mp3 précisément 
  - Ce qui veut dire qu'une ligne télécom log peut rassembler plusieurs mp3
- Les fichiers mp3 ont l'air d'être enregistrés plus vite la nuit que la journée -> peut-être car il y a moins d'appels

In [25]:
def channel_agent_schedule(df):
    """List the unique agents seen on each workstation channel per part of day.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Workstation Timestamp' (any value accepted
        by ``pd.to_datetime``), 'Workstastion channel' and 'AgentID'.
        The frame is not modified: all derived columns are added to a copy.

    Returns
    -------
    pandas.Series
        Indexed by ('Workstastion channel', 'PartOfDay'); each value holds the
        unique 'AgentID' values of that group. 'PartOfDay' is 'morning' for
        06:00-11:59, 'afternoon' for 12:00-17:59 and 'night' otherwise.
    """
    def part_of_day(timestamp):
        # Hour-based check; same boundaries as [06:00, 12:00) and [12:00, 18:00).
        if 6 <= timestamp.hour < 12:
            return 'morning'
        if 12 <= timestamp.hour < 18:
            return 'afternoon'
        return 'night'

    timestamps = pd.to_datetime(df['Workstation Timestamp'])

    # assign() returns a new frame, so the caller's DataFrame keeps its
    # original columns and dtypes.
    labelled = df.assign(**{
        'Workstation Timestamp': timestamps,
        'PartOfDay': timestamps.apply(part_of_day),
    })

    # Group by channel and part of day, then collect the distinct agents.
    return labelled.groupby(['Workstastion channel', 'PartOfDay'])['AgentID'].unique()

# Build the (channel, part of day) -> agents mapping from the matched calls in results_df.
channel_mapping = channel_agent_schedule(results_df)

# Bare expression as the last line of the cell so the notebook displays the mapping.
channel_mapping

Workstastion channel  PartOfDay
ch11                  morning      [harend1879, sylla1815, larribamarie1772]
ch15                  morning                                    [kosno1889]
ch17                  night                           [sylla1815, bigot1402]
ch25                  afternoon                                [caudroy1758]
ch26                  afternoon              [burette1594, larribamarie1772]
                      night                                    [burette1594]
ch28                  afternoon     [dubois1899, benchaalia1896, titous1564]
                      morning                      [dubois1899, thyreau1755]
ch30                  morning                              [torregrossa1897]
ch9                   afternoon                                  [roffe1445]
                      morning                                   [titous1564]
Name: AgentID, dtype: object