In [1]:
import numpy as np
import pandas as pd 
from tqdm import tqdm
from dateutil import parser

In [2]:
df = pd.read_json('./calllog.json')
print(df.shape)
df.head()

(16877, 4)


Unnamed: 0,number,calltype,duration,date
0,7958309153,incomming-missed,0,2020-02-13 07:30:04.464000+00:00
1,7958309153,outgoing-rejected,0,2020-06-12 01:29:16.039000+00:00
2,7958309153,incomming-connected,4797,2020-10-07 01:03:33.778000+00:00
3,7958309153,incomming-connected,4741,2020-09-05 12:46:02.822000+00:00
4,7958309153,incomming-connected,2082,2020-09-13 04:05:35.482000+00:00


In [3]:
df.calltype.unique()

array(['incomming-missed', 'outgoing-rejected', 'incomming-connected',
       'incomming-rejected', 'outgoing-connected',
       'outgoing-notconnected'], dtype=object)

In [4]:
df['callStatus1'] = df.calltype.apply(lambda x: x.split('-')[0])
df['callStatus2'] = df.calltype.apply(lambda x: x.split('-')[1])
df['timeStampMilli'] = df.date.apply(lambda x: parser.parse(str(x)).microsecond)
df.head(n=5)

Unnamed: 0,number,calltype,duration,date,callStatus1,callStatus2,timeStampMilli
0,7958309153,incomming-missed,0,2020-02-13 07:30:04.464000+00:00,incomming,missed,464000
1,7958309153,outgoing-rejected,0,2020-06-12 01:29:16.039000+00:00,outgoing,rejected,39000
2,7958309153,incomming-connected,4797,2020-10-07 01:03:33.778000+00:00,incomming,connected,778000
3,7958309153,incomming-connected,4741,2020-09-05 12:46:02.822000+00:00,incomming,connected,822000
4,7958309153,incomming-connected,2082,2020-09-13 04:05:35.482000+00:00,incomming,connected,482000


### Number

In [5]:
number = np.array(df.number.unique())
number.shape

(1000,)

### Frequency

In [6]:
total_outgoing = df['callStatus1'].value_counts()[0]  #Total Outgoing Calls
total_incoming = df['callStatus1'].value_counts()[1]  #Total Incoming Calls

In [7]:
def frequency(number):
    """ Calculates Frequency for outgoing call from A to B """
    
    C_AB = 0
    C_AK = total_outgoing
    sub_df = df[df['number'] == number]
    C_AB = sub_df[sub_df['callStatus1'] == 'outgoing'].shape[0]
                
    return C_AB/C_AK

### Intimacy

In [8]:
D_AK = df.groupby(['callStatus1'])['duration'].sum()[1]  #total outgoing call duration
D_KA = df.groupby(['callStatus1'])['duration'].sum()[0]  #total incoming call duration

In [9]:
def intimacy(number):
    """ Calculates Intimacy for outgoing call from A to B """
    
    tendency = D_AK/(D_AK + D_KA)
    sub_df = df[df['number'] == number]
    
    #Duration of outgoing calling 
    try: 
        D_AB = sub_df.groupby(['callStatus1'])['duration'].sum()[1] 
    except:
        D_AB = 0
        
    #Duration of incoming calling
    try:
        D_BA = sub_df.groupby(['callStatus1'])['duration'].sum()[0]
    except:
        D_BA = 0
    
    S_AB = D_AB/(D_AK + D_KA)
    R_BA = D_BA/(D_AK + D_KA)
    
    return tendency * S_AB + (1-tendency) * R_BA

### Recency

In [10]:
T_AK_Max = max(df.timeStampMilli)/1000   #Max Timestamp
T_AK_Min = min(df.timeStampMilli)/1000   #Min Timestamp

In [11]:
def recency(number):
    sub_df = df[df['number'] == number]
    sub_df = sub_df.sort_values('date')
    
    T_AB = sub_df.iloc[-1,6]/1000    #timestamp for recent call between A to B
    
    return (T_AB - T_AK_Min)/(T_AK_Max-T_AK_Min)


### Build Dataset with Features

In [12]:
Values = np.zeros((1000,4))
Values.shape

(1000, 4)

In [13]:
col_names = ['Number','Frequency','Intimancy','Recency']
callLogDataframe = pd.DataFrame(data=Values, columns=col_names)
callLogDataframe.head()

Unnamed: 0,Number,Frequency,Intimancy,Recency
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0


In [14]:
for i in tqdm(range(1000)):
    callLogDataframe.iloc[i,0] = number[i]
    callLogDataframe.iloc[i,1] = frequency(callLogDataframe.iloc[i,0])
    callLogDataframe.iloc[i,2] = intimacy(callLogDataframe.iloc[i,0])
    callLogDataframe.iloc[i,3] = recency(callLogDataframe.iloc[i,0])

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:09<00:00, 110.49it/s]


In [15]:
callLogDataframe.head()

Unnamed: 0,Number,Frequency,Intimancy,Recency
0,7958309000.0,0.000235,0.00059,0.812813
1,9154024000.0,0.000353,0.000186,0.880881
2,9133218000.0,0.001766,0.00084,0.94995
3,9032735000.0,0.000118,0.0,0.231231
4,6964498000.0,0.001059,0.00031,0.652653


In [16]:
callLogDataframe.to_csv('Call_Log_Feature.csv',index=False)

### Calculate Alpha, Beta & Gamma

In [17]:
#Calculate Alpha
total_call = len(df)
alpha = total_outgoing/total_call

#Calculate Beta
mean_duration = np.mean(df['duration'])
std_callDuration = np.std(df['duration'])
NMD = mean_duration + 3 * std_callDuration

beta = mean_duration/NMD

#Calculate Gamma
gamma = (1/1000)*(np.sum(callLogDataframe['Recency'])/(T_AK_Max - T_AK_Min))