In [1]:
import pandas as pd
import numpy as np 

In [2]:
def load_dataset(path):
    df = pd.read_csv(path)
    return df

df = load_dataset('phone_data.csv')
df.head()


Unnamed: 0,index,date,duration,item,month,network,network_type
0,0,15/10/14 06:58,34.429,data,2014-11,data,data
1,1,15/10/14 06:58,13.0,call,2014-11,Vodafone,mobile
2,2,15/10/14 14:46,23.0,call,2014-11,Meteor,mobile
3,3,15/10/14 14:48,4.0,call,2014-11,Tesco,mobile
4,4,15/10/14 17:27,4.0,call,2014-11,Tesco,mobile


In [3]:
def basic_info(df):
    info = {
        'head': df.head(),
        'info': df.info(),
        'describe': df.describe(),
        'null_counts': df.isnull().sum()
    }
    return info

info = basic_info(df)
print(info['head'])
print(info['describe'])
print(info['null_counts'])


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 830 entries, 0 to 829
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   index         830 non-null    int64  
 1   date          830 non-null    object 
 2   duration      830 non-null    float64
 3   item          830 non-null    object 
 4   month         830 non-null    object 
 5   network       830 non-null    object 
 6   network_type  830 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 45.5+ KB
   index            date  duration  item    month   network network_type
0      0  15/10/14 06:58    34.429  data  2014-11      data         data
1      1  15/10/14 06:58    13.000  call  2014-11  Vodafone       mobile
2      2  15/10/14 14:46    23.000  call  2014-11    Meteor       mobile
3      3  15/10/14 14:48     4.000  call  2014-11     Tesco       mobile
4      4  15/10/14 17:27     4.000  call  2014-11     Tesco       mobile
        

In [4]:
def unique_counts(df):
    return df.nunique()

unique_counts(df)


index           830
date            747
duration        221
item              3
month             5
network           9
network_type      6
dtype: int64

In [5]:
def total_calls(df):
    return len(df[df['item'] == 'call'])

total_calls(df)

388

In [6]:
def total_call_duration(df):
    return df.loc[df['item']== 'call','duration'].sum()

total_call_duration


<function __main__.total_call_duration(df)>

In [7]:
def total_sms(df):
    return len(df[df['item'] == 'sms'])

total_sms(df)

292

In [8]:
def most_used_network(df):
    return df['network'].mode()[0]

most_used_network(df)

'Three'

In [9]:
def average_call_duration(df):
    return df.loc[df['item'] == 'call', 'duration'].mean()

average_call_duration(df)


np.float64(237.94072164948454)

In [10]:
def top_month(df):
    df['month'] = pd.to_datetime(df['date']).dt.month
    return df['month'].mode()[0]

top_month(df)


  df['month'] = pd.to_datetime(df['date']).dt.month


np.int32(12)

In [11]:
def duration_by_network(df):
    return df.groupby('network')['duration'].sum()

duration_by_network(df)


network
Meteor        7233.00
Tesco        13841.00
Three        36551.00
Vodafone     14770.00
data          5164.35
landline     18433.00
special          3.00
voicemail     1775.00
world            7.00
Name: duration, dtype: float64

In [12]:
def top_call_day(df):
    df['day'] = pd.to_datetime(df['date']).dt.date
    calls_per_day = df[df['item']== 'call'].groupby('day').size()
    return calls_per_day
top_call_day(df)

  df['day'] = pd.to_datetime(df['date']).dt.date


day
2014-01-11    2
2014-02-11    5
2014-02-12    1
2014-03-11    8
2014-03-12    2
             ..
2015-09-02    3
2015-10-01    5
2015-11-01    1
2015-12-01    3
2015-12-02    2
Length: 118, dtype: int64

In [13]:
def percentage_by_item(df):
    total = len(df)
    return (df['item'].value_counts() / total) * 100

percentage_by_item(df)

item
call    46.746988
sms     35.180723
data    18.072289
Name: count, dtype: float64

In [14]:
def categorize_call_type(df):
    def call_type(duration):
        if duration < 1:
            return 'short'
        elif duration <= 5:
            return 'medium'
        else:
            return 'long'
    df['call_type'] = df.apply(lambda x: call_type(x['duration']) if x['item'] == 'call' else np.nan, axis=1)
    return df

df = categorize_call_type(df)
df.head()

Unnamed: 0,index,date,duration,item,month,network,network_type,day,call_type
0,0,15/10/14 06:58,34.429,data,10,data,data,2014-10-15,
1,1,15/10/14 06:58,13.0,call,10,Vodafone,mobile,2014-10-15,long
2,2,15/10/14 14:46,23.0,call,10,Meteor,mobile,2014-10-15,long
3,3,15/10/14 14:48,4.0,call,10,Tesco,mobile,2014-10-15,medium
4,4,15/10/14 17:27,4.0,call,10,Tesco,mobile,2014-10-15,medium


In [15]:
def avg_duration_per_network(df):
    return df[df['item'] == 'call'].groupby('network')['duration'].mean()

avg_duration_per_network(df)

network
Meteor       133.333333
Tesco        194.760563
Three        284.875000
Vodafone     221.530303
landline     438.880952
voicemail     65.740741
Name: duration, dtype: float64

In [16]:
def top_avg_duration_network(df):
    avg_dur = avg_duration_per_network(df)
    return avg_dur.idxmax()

top_avg_duration_network(df)

'landline'

In [17]:
def comms_per_date(df):
    df['date_only'] = pd.to_datetime(df['date']).dt.date
    return df.groupby('date_only').size()

comms_per_date(df)

  df['date_only'] = pd.to_datetime(df['date']).dt.date


date_only
2014-01-11     3
2014-01-12     3
2014-02-11     6
2014-02-12     2
2014-03-11    11
              ..
2015-11-02     1
2015-11-03     1
2015-12-01     6
2015-12-02     3
2015-12-03     1
Length: 151, dtype: int64

In [18]:
def total_duration_per_day(df):
    df['date_only'] = pd.to_datetime(df['date']).dt.date
    return df.groupby('date_only')['duration'].sum()

total_duration_per_day(df)


  df['date_only'] = pd.to_datetime(df['date']).dt.date


date_only
2014-01-11     993.429
2014-01-12      36.429
2014-02-11    2718.429
2014-02-12     560.429
2014-03-11     440.429
                ...   
2015-11-02      34.429
2015-11-03      34.429
2015-12-01      65.429
2015-12-02     189.429
2015-12-03      34.429
Name: duration, Length: 151, dtype: float64

In [19]:
def most_active_hour(df):
    df['hour'] = pd.to_datetime(df['date']).dt.hour
    return df['hour'].mode()[0]

most_active_hour(df)


  df['hour'] = pd.to_datetime(df['date']).dt.hour


np.int32(6)

In [20]:
def item_network_pivot(df):
    return pd.pivot_table(df, index='network', columns='item', aggfunc='size', fill_value=0)

item_network_pivot(df)

item,call,data,sms
network,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Meteor,54,0,33
Tesco,71,0,13
Three,128,0,87
Vodafone,66,0,149
data,0,150,0
landline,42,0,0
special,0,0,3
voicemail,27,0,0
world,0,0,7


In [21]:
def rolling_avg_duration(df):
    df['date_only'] = pd.to_datetime(df['date']).dt.date
    calls = df[df['item'] == 'call'].groupby('date_only')['duration'].sum()
    return calls.rolling(2).mean()

rolling_avg_duration(df)

  df['date_only'] = pd.to_datetime(df['date']).dt.date


date_only
2014-01-11       NaN
2014-02-11    1821.5
2014-02-12    1605.0
2014-03-11     465.0
2014-03-12     815.5
               ...  
2015-09-02     782.5
2015-10-01    1428.5
2015-11-01     793.5
2015-12-01     115.0
2015-12-02      92.0
Name: duration, Length: 118, dtype: float64

In [22]:
def duration_anomalies(df):
    calls = df[df['item'] == 'call']['duration']
    mean, std = calls.mean(), calls.std()
    return df[(df['item'] == 'call') & ((df['duration'] > mean + 3 * std) | (df['duration'] < mean - 3 * std))]

duration_anomalies(df)

Unnamed: 0,index,date,duration,item,month,network,network_type,day,call_type,date_only,hour
742,742,17/02/15 19:09,2328.0,call,2,Three,mobile,2015-02-17,long,2015-02-17,19
816,816,4/3/2015 12:29,10528.0,call,4,landline,landline,2015-04-03,long,2015-04-03,12


In [23]:
def monthly_summary(df):
    df['month'] = pd.to_datetime(df['date']).dt.month
    summary = df.groupby(['month', 'item']).agg(
        count=('item', 'size'),
        total_duration=('duration', 'sum')
    ).reset_index()
    return summary

monthly_summary(df)

  df['month'] = pd.to_datetime(df['date']).dt.month


Unnamed: 0,month,item,count,total_duration
0,1,call,54,13068.0
1,1,data,24,826.296
2,1,sms,29,29.0
3,2,call,47,12029.0
4,2,data,21,723.009
5,2,sms,23,23.0
6,3,call,18,5121.0
7,3,data,6,206.574
8,3,sms,6,6.0
9,4,call,19,11360.0


In [24]:
def cumulative_duration(df):
    df['date'] = pd.to_datetime(df['date'])
    calls = df[df['item'] == 'call'].sort_values('date')
    calls['cumulative_duration'] = calls['duration'].cumsum()
    return calls[['date', 'cumulative_duration']]

cumulative_duration(df)

  df['date'] = pd.to_datetime(df['date'])


Unnamed: 0,date,cumulative_duration
112,2014-01-11 15:13:00,955.0
113,2014-01-11 17:54:00,959.0
115,2014-02-11 14:34:00,1418.0
116,2014-02-11 15:44:00,2441.0
117,2014-02-11 19:16:00,3466.0
...,...,...
572,2015-12-01 12:01:00,92155.0
573,2015-12-01 12:01:00,92162.0
574,2015-12-01 18:23:00,92166.0
729,2015-12-02 20:15:00,92235.0


In [25]:
def longest_same_network(df):
    max_streak = (df['network'] != df['network'].shift()).cumsum()
    streak_counts = max_streak.value_counts().max()
    return streak_counts

longest_same_network(df)

np.int64(19)

In [26]:
def time_differences(df):
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values('date')
    df['time_diff'] = df['date'].diff()
    return df[['date', 'time_diff']]

time_differences(df).head()

Unnamed: 0,date,time_diff
111,2014-01-11 06:58:00,NaT
112,2014-01-11 15:13:00,0 days 08:15:00
113,2014-01-11 17:54:00,0 days 02:41:00
338,2014-01-12 06:58:00,0 days 13:04:00
339,2014-01-12 12:51:00,0 days 05:53:00


In [27]:
def categorize_activity(df):
    comms = df.groupby(pd.to_datetime(df['date']).dt.date).size()
    threshold = comms.mean()
    return comms.apply(lambda x: 'high activity' if x > threshold else 'low activity')

categorize_activity(df)

date
2014-01-11     low activity
2014-01-12     low activity
2014-02-11    high activity
2014-02-12     low activity
2014-03-11    high activity
                  ...      
2015-11-02     low activity
2015-11-03     low activity
2015-12-01    high activity
2015-12-02     low activity
2015-12-03     low activity
Length: 151, dtype: object

In [28]:
def top_network_by_item(df):
    return df.groupby('item')['network'].agg(lambda x: x.value_counts().idxmax())

top_network_by_item(df)

item
call       Three
data        data
sms     Vodafone
Name: network, dtype: object

In [29]:
def call_sms_ratio(df):
    calls = df[df['item'] == 'call'].groupby('network').size()
    sms = df[df['item'] == 'sms'].groupby('network').size()
    ratio = (calls / sms).fillna(0)
    return ratio

call_sms_ratio(df)

network
Meteor       1.636364
Tesco        5.461538
Three        1.471264
Vodafone     0.442953
landline     0.000000
special      0.000000
voicemail    0.000000
world        0.000000
dtype: float64

In [30]:
def network_summary(df):
    grouped = df.groupby('network').agg(
        total_calls=('item', lambda x: (x == 'call').sum()),
        total_sms=('item', lambda x: (x == 'sms').sum()),
        total_duration=('duration', 'sum'),
        avg_duration=('duration', 'mean')
    )
    return grouped

network_summary(df)

Unnamed: 0_level_0,total_calls,total_sms,total_duration,avg_duration
network,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Meteor,54,33,7233.0,83.137931
Tesco,71,13,13841.0,164.77381
Three,128,87,36551.0,170.004651
Vodafone,66,149,14770.0,68.697674
data,0,0,5164.35,34.429
landline,42,0,18433.0,438.880952
special,0,3,3.0,1.0
voicemail,27,0,1775.0,65.740741
world,0,7,7.0,1.0
