### Add scripts path to the notebook

In [1]:
import sys
import os

current_dir = os.getcwd()
print(current_dir)

# The project root is taken to be the parent of the working directory, with the
# helper modules in its `scripts` sub-folder.
# NOTE(review): this assumes the kernel is started inside the notebooks folder.
parent_dir = os.path.dirname(current_dir)
scripts_path = os.path.join(parent_dir, 'scripts')

# Put both directories at the front of the import path (scripts first, then
# the project root). Any existing entry is removed before inserting, so
# re-running this cell never piles up duplicates. The former extra
# `sys.path.append(os.path.abspath('..'))` resolved to the same directory as
# `parent_dir` and is therefore no longer needed.
for import_root in (parent_dir, scripts_path):
    while import_root in sys.path:
        sys.path.remove(import_root)
    sys.path.insert(0, import_root)

d:\KifiyaAIM-Course\Week - 2\TellCo-Profitability-Opportunity-Analysis\notebooks


### Import Statements

In [3]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from scripts.database_client import DB_Client

### Initialize DB_Client

In [4]:
# Read the database connection settings from environment variables.
# Each value is a string, or None when the variable is not set.
# (`passowrd` is misspelled, but it is the name the DB_Client cell reads.)
host, user_name, passowrd, port, database = (
    os.getenv(variable)
    for variable in ("DB_HOST", "DB_USER", "DB_PASSWORD", "DB_PORT", "DB_NAME")
)

In [5]:
# Build the database client from the connection settings read from the
# environment. Every value comes from os.getenv, so each one (including
# `port`) is a string, or None when its variable is not set.
db_client = DB_Client(
    host=host,
    user_name=user_name,
    password=passowrd,
    port=port,
    database_name=database
)

### Load the data from the database

In [6]:
# Load the dataset through the project's DB_Client helper. The message printed
# under this cell shows the helper reading it with pd.read_sql_query.
data = db_client.dump_data()

  response = pd.read_sql_query(sql=query, con=self.connection)


### Data Cleaning

In [7]:
# Fraction of missing values in every column (0 = none missing, 1 = all missing)
data.isna().mean()

Bearer Id                                   0.006607
Start                                       0.000007
Start ms                                    0.000007
End                                         0.000007
End ms                                      0.000007
Dur. (ms)                                   0.000007
IMSI                                        0.003800
MSISDN/Number                               0.007107
IMEI                                        0.003813
Last Location Name                          0.007687
Avg RTT DL (ms)                             0.185525
Avg RTT UL (ms)                             0.185412
Avg Bearer TP DL (kbps)                     0.000007
Avg Bearer TP UL (kbps)                     0.000007
TCP DL Retrans. Vol (Bytes)                 0.587636
TCP UL Retrans. Vol (Bytes)                 0.644322
DL TP < 50 Kbps (%)                         0.005027
50 Kbps < DL TP < 250 Kbps (%)              0.005027
250 Kbps < DL TP < 1 Mbps (%)               0.

As described in _user_overview_analysis.ipynb_, some columns have a lot of missing values, so dropping every row that contains a null isn't the best option. Cleaning only the features that are important/relevant to the current task is the better option. The features of interest (columns) that contain null values are put in a list, and the cleaning happens only in those columns, just for this specific task.

In [8]:
# Columns relevant to this task whose missing values are going to be cleaned
columns_of_interest = [
    "MSISDN/Number",
    "IMSI",
    "Bearer Id",
    "Start",
    "Start ms",
    "End",
    "End ms",
    "Dur. (ms)",
    "Dur. (ms).1",
]

# Fraction of missing values in each of those columns
data.loc[:, columns_of_interest].isna().mean()

MSISDN/Number    0.007107
IMSI             0.003800
Bearer Id        0.006607
Start            0.000007
Start ms         0.000007
End              0.000007
End ms           0.000007
Dur. (ms)        0.000007
Dur. (ms).1      0.000007
dtype: float64

I have decided to drop the rows with missing values in the given columns because the amount of data we lose is very small.

In [9]:
# Drop the rows that are missing a value in any of the columns of interest.
# The result is re-bound to `data` instead of using inplace=True, so the frame
# object returned by the loader is not mutated behind the scenes.
data = data.dropna(subset=columns_of_interest)

### Tracking user engagement 

1) Group data based on the MSISDN (phone number)

In [12]:
# Group the session rows by phone number so metrics can be aggregated per customer
customer_grouping = data.groupby(by="MSISDN/Number")

2) Aggregate the required data. Since every entry in the database is a session, we group the entries by phone number (which we did above) and then calculate the session frequency by counting the _Bearer Id / sessionId_ entries, sum up the download and upload bytes of the sessions, and add up the durations of the sessions.

In [37]:
# Build one row of engagement metrics per phone number.
# Named aggregation produces the final column names directly, and `traffic`
# (upload + download) is derived from the two byte totals.
customer_data = (
    customer_grouping
    .agg(
        session_freq=("Bearer Id", "count"),        # number of sessions per SIM card
        duration=("Dur. (ms)", "sum"),              # total session duration per SIM card
        upload_tot=("Total UL (Bytes)", "sum"),     # total uploaded bytes per SIM card
        download_tot=("Total DL (Bytes)", "sum"),   # total downloaded bytes per SIM card
    )
    .assign(traffic=lambda totals: totals["upload_tot"] + totals["download_tot"])
)

In [38]:
# Check the metric column names after the renaming
customer_data.columns

Index(['session_freq', 'duration', 'upload_tot', 'download_tot', 'traffic'], dtype='object')

3) Reporting the top 10 customers per metric

In [42]:
# find the top 10 users for session frequency
frequency_order = customer_data.sort_values(by="session_freq", ascending=False)
frequency_order.head(10)

Unnamed: 0_level_0,session_freq,duration,upload_tot,download_tot,traffic
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
33626320000.0,18,8791927.0,669650721.0,7301517000.0,7971167000.0
33614890000.0,17,9966898.0,689483001.0,8156743000.0,8846226000.0
33625780000.0,17,18553754.0,729577380.0,7770043000.0,8499621000.0
33659730000.0,16,4035428.0,624260321.0,7081602000.0,7705863000.0
33760540000.0,15,9279434.0,703478581.0,7811295000.0,8514774000.0
33675880000.0,15,4865947.0,581568792.0,7309542000.0,7891111000.0
33667160000.0,13,8744914.0,566326364.0,5052068000.0,5618394000.0
33760410000.0,12,5321667.0,521518890.0,6610852000.0,7132371000.0
33786320000.0,12,4059652.0,466330265.0,5155901000.0,5622232000.0
33627080000.0,12,4703516.0,445251947.0,5309479000.0,5754731000.0


In [44]:
# find the top 10 users for session duration
duration_order = customer_data.sort_values(by="duration", ascending=False)
duration_order.head(10)

Unnamed: 0_level_0,session_freq,duration,upload_tot,download_tot,traffic
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
33625780000.0,17,18553754.0,729577380.0,7770043000.0,8499621000.0
33614890000.0,17,9966898.0,689483001.0,8156743000.0,8846226000.0
33760540000.0,15,9279434.0,703478581.0,7811295000.0,8514774000.0
33626320000.0,18,8791927.0,669650721.0,7301517000.0,7971167000.0
33667160000.0,13,8744914.0,566326364.0,5052068000.0,5618394000.0
33662840000.0,9,6614270.0,406424372.0,3593674000.0,4000098000.0
33664690000.0,9,6288730.0,402180804.0,5279827000.0,5682007000.0
33603130000.0,12,6287761.0,435587631.0,4540607000.0,4976195000.0
33667460000.0,6,5649882.0,266222647.0,2334299000.0,2600522000.0
33760410000.0,12,5321667.0,521518890.0,6610852000.0,7132371000.0


In [45]:
# find the top 10 users by total traffic 
traffic_order = customer_data.sort_values(by="traffic", ascending=False)
traffic_order.head(10)

Unnamed: 0_level_0,session_freq,duration,upload_tot,download_tot,traffic
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
33614890000.0,17,9966898.0,689483001.0,8156743000.0,8846226000.0
33760540000.0,15,9279434.0,703478581.0,7811295000.0,8514774000.0
33625780000.0,17,18553754.0,729577380.0,7770043000.0,8499621000.0
33626320000.0,18,8791927.0,669650721.0,7301517000.0,7971167000.0
33675880000.0,15,4865947.0,581568792.0,7309542000.0,7891111000.0
33659730000.0,16,4035428.0,624260321.0,7081602000.0,7705863000.0
33666460000.0,11,4536757.0,405060976.0,6903440000.0,7308501000.0
33760410000.0,12,5321667.0,521518890.0,6610852000.0,7132371000.0
33664710000.0,11,2927785.0,471244453.0,6400774000.0,6872018000.0
33698790000.0,11,5169128.0,530343105.0,6010556000.0,6540899000.0
