### Add scripts path to the notebook

In [18]:
import sys
import os

# Resolve the project root as the parent of the notebook's working directory.
current_dir = os.getcwd()
print(current_dir)

parent_dir = os.path.dirname(current_dir)
scripts_path = os.path.join(parent_dir, 'scripts')

# Put the project root and its scripts/ folder at the front of sys.path so the
# `scripts.*` imports resolve. Any existing copy of an entry is removed before it
# is re-inserted, which keeps this cell idempotent: re-running it leaves exactly
# one entry per directory instead of growing sys.path on every execution.
# Insertion order matters: scripts_path is inserted last so it ends up first.
for import_root in (parent_dir, scripts_path):
    while import_root in sys.path:
        sys.path.remove(import_root)
    sys.path.insert(0, import_root)

d:\KifiyaAIM-Course\Week - 2\TellCo-Profitability-Opportunity-Analysis\notebooks


### Import Statements

In [19]:
import math
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.cluster import KMeans
from sklearn.preprocessing import Normalizer

from scripts.database_client import DB_Client
from scripts.data_cleaner import DataCleaner
from scripts.user_satisfaction import UserStatisfactionCalculator

### Initialize DB_Client

In [20]:
# Read the database connection settings from environment variables.
# os.getenv returns None for any variable that is not set.
host, user_name, passowrd, port, database = (
    os.getenv(variable)
    for variable in ("DB_HOST", "DB_USER", "DB_PASSWORD", "DB_PORT", "DB_NAME")
)

In [21]:
# Gather the connection settings read from the environment into one mapping,
# then hand them to the project's database client as keyword arguments.
connection_settings = {
    "host": host,
    "user_name": user_name,
    "password": passowrd,
    "port": port,
    "database_name": database,
}
db_client = DB_Client(**connection_settings)

### Load the data from the database

In [22]:
# Load the dataset through the database client.
# Presumably returns a pandas DataFrame: later cells index it by column lists
# and call .isna() on it — confirm against scripts.database_client.
data = db_client.dump_data()

  response = pd.read_sql_query(sql=query, con=self.connection)


### Clean the data

1) Initialize the data cleaner

In [23]:
# Wrap the loaded data in the project's DataCleaner; the cells below call
# fill_na, find_na and drop_na on this object.
cleaner = DataCleaner(data=data)

2) Clean the experience data

In [24]:
# Columns examined when cleaning the experience data.
columns_of_interest = [
    "MSISDN/Number",
    "Avg RTT DL (ms)",
    "Avg RTT UL (ms)",
    "TCP DL Retrans. Vol (Bytes)",
    "TCP UL Retrans. Vol (Bytes)",
    "Handset Type",
    "Avg Bearer TP DL (kbps)",
    "Avg Bearer TP UL (kbps)",
]

# Columns whose NA values are replaced with the mode.
mode_columns = ["MSISDN/Number", "Handset Type"]

# Every remaining column of interest has its NA values replaced with the mean.
mean_columns = list(
    filter(lambda name: name not in mode_columns, columns_of_interest)
)

In [25]:
# Fill the NA values group by group: the mode columns first (using the mode),
# then the numeric columns (using the mean).
for cols, strategy in ((mode_columns, 'mode'), (mean_columns, 'mean')):
    data[cols] = cleaner.fill_na(columns=cols, method=strategy)

# Show the remaining fraction of NA values per cleaned column.
data[mean_columns + mode_columns].isna().mean()

Avg RTT DL (ms)                0.0
Avg RTT UL (ms)                0.0
TCP DL Retrans. Vol (Bytes)    0.0
TCP UL Retrans. Vol (Bytes)    0.0
Avg Bearer TP DL (kbps)        0.0
Avg Bearer TP UL (kbps)        0.0
MSISDN/Number                  0.0
Handset Type                   0.0
dtype: float64

3) Clean the engagement data

In [26]:
# Session timing columns considered for the engagement data.
columns_of_interest = [
    "Start",
    "Start ms",
    "End",
    "End ms",
    "Dur. (ms)",
    "Dur. (ms).1",
]

In [27]:
# Display the cleaner's per-column report of missing values; the rendered
# output lists one fraction per column.
cleaner.find_na()

Bearer Id                                   0.006607
Start                                       0.000007
Start ms                                    0.000007
End                                         0.000007
End ms                                      0.000007
Dur. (ms)                                   0.000007
IMSI                                        0.003800
MSISDN/Number                               0.000000
IMEI                                        0.003813
Last Location Name                          0.007687
Avg RTT DL (ms)                             0.000000
Avg RTT UL (ms)                             0.000000
Avg Bearer TP DL (kbps)                     0.000000
Avg Bearer TP UL (kbps)                     0.000000
TCP DL Retrans. Vol (Bytes)                 0.000000
TCP UL Retrans. Vol (Bytes)                 0.000000
DL TP < 50 Kbps (%)                         0.005027
50 Kbps < DL TP < 250 Kbps (%)              0.005027
250 Kbps < DL TP < 1 Mbps (%)               0.

In [28]:
# Rebind `data` to the result of the cleaner's drop_na()
# (presumably the frame with NA-containing rows removed — confirm in scripts.data_cleaner).
data = cleaner.drop_na()
# Verify the result: fraction of missing values per column.
data.isna().mean()

Bearer Id                                   0.0
Start                                       0.0
Start ms                                    0.0
End                                         0.0
End ms                                      0.0
Dur. (ms)                                   0.0
IMSI                                        0.0
MSISDN/Number                               0.0
IMEI                                        0.0
Last Location Name                          0.0
Avg RTT DL (ms)                             0.0
Avg RTT UL (ms)                             0.0
Avg Bearer TP DL (kbps)                     0.0
Avg Bearer TP UL (kbps)                     0.0
TCP DL Retrans. Vol (Bytes)                 0.0
TCP UL Retrans. Vol (Bytes)                 0.0
DL TP < 50 Kbps (%)                         0.0
50 Kbps < DL TP < 250 Kbps (%)              0.0
250 Kbps < DL TP < 1 Mbps (%)               0.0
DL TP > 1 Mbps (%)                          0.0
UL TP < 10 Kbps (%)                     

### Assign Satisfaction score to users

In [29]:
# Build the satisfaction calculator on the cleaned data; the cells below use it
# to obtain clusters and scores for experience and engagement.
satisfaction_calc = UserStatisfactionCalculator(data=data)

1) Assign experience score for users

In [30]:
# Obtain the experience clusters from the satisfaction calculator.
# The call prints "Created a normalizer" as a side effect (see the cell output).
experience_clusters = satisfaction_calc.obtain_experience_cluster()

Created a normalizer


In [31]:
# Compute the experience score for the users.
# NOTE(review): the method is spelled `claculate_experience_score` here —
# presumably matching its definition in scripts.user_satisfaction; confirm.
experience_score = satisfaction_calc.claculate_experience_score()

2) Assign engagement score for users

In [32]:
# Obtain the engagement clusters. They are stored under their own name so the
# experience clusters computed earlier in the notebook are not overwritten.
engagement_clusters = satisfaction_calc.obtain_engagement_cluster()

Created a normalizer


In [33]:
# Compute the engagement score. It is stored under its own name so the
# experience score assigned earlier in the notebook stays available.
engagement_score = satisfaction_calc.calculate_engagement_score()