### Add scripts path to the notebook

In [1]:
import sys
import os

# Locate the project directories relative to the notebook's working directory.
current_dir = os.getcwd()
print(current_dir)

# The parent of the working directory is treated as the project root; the
# `scripts` directory is expected to sit directly inside it.
parent_dir = os.path.dirname(current_dir)
scripts_path = os.path.join(parent_dir, 'scripts')

# Prepend each directory at most once, so re-running this cell does not pile
# up duplicate sys.path entries. On a fresh kernel the scripts directory ends
# up ahead of the parent directory, as before. The former trailing
# `sys.path.append(os.path.abspath(os.path.join('..')))` is dropped because it
# resolves to the same directory as `parent_dir`.
for import_dir in (parent_dir, scripts_path):
    if import_dir not in sys.path:
        sys.path.insert(0, import_dir)

d:\KifiyaAIM-Course\Week - 2\TellCo-Profitability-Opportunity-Analysis\notebooks


### Import statements

In [2]:
import math
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt


from scripts.database_client import DB_Client
from scripts.data_cleaner import DataCleaner

### Initialize DB_Client

In [3]:
# Read the database connection settings from environment variables.
# A variable that is not set yields None. The `passowrd` spelling is kept
# on purpose: the DB_Client cell refers to that exact name.
host, user_name, passowrd, port, database = (
    os.getenv(env_key)
    for env_key in ("DB_HOST", "DB_USER", "DB_PASSWORD", "DB_PORT", "DB_NAME")
)

In [4]:
# Build the database client from the connection settings read from the
# environment. `passowrd` (sic) is the variable name assigned in the cell
# above; it is passed as the `password` keyword argument.
# NOTE(review): os.getenv returns strings (or None when unset), so `port` is
# not an int here and any setting may be None — confirm DB_Client handles that.
db_client = DB_Client(
    host=host,
    user_name=user_name,
    password=passowrd,
    port=port,
    database_name=database
)

### Load data from the database

In [5]:
# Fetch the dataset through the client's dump_data() method and keep it as `data`.
# NOTE(review): the cell output echoes a `pd.read_sql_query(...)` source line,
# which looks like a pandas warning raised inside the client — confirm and
# address it in scripts/database_client.py.
data = db_client.dump_data()

  response = pd.read_sql_query(sql=query, con=self.connection)


### Data Cleaning

1) Initialize the cleaner class

In [6]:
# Wrap the loaded data in the project's DataCleaner helper.
# NOTE(review): presumably the cleaner holds a reference to `data` rather than
# a copy — verify in scripts/data_cleaner.py, since later cells write back into `data`.
cleaner = DataCleaner(data=data)

2) Compute the fraction of missing (NA) values in each column

In [7]:
# Display the cleaner's missing-value report.
# Judging by the output, this is the fraction of NA entries per column —
# TODO confirm against DataCleaner.find_na.
cleaner.find_na()

Bearer Id                                   0.006607
Start                                       0.000007
Start ms                                    0.000007
End                                         0.000007
End ms                                      0.000007
Dur. (ms)                                   0.000007
IMSI                                        0.003800
MSISDN/Number                               0.007107
IMEI                                        0.003813
Last Location Name                          0.007687
Avg RTT DL (ms)                             0.185525
Avg RTT UL (ms)                             0.185412
Avg Bearer TP DL (kbps)                     0.000007
Avg Bearer TP UL (kbps)                     0.000007
TCP DL Retrans. Vol (Bytes)                 0.587636
TCP UL Retrans. Vol (Bytes)                 0.644322
DL TP < 50 Kbps (%)                         0.005027
50 Kbps < DL TP < 250 Kbps (%)              0.005027
250 Kbps < DL TP < 1 Mbps (%)               0.

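3) Define the columns of interest, then replace their NA values: the categorical columns ("MSISDN/Number", "Handset Type") are filled with the column mode, and the numeric columns are filled with the column mean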

In [10]:
# Columns used in the analysis that follows.
columns_of_interest = [
    "MSISDN/Number",
    "Avg RTT DL (ms)",
    "Avg RTT UL (ms)",
    "TCP DL Retrans. Vol (Bytes)",
    "TCP UL Retrans. Vol (Bytes)",
    "Handset Type",
    "Avg Bearer TP DL (kbps)",
    "Avg Bearer TP UL (kbps)",
]

# Columns whose NA values are replaced with the column mode.
mode_columns = ["MSISDN/Number", "Handset Type"]

# Every remaining column of interest has its NA values replaced with the mean.
mean_columns = list(
    filter(lambda name: name not in mode_columns, columns_of_interest)
)

In [11]:
# Fill NA values in the mode-designated columns (`mode_columns`) and write the
# result back into the same columns of `data`.
# NOTE(review): assumes fill_na returns the filled columns aligned with
# `mode_columns` and with the index of `data` — confirm in DataCleaner.fill_na.
data[mode_columns] = cleaner.fill_na(columns=mode_columns, method='mode')

In [12]:
# Fill NA values in the mean-designated columns (`mean_columns`) and write the
# result back into the same columns of `data`.
# NOTE(review): assumes fill_na returns the filled columns aligned with
# `mean_columns` and with the index of `data` — confirm in DataCleaner.fill_na.
data[mean_columns] = cleaner.fill_na(columns=mean_columns, method='mean')

In [14]:
# Fraction of missing values remaining in each column of interest
# (mean-filled columns first, then mode-filled columns).
data[mean_columns + mode_columns].isna().mean()

Avg RTT DL (ms)                0.0
Avg RTT UL (ms)                0.0
TCP DL Retrans. Vol (Bytes)    0.0
TCP UL Retrans. Vol (Bytes)    0.0
Avg Bearer TP DL (kbps)        0.0
Avg Bearer TP UL (kbps)        0.0
MSISDN/Number                  0.0
Handset Type                   0.0
dtype: float64

Every column of interest now reports a missing-value fraction of 0.0, so the cleaning is complete for these columns.

### Aggregate the columns of interest per customer ("MSISDN/Number")