### Add scripts path to the notebook

In [63]:
import sys
import os

# Make the project importable from inside the notebooks folder.
# The project root is taken to be the parent of the current working directory.
current_dir = os.getcwd()
print(current_dir)

# Project root and the `scripts` directory that lives directly under it
parent_dir = os.path.dirname(current_dir)
scripts_path = os.path.join(parent_dir, 'scripts')

# Put both locations at the front of the import path, each at most once.
# The original cell added the parent directory twice (an insert plus an
# append of abspath('..')) and grew sys.path again on every re-run.
for extra_path in (parent_dir, scripts_path):
    if extra_path not in sys.path:
        sys.path.insert(0, extra_path)

d:\KifiyaAIM-Course\Week - 2\TellCo-Profitability-Opportunity-Analysis\notebooks


### Import Statements

In [64]:
import math
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.cluster import KMeans
from sklearn.preprocessing import Normalizer

from scripts.database_client import DB_Client
from scripts.data_cleaner import DataCleaner
from scripts.user_satisfaction import UserStatisfactionCalculator

### Initialize DB_Client

In [65]:
# Read the database connection settings from environment variables.
# A variable that is not set yields None.
# (`passowrd` keeps its spelling because the DB_Client cell refers to it by that name.)
host, user_name, passowrd, port, database = (
    os.environ.get(env_key)
    for env_key in ("DB_HOST", "DB_USER", "DB_PASSWORD", "DB_PORT", "DB_NAME")
)

In [66]:
# Gather the connection settings read from the environment and pass them
# to the client as keyword arguments.
db_settings = {
    "host": host,
    "user_name": user_name,
    "password": passowrd,
    "port": port,
    "database_name": database,
}
db_client = DB_Client(**db_settings)

### Load the data from the database

In [67]:
# Load the dataset through the DB client and keep it as `data`.
# The line echoed in this cell's output shows the client reading via pd.read_sql_query.
data = db_client.dump_data()

  response = pd.read_sql_query(sql=query, con=self.connection)


### Clean the data

1) Initialize the data cleaner

In [68]:
# Wrap the loaded data in a DataCleaner; the cleaning cells below call
# fill_na / find_na / drop_na on this object.
cleaner = DataCleaner(data=data)

2) Clean the experience data

In [69]:
# Columns used for the experience analysis: user identifier, handset,
# and the RTT / TCP retransmission / bearer throughput measurements.
columns_of_interest = [
    "MSISDN/Number",
    "Avg RTT DL (ms)",
    "Avg RTT UL (ms)",
    "TCP DL Retrans. Vol (Bytes)",
    "TCP UL Retrans. Vol (Bytes)",
    "Handset Type",
    "Avg Bearer TP DL (kbps)",
    "Avg Bearer TP UL (kbps)",
]

# Columns whose NA values are replaced with the mode
mode_columns = ["MSISDN/Number", "Handset Type"]

# Every remaining column of interest has its NA values replaced with the mean
mean_columns = list(filter(lambda name: name not in mode_columns, columns_of_interest))

In [70]:
# Fill the NA values group by group: first the mode-filled columns,
# then the mean-filled columns (same order as before).
for fill_method, target_columns in (('mode', mode_columns), ('mean', mean_columns)):
    data[target_columns] = cleaner.fill_na(columns=target_columns, method=fill_method)

# Share of NA values left in each treated column (mean columns first, then mode columns)
checked_columns = mean_columns + mode_columns
data[checked_columns].isna().mean()

Avg RTT DL (ms)                0.0
Avg RTT UL (ms)                0.0
TCP DL Retrans. Vol (Bytes)    0.0
TCP UL Retrans. Vol (Bytes)    0.0
Avg Bearer TP DL (kbps)        0.0
Avg Bearer TP UL (kbps)        0.0
MSISDN/Number                  0.0
Handset Type                   0.0
dtype: float64

3) Clean the engagement data

In [71]:
# Session start/end/duration columns for the engagement step.
# This rebinds `columns_of_interest`, replacing the experience column list defined earlier.
columns_of_interest = [
    "Start",
    "Start ms",
    "End",
    "End ms",
    "Dur. (ms)",
    "Dur. (ms).1",
]

In [72]:
# Show the per-column NA summary held by the cleaner
# (the output lists one value per column; these look like NA fractions — confirm in DataCleaner).
cleaner.find_na()

Bearer Id                                   0.006607
Start                                       0.000007
Start ms                                    0.000007
End                                         0.000007
End ms                                      0.000007
Dur. (ms)                                   0.000007
IMSI                                        0.003800
MSISDN/Number                               0.000000
IMEI                                        0.003813
Last Location Name                          0.007687
Avg RTT DL (ms)                             0.000000
Avg RTT UL (ms)                             0.000000
Avg Bearer TP DL (kbps)                     0.000000
Avg Bearer TP UL (kbps)                     0.000000
TCP DL Retrans. Vol (Bytes)                 0.000000
TCP UL Retrans. Vol (Bytes)                 0.000000
DL TP < 50 Kbps (%)                         0.005027
50 Kbps < DL TP < 250 Kbps (%)              0.005027
250 Kbps < DL TP < 1 Mbps (%)               0.

In [73]:
# Replace `data` with the cleaner's drop_na() result
# (presumably the rows/values still missing are removed — confirm in DataCleaner).
# NOTE(review): this rebinds `data`, so later cells see the cleaned frame, not the raw load.
data = cleaner.drop_na()
# Sanity check: NA fraction per column; the output shows 0.0 for the columns listed.
data.isna().mean()

Bearer Id                                   0.0
Start                                       0.0
Start ms                                    0.0
End                                         0.0
End ms                                      0.0
Dur. (ms)                                   0.0
IMSI                                        0.0
MSISDN/Number                               0.0
IMEI                                        0.0
Last Location Name                          0.0
Avg RTT DL (ms)                             0.0
Avg RTT UL (ms)                             0.0
Avg Bearer TP DL (kbps)                     0.0
Avg Bearer TP UL (kbps)                     0.0
TCP DL Retrans. Vol (Bytes)                 0.0
TCP UL Retrans. Vol (Bytes)                 0.0
DL TP < 50 Kbps (%)                         0.0
50 Kbps < DL TP < 250 Kbps (%)              0.0
250 Kbps < DL TP < 1 Mbps (%)               0.0
DL TP > 1 Mbps (%)                          0.0
UL TP < 10 Kbps (%)                     

### Assign Satisfaction score to users

In [74]:
# Build the calculator on the cleaned data; the cluster and score cells below all go through it.
# (The class name spelling follows the import from scripts.user_satisfaction.)
satisfaction_calc = UserStatisfactionCalculator(data=data)

1) Assign experience score for users

In [75]:
# obtain the experience clusters
# (the call prints "Created a normalizer", as shown in this cell's output)
experience_clusters = satisfaction_calc.get_experience_cluster()

Created a normalizer


In [76]:
# Compute the experience scores; later cells read the 'experience_score' column of this result.
# (The method name spelling follows the calculator's API as called here.)
experience_score = satisfaction_calc.claculate_experience_score()

2) Assign engagement score for users

In [77]:
# obtain the engagement clusters
# (the call prints "Created a normalizer", as shown in this cell's output)
engagement_clusters = satisfaction_calc.get_engagement_cluster()

Created a normalizer


In [78]:
# Compute the engagement scores; later cells read the 'engagement_score' column of this result.
engagement_score = satisfaction_calc.calculate_engagement_score()

3) Calculate the satisfaction score of users using engagement score and experience score and print top 10 customers

In [79]:
# Derive the satisfaction scores from the engagement and experience score columns.
# The method and keyword spellings follow the calculator's API exactly as it was called before.
engagement_values = engagement_score['engagement_score']
experience_values = experience_score['experience_score']
satisfaction_scores = satisfaction_calc.get_satifisfaction_score(
    engagemet_score=engagement_values,
    experience_score=experience_values,
)

- Add the metrics to a new dataframe

In [85]:
# Assemble the per-user metrics table without writing into `engagement_score`.
# The original cell added the experience and satisfaction columns to the frame
# returned by the engagement step and then dropped everything else, which left
# that earlier result silently modified. Selecting the one needed column
# returns a new frame, and `.assign` adds the other two with the same index
# alignment as a column assignment would.
# Resulting columns, in order: engagement_score, experience_score, satisfaction_score.
user_data = engagement_score[['engagement_score']].assign(
    experience_score=experience_score['experience_score'],
    satisfaction_score=satisfaction_scores,
)

Print the top 10 users with regard to satisfaction score

In [91]:
# Rank users by satisfaction score, highest first, and display the first 10 rows.
user_data.sort_values(by='satisfaction_score', ascending=False).head(10)

Unnamed: 0_level_0,engagement_score,experience_score,satisfaction_score
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33626320000.0,44337290000.0,3938102.0,22170620000.0
33659550000.0,6712507000.0,2171065.0,3357339000.0
33699250000.0,5151779000.0,17328020.0,2584554000.0
33650730000.0,5155759000.0,7295040.0,2581527000.0
33661800000.0,5111951000.0,5005221.0,2558478000.0
33662580000.0,5107097000.0,3966170.0,2555531000.0
33762640000.0,4491789000.0,587998900.0,2539894000.0
33669690000.0,4948515000.0,3338826.0,2475927000.0
33658130000.0,4895647000.0,1900829.0,2448774000.0
33699170000.0,4861821000.0,15613130.0,2438717000.0


4) Apply k-means with k=2 to both the experience score and the satisfaction score

- Initialize clusterer

In [93]:
# k-means with k=2 and a fixed random_state so results are repeatable.
# n_init=50: the algorithm is run 50 times from different k-means++ seeds and the best run is kept.
clusterer = KMeans(n_clusters=2, random_state=7, init='k-means++', n_init=50)

- Classify based on experience score

In [97]:
# KMeans.fit() returns the estimator itself, so fitting the shared `clusterer`
# here would make `experience_clusters` an alias that the later satisfaction
# fit overwrites. Fit a fresh estimator carrying the same hyper-parameters.
experience_clusters = KMeans(**clusterer.get_params()).fit(
    user_data.drop(columns=['engagement_score', 'satisfaction_score'])
)

- Classify based on satisfaction score

In [100]:
# KMeans.fit() returns the estimator itself, so re-fitting the shared
# `clusterer` here would overwrite the experience model fitted on it earlier
# and leave both results pointing at one object. Fit a fresh estimator
# carrying the same hyper-parameters.
satisfaction_clusters = KMeans(**clusterer.get_params()).fit(
    user_data.drop(columns=['engagement_score', 'experience_score'])
)

5) Aggregate the average satisfaction & experience score

6) Build a regression model that can predict the satisfaction score of a user

7) Export the resulting user id and satisfaction, engagement and experience scores 