In [1]:
import pickle
import numpy as np
import pandas as pd
import math
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [3]:
sys.path.append(os.path.abspath(os.path.join('../scripts')))
from outliener import Outlier
from visualization import *

In [25]:
user_engagement_df = pd.read_csv("../data/user_engagement_data.csv")
user_engagement_df.head()

Unnamed: 0,MSISDN/Number,cluster,sessions,duration,Total Data Volume (Bytes)
0,33601000000.0,0,1.0,116720.0,878690600.0
1,33601000000.0,1,1.0,181230.0,156859600.0
2,33601000000.0,1,1.0,134969.0,595966500.0
3,33601010000.0,3,1.0,49878.0,422320700.0
4,33601010000.0,2,2.0,37104.0,1457411000.0


In [26]:
user_experince_df = pd.read_csv("../data/user_experience_data.csv")
user_experince_df.head()

Unnamed: 0,MSISDN/Number,Total Avg RTT (ms),Total Avg Bearer TP (kbps),Total TCP Retrans. Vol (Bytes),Handset Type
0,33601000000.0,46.0,76.0,2895381.0,Huawei P20 Lite Huawei Nova 3E
1,33601000000.0,31.0,99.0,9272453.0,Apple iPhone 7 (A1778)
2,33601000000.0,59.0,97.0,4150403.0,unknown
3,33601010000.0,84.0,248.0,2396.0,Apple iPhone 5S (A1457)
4,33601010000.0,119.0,56844.0,9738882.0,Apple iPhone Se (A1723)


### Engagement Score

In [7]:
eng_df = user_engagement_df.set_index('MSISDN/Number')[
    ['sessions', 'duration', 'Total Data Volume (Bytes)']]

In [9]:
# scale data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(eng_df)
scaled_data

array([[-0.54866182, -0.15663495,  0.54827382],
       [-0.54866182,  0.5847055 , -1.28048981],
       [-0.54866182,  0.05308016, -0.16800961],
       ...,
       [-0.54866182, -1.39672249,  0.13265472],
       [-0.54866182,  0.12224972, -0.46103285],
       [-0.54866182,  1.40982166, -0.16719549]])

In [10]:
normalized_data = normalize(scaled_data)
normalized_data

array([[-0.69336056, -0.19794433,  0.69287024],
       [-0.36315599,  0.38701309, -0.84754858],
       [-0.95210963,  0.09211163, -0.29155221],
       ...,
       [-0.3642029 , -0.9271474 ,  0.08805649],
       [-0.7546951 ,  0.16815689, -0.63415974],
       [-0.36047983,  0.926276  , -0.10985018]])

## Task 4.1

Engagement score to each user. Consider the engagement score as the Euclidean distance between the user data point & the less engaged cluster(use the first clustering for this)(Euclidean Distance)

In [11]:
with open("../models/user_engagement.pkl", "rb") as f:
    kmeans1 = pickle.load(f)

In [12]:
less_engaged_cluster = 3

Calculate and assign the distance between the centroid and samples

In [15]:
distance = kmeans1.fit_transform(normalized_data)
distance_from_less_engaged_cluster = list(
    map(lambda x: x[less_engaged_cluster], distance))
user_engagement_df['engagement_score'] = distance_from_less_engaged_cluster
user_engagement_df.head()

Unnamed: 0,MSISDN/Number,cluster,sessions,duration,Total Data Volume (Bytes),engagement_score
0,33601000000.0,0,1.0,116720.0,878690600.0,1.397666
1,33601000000.0,1,1.0,181230.0,156859600.0,0.94919
2,33601000000.0,1,1.0,134969.0,595966500.0,0.873277
3,33601010000.0,3,1.0,49878.0,422320700.0,0.2615
4,33601010000.0,2,2.0,37104.0,1457411000.0,1.688294


#### Experience Score

experience score to each user. Consider the experience score as the Euclidean distance between the user data point & the worst experience’s cluster.

In [27]:
exp_df = user_experince_df.set_index('MSISDN/Number')[
     ['Total Avg RTT (ms)', 'Total Avg Bearer TP (kbps)', 'Total TCP Retrans. Vol (Bytes)']]

In [35]:
# scale data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(exp_df)
pd.DataFrame(scaled_data).head()

Unnamed: 0,0,1,2
0,-0.170156,-0.494272,-0.108602
1,-0.191909,-0.493732,-0.079508
2,-0.151304,-0.493779,-0.102876
3,-0.11505,-0.490235,-0.1218
4,-0.064295,0.838385,-0.07738


In [36]:
normalized_data = normalize(scaled_data)
pd.DataFrame(normalized_data).head()

Unnamed: 0,0,1,2
0,-0.318703,-0.925771,-0.203411
1,-0.358272,-0.921743,-0.148432
2,-0.28733,-0.937697,-0.195363
3,-0.222073,-0.946262,-0.235101
4,-0.076142,0.992877,-0.091639


In [37]:
with open("../models/user_experience.pkl", "rb") as f:
    kmeans2 = pickle.load(f)

In [38]:

worst_experience_cluster = 1

In [39]:
distance = kmeans2.fit_transform(normalized_data)
distance_from_worst_experience_cluster = list(
    map(lambda x: x[worst_experience_cluster], distance))
user_experince_df['experience_score'] = distance_from_worst_experience_cluster
user_experince_df.head()

Unnamed: 0,MSISDN/Number,Total Avg RTT (ms),Total Avg Bearer TP (kbps),Total TCP Retrans. Vol (Bytes),Handset Type,experience_score
0,33601000000.0,46.0,76.0,2895381.0,Huawei P20 Lite Huawei Nova 3E,0.04697
1,33601000000.0,31.0,99.0,9272453.0,Apple iPhone 7 (A1778),0.100926
2,33601000000.0,59.0,97.0,4150403.0,unknown,0.053453
3,33601010000.0,84.0,248.0,2396.0,Apple iPhone 5S (A1457),0.092104
4,33601010000.0,119.0,56844.0,9738882.0,Apple iPhone Se (A1723),1.89903


### Satisfaction Score

In [40]:
user_engagement_df.rename(columns={'cluster': 'engagement_cluster'}, inplace=True)
user_engagement_df.head()

Unnamed: 0,MSISDN/Number,engagement_cluster,sessions,duration,Total Data Volume (Bytes)
0,33601000000.0,0,1.0,116720.0,878690600.0
1,33601000000.0,1,1.0,181230.0,156859600.0
2,33601000000.0,1,1.0,134969.0,595966500.0
3,33601010000.0,3,1.0,49878.0,422320700.0
4,33601010000.0,2,2.0,37104.0,1457411000.0


In [41]:
user_experince_df.rename(columns={'cluster': 'experience_cluster'}, inplace=True)
user_experince_df.head()

Unnamed: 0,MSISDN/Number,Total Avg RTT (ms),Total Avg Bearer TP (kbps),Total TCP Retrans. Vol (Bytes),Handset Type,experience_score
0,33601000000.0,46.0,76.0,2895381.0,Huawei P20 Lite Huawei Nova 3E,0.04697
1,33601000000.0,31.0,99.0,9272453.0,Apple iPhone 7 (A1778),0.100926
2,33601000000.0,59.0,97.0,4150403.0,unknown,0.053453
3,33601010000.0,84.0,248.0,2396.0,Apple iPhone 5S (A1457),0.092104
4,33601010000.0,119.0,56844.0,9738882.0,Apple iPhone Se (A1723),1.89903


In [42]:
user_satisfaction_df = pd.merge(user_engagement_df, user_experince_df, on='MSISDN/Number')
user_satisfaction_df['satisfaction_score'] = (
    user_satisfaction_df['experience_score'] + user_satisfaction_df['experience_score'])/2
user_satisfaction_df.head()

Unnamed: 0,MSISDN/Number,engagement_cluster,sessions,duration,Total Data Volume (Bytes),Total Avg RTT (ms),Total Avg Bearer TP (kbps),Total TCP Retrans. Vol (Bytes),Handset Type,experience_score,satisfaction_score
0,33601000000.0,0,1.0,116720.0,878690600.0,46.0,76.0,2895381.0,Huawei P20 Lite Huawei Nova 3E,0.04697,0.04697
1,33601000000.0,1,1.0,181230.0,156859600.0,31.0,99.0,9272453.0,Apple iPhone 7 (A1778),0.100926,0.100926
2,33601000000.0,1,1.0,134969.0,595966500.0,59.0,97.0,4150403.0,unknown,0.053453,0.053453
3,33601010000.0,3,1.0,49878.0,422320700.0,84.0,248.0,2396.0,Apple iPhone 5S (A1457),0.092104,0.092104
4,33601010000.0,2,2.0,37104.0,1457411000.0,119.0,56844.0,9738882.0,Apple iPhone Se (A1723),1.89903,1.89903


In [43]:

user_satisfaction_df = user_satisfaction_df[['MSISDN/Number', 'experience_score',
                        'experience_score', 'satisfaction_score']]
user_satisfaction_df.set_index('MSISDN/Number', inplace=True)
user_satisfaction_df.head()

Unnamed: 0_level_0,experience_score,experience_score,satisfaction_score
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33601000000.0,0.04697,0.04697,0.04697
33601000000.0,0.100926,0.100926,0.100926
33601000000.0,0.053453,0.053453,0.053453
33601010000.0,0.092104,0.092104,0.092104
33601010000.0,1.89903,1.89903,1.89903


In [46]:
sorted_satisfied = user_satisfaction_df.sort_values('satisfaction_score', ascending=False)
top10_satisfied  =sorted_satisfied['satisfaction_score'].head(10)

In [47]:
hist(top10_satisfied)

### Linear Regression for satisfaction score prediction

In [33]:
# splitting training and testing data
X = user_satisfaction_df[['experience_score', 'experience_score']]
y = user_satisfaction_df[['satisfaction_score']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [34]:
model = LinearRegression()
model.fit(X_train, y_train)

In [35]:

y_pred = model.predict(X_test)

In [36]:
print(f'Intercept: {model.intercept_}')
print(f'Coefficients: { model.coef_}')
print(f"Mean squared error: {np.mean((y_pred - y_test.values) ** 2)}")

Intercept: [-1.33226763e-15]
Coefficients: [[0.25 0.25 0.25 0.25]]
Mean squared error: 0.0


### K-means Clusturing

In [38]:
# scale data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(user_satisfaction_df)
scaled_data

array([[ 0.4522495 ,  0.4522495 ,  0.4522495 ],
       [ 0.3606766 ,  0.3606766 ,  0.3606766 ],
       [ 0.3773041 ,  0.3773041 ,  0.3773041 ],
       [ 0.47609289,  0.47609289,  0.47609289],
       [ 0.52876857,  0.52876857,  0.52876857],
       [ 0.56612499,  0.56612499,  0.56612499],
       [-0.67649074, -0.67649074, -0.67649074],
       [ 0.31716648,  0.31716648,  0.31716648],
       [ 0.41977716,  0.41977716,  0.41977716],
       [-2.82166956, -2.82166956, -2.82166956]])

In [39]:
normalized_data = normalize(scaled_data)
normalized_data

array([[ 0.57735027,  0.57735027,  0.57735027],
       [ 0.57735027,  0.57735027,  0.57735027],
       [ 0.57735027,  0.57735027,  0.57735027],
       [ 0.57735027,  0.57735027,  0.57735027],
       [ 0.57735027,  0.57735027,  0.57735027],
       [ 0.57735027,  0.57735027,  0.57735027],
       [-0.57735027, -0.57735027, -0.57735027],
       [ 0.57735027,  0.57735027,  0.57735027],
       [ 0.57735027,  0.57735027,  0.57735027],
       [-0.57735027, -0.57735027, -0.57735027]])

In [40]:
kmeans = KMeans(n_clusters=2, random_state=1).fit(normalized_data)
kmeans.labels_

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 1])

In [41]:

user_satisfaction_df.insert(0, 'satisfaction_cluster', kmeans.labels_)
user_satisfaction_df

Unnamed: 0_level_0,satisfaction_cluster,experience_score,experience_score,satisfaction_score
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
33626320000.0,0,1.945114,1.945114,1.945114
33614890000.0,0,1.939431,1.939431,1.939431
33625780000.0,0,1.940463,1.940463,1.940463
33659730000.0,0,1.946593,1.946593,1.946593
33675880000.0,0,1.949862,1.949862,1.949862
33760540000.0,0,1.95218,1.95218,1.95218
33667160000.0,1,1.87507,1.87507,1.87507
33603130000.0,0,1.936731,1.936731,1.936731
33604520000.0,0,1.943099,1.943099,1.943099
33627080000.0,1,1.741951,1.741951,1.741951


In [42]:

user_satisfaction_df['satisfaction_cluster'].value_counts()

0    8
1    2
Name: satisfaction_cluster, dtype: int64

In [48]:
# average satisfaction & experience score per cluster
#user_satisfaction_df.groupby('satisfaction_cluster').agg(
 #   {'satisfaction_score': 'mean', 'experience_score': 'mean'})

### Export dataframe to mysql

In [50]:
engine = create_engine('mysql+pymysql://root:@localhost/tellco')

In [None]:
# save the clustering model
with open("../models/user_experience.pkl", "wb") as f:
    pickle.dump(kmeans, f)