In [18]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import numpy as np

# Assuming df_xdr is the DataFrame containing the dataset
df_xdr['Total DL (Bytes)'] = pd.to_numeric(df_xdr['Total DL (Bytes)'], errors='coerce').fillna(0)
df_xdr['Total UL (Bytes)'] = pd.to_numeric(df_xdr['Total UL (Bytes)'], errors='coerce').fillna(0)
df_xdr['Dur. (ms)'] = pd.to_numeric(df_xdr['Dur. (ms)'], errors='coerce').fillna(df_xdr['Dur. (ms)'].mean())

# Convert session duration from milliseconds to seconds
df_xdr['Dur. (s)'] = df_xdr['Dur. (ms)'] / 1000.0

# Calculate total data transferred (DL + UL)
df_xdr['Total Data (Bytes)'] = df_xdr['Total DL (Bytes)'] + df_xdr['Total UL (Bytes)']

# Calculate throughput in bytes per second
df_xdr['Throughput (Bps)'] = df_xdr['Total Data (Bytes)'] / df_xdr['Dur. (s)']
# Aggregate the required information per customer
user_experience = df_xdr.groupby('MSISDN/Number').agg({
    'TCP DL Retrans. Vol (Bytes)': 'mean', #Average TCP DL retransmission
    'Avg RTT DL (ms)': 'mean',                 # Average RTT DL

    'Handset Type': lambda x: x.mode()[0],  # Mode of Handset Type
    'Throughput (Bps)': 'mean'           # Average throughput
}).reset_index()

# Handle missing values by replacing with column mean or mode
user_experience.fillna({
    'TCP DL Retrans. Vol (Bytes)': user_experience['TCP DL Retrans. Vol (Bytes)'].mean(),
    'Avg RTT DL (ms)': user_experience['Avg RTT DL (ms)'].mean(),
    'Throughput (Bps)': user_experience['Throughput (Bps)'].mean(),
}, inplace=True)

# Checking for outliers and replacing with mean
for col in ['TCP DL Retrans. Vol (Bytes)', 'Avg RTT DL (ms)','Throughput (Bps)']:
    col_mean = user_experience[col].mean()
    col_std = user_experience[col].std()
    user_experience[col] = np.where(
        (user_experience[col] < col_mean - 3 * col_std) | (user_experience[col] > col_mean + 3 * col_std),
        col_mean,
        user_experience[col]
    )

print("\nAggregated User Experience Data:")
print(user_experience.head())



Aggregated User Experience Data:
   MSISDN/Number  TCP DL Retrans. Vol (Bytes)  Avg RTT DL (ms)  \
0   3.360100e+10                 1.685339e+07        46.000000   
1   3.360100e+10                 1.685339e+07        30.000000   
2   3.360100e+10                 1.685339e+07       119.182869   
3   3.360101e+10                 1.066000e+03        69.000000   
4   3.360101e+10                 9.349630e+06        57.000000   

                     Handset Type  Throughput (Bps)  
0  Huawei P20 Lite Huawei Nova 3E      7.528192e+06  
1          Apple iPhone 7 (A1778)      8.655280e+05  
2                       undefined      4.415580e+06  
3         Apple iPhone 5S (A1457)      8.467074e+06  
4         Apple iPhone Se (A1723)      8.437485e+06  


In [19]:
# Compute TCP values
top_10_tcp_dl = df_xdr['TCP DL Retrans. Vol (Bytes)'].nlargest(10)
bottom_10_tcp_dl = df_xdr['TCP DL Retrans. Vol (Bytes)'].nsmallest(10)
most_frequent_tcp_dl = df_xdr['TCP DL Retrans. Vol (Bytes)'].mode().head(10)

print("\nTop 10 TCP DL Values:")
print(top_10_tcp_dl)

print("\nBottom 10 TCP DL Values:")
print(bottom_10_tcp_dl)

print("\nMost Frequent TCP DL Values:")
print(most_frequent_tcp_dl)



Top 10 TCP DL Values:
77979     4.294426e+09
227991    4.294426e+09
377989    4.294426e+09
135690    4.291380e+09
285689    4.291380e+09
435682    4.291380e+09
34645     4.289877e+09
184664    4.289877e+09
334663    4.289877e+09
140825    4.289488e+09
Name: TCP DL Retrans. Vol (Bytes), dtype: float64

Bottom 10 TCP DL Values:
59717     2.0
125101    2.0
209730    2.0
275103    2.0
359731    2.0
425099    2.0
2850      4.0
39610     4.0
52488     4.0
60376     4.0
Name: TCP DL Retrans. Vol (Bytes), dtype: float64

Most Frequent TCP DL Values:
0    1330.0
Name: TCP DL Retrans. Vol (Bytes), dtype: float64


In [20]:
# Compute RTT values using the DL data
top_10_rtt = df_xdr['Avg RTT DL (ms)'].nlargest(10)
bottom_10_rtt = df_xdr['Avg RTT DL (ms)'].nsmallest(10)
most_frequent_rtt = df_xdr['Avg RTT DL (ms)'].mode().head(10)

print("\nTop 10 RTT Values:")
print(top_10_rtt)

print("\nBottom 10 RTT Values:")
print(bottom_10_rtt)

print("\nMost Frequent RTT Values:")
print(most_frequent_rtt)



Top 10 RTT Values:
30166     96923.0
180186    96923.0
330187    96923.0
29927     64640.0
179947    64640.0
329948    64640.0
17910     55811.0
167932    55811.0
317931    55811.0
5989      54847.0
Name: Avg RTT DL (ms), dtype: float64

Bottom 10 RTT Values:
42612     0.0
52225     0.0
60152     0.0
61152     0.0
61345     0.0
103328    0.0
124551    0.0
142538    0.0
143890    0.0
192631    0.0
Name: Avg RTT DL (ms), dtype: float64

Most Frequent RTT Values:
0    28.0
Name: Avg RTT DL (ms), dtype: float64


In [22]:
# Compute Throughput values
top_10_throughput = df_xdr['Throughput (Bps)'].nlargest(10)
bottom_10_throughput = df_xdr['Throughput (Bps)'].nsmallest(10)
most_frequent_throughput = df_xdr['Throughput (Bps)'].mode().head(10)

print("\nTop 10 Throughput Values:")
print(top_10_throughput)

print("\nBottom 10 Throughput Values:")
print(bottom_10_throughput)

print("\nMost Frequent Throughput Values:")
print(most_frequent_throughput)



Top 10 Throughput Values:
24994     1.200441e+08
175014    1.200441e+08
325016    1.200441e+08
24692     1.171029e+08
174712    1.171029e+08
324714    1.171029e+08
24695     1.152741e+08
174715    1.152741e+08
324717    1.152741e+08
24376     1.136658e+08
Name: Throughput (Bps), dtype: float64

Bottom 10 Throughput Values:
150013        0.000000
300013        0.000000
450002        0.000000
100015    54775.569490
250026    54775.569490
400023    54775.569490
25026     83790.653188
175046    83790.653188
325048    83790.653188
25074     89025.857083
Name: Throughput (Bps), dtype: float64

Most Frequent Throughput Values:
0         0.000000
1     54775.569490
2     83790.653188
3     89025.857083
4     91426.180336
5     93560.778325
6     94473.518004
7     97026.648884
8    101108.476266
9    103263.812872
Name: Throughput (Bps), dtype: float64


In [40]:
# Aggregate average throughput per handset type
throughput_per_handset = df_xdr.groupby('Handset Type').agg({'Throughput (Bps)': 'mean'}).reset_index()

print("\nAverage Throughput Per Handset Type:")
print(throughput_per_handset)



Average Throughput Per Handset Type:
                                           Handset Type  Throughput (Bps)
0                            A-Link Telecom I. Cubot A5      5.598306e+07
1                     A-Link Telecom I. Cubot Note Plus      4.897298e+07
2                        A-Link Telecom I. Cubot Note S      1.683681e+07
3                          A-Link Telecom I. Cubot Nova      8.343153e+06
4                         A-Link Telecom I. Cubot Power      1.115560e+07
...                                                 ...               ...
1391  Zte Zte Blade C2 Smartphone Android By Sfr Sta...      4.659338e+06
1392                          Zyxel Communicat. Lte7460      7.861414e+06
1393                          Zyxel Communicat. Sbg3600      7.600364e+06
1394                    Zyxel Communicat. Zyxel Wah7706      1.259624e+06
1395                                          undefined      6.871043e+06

[1396 rows x 2 columns]


In [38]:
# Aggregate average TCP retransmission per handset type
tcp_per_handset = df_xdr.groupby('Handset Type').agg({'TCP DL Retrans. Vol (Bytes)': 'mean'}).reset_index()
print("\nAverage TCP Retransmission Per Handset Type:")
print(tcp_per_handset)



Average TCP Retransmission Per Handset Type:
                                           Handset Type  \
0                            A-Link Telecom I. Cubot A5   
1                     A-Link Telecom I. Cubot Note Plus   
2                        A-Link Telecom I. Cubot Note S   
3                          A-Link Telecom I. Cubot Nova   
4                         A-Link Telecom I. Cubot Power   
...                                                 ...   
1391  Zte Zte Blade C2 Smartphone Android By Sfr Sta...   
1392                          Zyxel Communicat. Lte7460   
1393                          Zyxel Communicat. Sbg3600   
1394                    Zyxel Communicat. Zyxel Wah7706   
1395                                          undefined   

      TCP DL Retrans. Vol (Bytes)  
0                             NaN  
1                    6.023490e+05  
2                    4.134448e+07  
3                    1.358400e+05  
4                    7.799000e+03  
...                          

In [41]:
# Prepare data for clustering
features = ['TCP DL Retrans. Vol (Bytes)', 'Avg RTT DL (ms)','Throughput (Bps)']
scaler = StandardScaler()
experience_data_normalized = scaler.fit_transform(user_experience[features])

# Run K-Means clustering
kmeans = KMeans(n_clusters=3, random_state=0)
user_experience['Cluster'] = kmeans.fit_predict(experience_data_normalized)

# Compute statistics for each cluster
cluster_description = user_experience.groupby('Cluster').agg({
    'TCP DL Retrans. Vol (Bytes)': ['mean', 'std'],
    'Avg RTT DL (ms)': ['mean', 'std'],
    'Throughput (Bps)': ['mean', 'std']
}).reset_index()

print("\nCluster Descriptions:")
print(cluster_description)



Cluster Descriptions:
  Cluster TCP DL Retrans. Vol (Bytes)               Avg RTT DL (ms)  \
                                 mean           std            mean   
0       0                1.390743e+06  2.301436e+06       77.821307   
1       1                1.918680e+07  1.672211e+07       72.591274   
2       2                6.426496e+06  8.890338e+06      212.939956   

              Throughput (Bps)                
          std             mean           std  
0   75.445496     6.883530e+06  3.545170e+06  
1   57.556819     4.629620e+06  3.004231e+06  
2  342.176491     2.182357e+07  7.601144e+06  


In [3]:
# Import necessary libraries
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import plotly.express as px
import pandas as pd


# Aggregate data for plotting
throughput_per_handset = df_xdr.groupby('Handset Type').agg({'Throughput (Bps)': 'mean'}).reset_index()
top_handsets = df_xdr['Handset Type'].value_counts().head(10).reset_index()
top_handsets.columns = ['Handset Type', 'Count']

# Initialize Dash app
app = dash.Dash(__name__)

# Define the layout of the dashboard
app.layout = html.Div([
    html.H1('Telecom Data Insights Dashboard', style={'textAlign': 'center', 'color': '#1f77b4'}),

    html.Div([
        html.H2('Average Throughput by Handset Type', style={'marginBottom': '0'}),
        dcc.Graph(
            id='throughput-handset',
            figure=px.bar(
                throughput_per_handset,
                x='Handset Type',
                y='Throughput (Bps)',
                title='Average Throughput by Handset Type',
                labels={'Handset Type': 'Handset Type', 'Throughput (Bps)': 'Average Throughput (Bps)'}
            ).update_layout(
                xaxis_title='Handset Type',
                yaxis_title='Average Throughput (Bps)',
                xaxis_tickangle=-45,
                title_font_size=20,
                xaxis_title_font_size=14,
                yaxis_title_font_size=14
            )
        )
    ], style={'padding': '10px'}),

    html.Div([
        html.H2('Top 10 Handsets Used by Customers', style={'marginBottom': '0'}),
        dcc.Graph(
            id='top-handsets',
            figure=px.bar(
                top_handsets,
                x='Handset Type',
                y='Count',
                title='Top 10 Handsets Used by Customers',
                labels={'Handset Type': 'Handset Type', 'Count': 'Count'}
            ).update_layout(
                xaxis_title='Handset Type',
                yaxis_title='Count',
                xaxis_tickangle=-45,
                title_font_size=20,
                xaxis_title_font_size=14,
                yaxis_title_font_size=14
            )
        )
    ], style={'padding': '10px'}),

    html.Div([
        html.H2('Interactive Elements', style={'marginBottom': '0'}),
        html.Label('Select Handset Type:', style={'fontSize': '16px'}),
        dcc.Dropdown(
            id='handset-dropdown',
            options=[{'label': i, 'value': i} for i in throughput_per_handset['Handset Type']],
            value=throughput_per_handset['Handset Type'].iloc[0],
            style={'width': '50%'}
        ),
        dcc.Graph(id='selected-handset-throughput')
    ], style={'padding': '10px'})
])

# Define callback to update graph based on dropdown selection
@app.callback(
    Output('selected-handset-throughput', 'figure'),
    [Input('handset-dropdown', 'value')]
)
def update_graph(selected_handset):
    filtered_df = throughput_per_handset[throughput_per_handset['Handset Type'] == selected_handset]
    return px.bar(
        filtered_df,
        x='Handset Type',
        y='Throughput (Bps)',
        title=f'Average Throughput for {selected_handset}',
        labels={'Handset Type': 'Handset Type', 'Throughput (Bps)': 'Average Throughput (Bps)'}
    ).update_layout(
        xaxis_title='Handset Type',
        yaxis_title='Average Throughput (Bps)',
        title_font_size=20,
        xaxis_title_font_size=14,
        yaxis_title_font_size=14
    )

# Run the app
if __name__ == '__main__':
    app.run_server(debug=True)


KeyError: 'Handset Type'