In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import statistics

In [None]:
# Directory containing the input files; read_traces() prepends it to the trace filename.
DATA_DIR = "./data/"
# Presumably the destination for generated artifacts; not referenced in the cells shown here.
OUT_DIR = "./out/"

# Filename of the app-usage trace; concatenated onto DATA_DIR when the file is opened.
file_app_usages = "App_usage_trace.txt"

In [None]:
def read_traces(path=None):
    """Load the app-usage trace file into an integer NumPy array.

    The first line of the file is treated as a header and skipped. Every
    remaining non-blank line must consist of whitespace-separated integers;
    blank lines (e.g. a trailing newline at the end of the file) are ignored
    instead of producing an empty, ragged row.

    Args:
        path: Optional path of the trace file. When omitted, the path is
            ``DATA_DIR + file_app_usages``, resolved at call time.

    Returns:
        ``np.ndarray`` of integer dtype with one row per data line. A file
        that has no data lines yields an empty array.

    Raises:
        ValueError: if a field cannot be parsed as an integer or the rows
            do not all have the same number of fields.
    """
    if path is None:
        # Resolve lazily so the module-level configuration can be changed
        # after this function has been defined.
        path = DATA_DIR + file_app_usages

    rows = []
    with open(path) as fin:
        next(fin, None)  # skip the header line (no-op on an empty file)
        for line in fin:
            fields = line.split()
            if not fields:
                # A blank line would otherwise add a zero-length row and make
                # the final array conversion fail.
                continue
            rows.append([int(field) for field in fields])

    return np.array(rows, dtype="int")

In [None]:
 # Load the raw trace and wrap it in a DataFrame with one named column per field.
 TRACE_COLUMNS = [
     "User_ID",
     "Timestamp",
     "BaseStation_ID",
     "App_ID",
     "TrafficInBytes",
 ]
 data = read_traces()
 df = pd.DataFrame(data, columns=TRACE_COLUMNS)
 # Last expression of the cell: summary statistics of every column.
 df.describe()

Unnamed: 0,User_ID,Timestamp,BaseStation_ID,App_ID,TrafficInBytes
count,4171949.0,4171949.0,4171949.0,4171949.0,4171949.0
mean,689.5785,20160420000000.0,5029.56,206.4782,16236.34
std,291.5102,1970077.0,2923.542,370.332,129097.8
min,0.0,20160420000000.0,0.0,1.0,112.0
25%,531.0,20160420000000.0,2419.0,2.0,1159.0
50%,778.0,20160420000000.0,5007.0,29.0,2054.0
75%,942.0,20160430000000.0,7639.0,237.0,6214.0
max,999.0,20160430000000.0,9849.0,2000.0,24575790.0


# Check the significance of combinations of features

In [None]:
# Feature combinations whose ability to pin down the used app is compared.
comb_features = [
    ["Timestamp", "BaseStation_ID", "TrafficInBytes"],
    ["Timestamp", "BaseStation_ID"],
    ["Timestamp", "TrafficInBytes"],
    ["BaseStation_ID", "TrafficInBytes"],
]
for features in comb_features:
    print(features)
    # Relative frequency of every app within each group of identical feature
    # values. normalize=True divides the per-app counts by the group size, so
    # the frame is grouped only once.
    res = df.groupby(features)["App_ID"].value_counts(normalize=True)

    # Unweighted mean of those shares over all (group, app) pairs. The shares
    # of one group sum to 1, so the score is 1.0 exactly when every group
    # contains a single app. Computed vectorised instead of in a Python loop.
    print(res.mean())

['Timestamp', 'BaseStation_ID', 'TrafficInBytes']
0.9987998415129236
['Timestamp', 'BaseStation_ID']
0.890128037451361
['Timestamp', 'TrafficInBytes']
0.9970888979409638
['BaseStation_ID', 'TrafficInBytes']
0.8658570959418311


# Some statistics about the distribution of apps and users

In [None]:
# Number of rows recorded for each app.
entries_per_app = df.groupby("App_ID").count()

print(f"Mean entries per app: {round(entries_per_app.mean()['Timestamp'], 2)}")

# Number of rows recorded for each user.
entries_per_users = df.groupby("User_ID").count()
mean_entries = entries_per_users["Timestamp"].mean()
max_entries = entries_per_users["Timestamp"].max()
print(f"Mean entries per user: {round(mean_entries, 2)}")
print(f"Max entries per user: {max_entries}")

# Number of DISTINCT apps each user has used. Counting (User_ID, App_ID)
# value pairs instead would give the number of rows per user-app pair,
# which is not what the labels below describe.
apps_per_user = df.groupby("User_ID")["App_ID"].nunique()
mean_apps = apps_per_user.mean()
min_apps = apps_per_user.min()
max_apps = apps_per_user.max()
print(f"Mean apps per user: {round(mean_apps,2)}")
print(f"Min apps per user: {min_apps}")
print(f"Max apps per user: {max_apps}")

Mean entries per app: 2459.88
Mean entries per user: 4789.84
Max entries per user: 1098748
Mean apps per user: 77.7
Min apps per user: 1
Max apps per user: 289258


# Analyzing the 90% quantile of the most popular apps
Calculate which apps have the most rows.<br>

Get the 90% quantile of the top apps.

In [None]:
# Number of rows per app, most frequent first. Selecting with a column list
# keeps this a DataFrame.value_counts() result, i.e. a Series whose index is
# a one-level MultiIndex of App_ID.
counted_apps = df.loc[:, ["App_ID"]].value_counts()
print(f"10 most common apps:\n\n {counted_apps.head(10)}\n")

# Row-count threshold that separates the most frequently used apps
# (at or above the q-quantile of per-app row counts) from the rest.
q = 0.9
quantile = np.quantile(counted_apps.values, q)
print(f"Quantile threshold: {quantile}")

# Keep only the apps whose row count reaches the threshold.
croped = counted_apps.loc[counted_apps.values >= quantile, :]
# Each index entry is a 1-tuple (App_ID,); unpack it to a plain list of ids.
top_apps = [i[0] for i in croped.index]
# q is a fraction, so format it as a percentage ("90%") rather than "0.9%".
print(f"# Top Apps in {q:.0%}-Quantile: {len(top_apps)}")

10 most common apps:

 App_ID
2         898308
1         363187
5         215898
10        105502
4          85708
9          83435
77         54087
29         52220
23         48687
237        46349
dtype: int64

Quantile threshold: 2752.5
# Top Apps in 0.9%-Quantile: 170


In [None]:
# Restrict the dataset to rows whose app is one of the top apps.
is_top_app = df["App_ID"].isin(top_apps)
df_top_apps = df.loc[is_top_app]

# Report how many rows the filter removed and how many are left.
n_remaining = len(df_top_apps)
n_discarded = len(df) - n_remaining
print(f"Discarded rows: {n_discarded}")
print(f"Remaining Rows: {n_remaining}")

Discarded rows: 781982
Remaining Rows: 3389967


# Comparison of unique values of features

In [None]:
def _print_unique_counts(frame):
    """Print and return the number of distinct apps, users and base stations in `frame`.

    Args:
        frame: DataFrame with the columns "App_ID", "User_ID" and "BaseStation_ID".

    Returns:
        Tuple ``(n_apps, n_users, n_base_stations)``.
    """
    n_apps = frame["App_ID"].nunique()
    print(f"Unique Apps: {n_apps}")

    n_users = frame["User_ID"].nunique()
    print(f"User: {n_users}")

    n_base_stations = frame["BaseStation_ID"].nunique()
    print(f"Unique base stations: {n_base_stations}")

    return n_apps, n_users, n_base_stations


print("Original Dataset:")
unique_apps, unique_user, unique_bs = _print_unique_counts(df)

# Same counts for the reduced dataset. The user count is printed once; the
# previous copy-pasted block reported it twice.
print("\nTop Apps:")
unique_apps, unique_user, unique_bs = _print_unique_counts(df_top_apps)


# Rows per user within the reduced dataset.
entries_per_users = df_top_apps.groupby("User_ID").count()
mean_entries = entries_per_users["Timestamp"].mean()
max_entries = entries_per_users["Timestamp"].max()
print(f"Mean entries per user: {round(mean_entries, 2)}")
print(f"Max entries per user: {max_entries}")

# Rows per app within the reduced dataset.
entries_per_app = df_top_apps.groupby("App_ID").count()
print(f"Mean entries per app {round(entries_per_app.mean()['Timestamp'], 2)}")

Original Dataset:
Unique Apps: 1696
User: 871
Unique base stations: 6739

Top Apps:
Unique Apps: 170
User: 870
User: 870
Unique base stations: 6666
Mean entries per user: 3896.51
Max entries per user: 907624
Mean entries per app 19940.98


In [None]:
# Compare the distribution of rows-per-app (count, mean, std, min, quartiles,
# max) between the full dataset and the top-app subset.
df_apps_top = df_top_apps["App_ID"].value_counts()

original_summary = counted_apps.describe()
top_summary = df_apps_top.describe()

print("Original dataset")
print(original_summary)
print("\n")

print("Top Apps:")
print(top_summary)


Original dataset
count      1696.000000
mean       2459.875590
std       24721.818759
min           1.000000
25%         113.750000
50%         353.000000
75%         978.500000
max      898308.000000
dtype: float64


Top Apps:
count       170.000000
mean      19940.982353
std       76058.959982
min        2798.000000
25%        3807.500000
50%        6081.000000
75%       11853.250000
max      898308.000000
Name: App_ID, dtype: float64


In [None]:

# Number of distinct top apps used by each user, largest first.
agg = (
    df_top_apps[["User_ID", "App_ID"]]
    .groupby(["User_ID"])
    .nunique()
    .sort_values(by=["App_ID"], ascending=False)
)

# 0.95-quantile of the per-user distinct-app counts.
agg_quantile = np.quantile(agg.to_numpy(), 0.95)
print(agg_quantile)

# Alternative view kept for reference (rows per user-app pair):
#df_top_apps[["User_ID","App_ID"]].value_counts().index

58.0


User_ID  App_ID
942      2         289258
         1          96026
772      2          94783
942      5          61376
832      2          38939
627      213        32008
942      9          25468
         10         25392
         29         24057
         4          23782
dtype: int64