## User Analysis Overview

***Importing Important Libraries***

In [1]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Make the project's `src` directory importable. The path is resolved relative
# to the current working directory, so the notebook must be run from a folder
# that sits next to `src`.
src_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
sys.path.insert(0, src_dir)

# Evict cached copies of the two local modules imported below so that re-running
# this cell picks up edits made to their source files. `pop(..., None)` is a
# no-op when a module has not been imported yet, and each module is evicted
# under the same name it is imported by.
for _module_name in ('load_data', 'user_eda'):
    sys.modules.pop(_module_name, None)

from load_data import DataLoader
from user_eda import UserOverview

### Load data from PostgreSQL and create a UserOverview Object

In [None]:
# Fetch the full xDR table through DataLoader and pass it straight to the
# loader's cleaning step.
loader = DataLoader()
cleaned_df = loader.clean_data(
    loader.load_data("SELECT * FROM public.xdr_data")
)

# The rest of the notebook refers to the cleaned frame as `df`.
df = cleaned_df
user_eda = UserOverview(df)

### Summary Statistics

In [None]:
# Delegate to UserOverview.describe_dataset(); the result is kept in
# `statistics` and shown as this cell's output.
statistics = user_eda.describe_dataset()
statistics

### Top Handsets and Manufacturers


In [None]:
# Presumably renders the most common handset types; the implementation lives
# in src/user_eda.py (not shown here) — confirm.
user_eda.plot_top_handset_types()

In [None]:
# Presumably renders the most common handset manufacturers; the implementation
# lives in src/user_eda.py (not shown here) — confirm.
user_eda.plot_top_handset_manufacturers()

In [None]:
# Presumably renders the top handsets within each manufacturer; the
# implementation lives in src/user_eda.py (not shown here) — confirm.
user_eda.plot_top_handsets_per_manufacturer()

### xDR Aggregates 

In [None]:
columns_to_aggregate = ['Bearer Id', 'Dur.(s)', 'Total UL (Bytes)', 'Total DL (Bytes)']

# 'Bearer Id' is only counted; every other column gets the same four summary
# statistics. Because some entries are lists, the result has two-level column
# labels such as ('Dur.(s)', 'mean').
summary_stats = ['min', 'max', 'mean', 'sum']
agg_spec = {
    'Bearer Id': 'count',
    **{col: summary_stats for col in columns_to_aggregate[1:]},
}

# One row per 'MSISDN/Number'; reset_index() turns the group key back into a column.
grouped = df.groupby('MSISDN/Number')[columns_to_aggregate]
sessions = grouped.agg(agg_spec).reset_index()


sessions


In [None]:
# Output label -> two-level column key in `sessions` to average across users.
mean_sources = {
    'Mean Dur.(s)': ('Dur.(s)', 'mean'),
    'Mean Total UL (Bytes)': ('Total UL (Bytes)', 'mean'),
    'Mean Total DL (Bytes)': ('Total DL (Bytes)', 'mean'),
    'Mean Bearer Id Count': ('Bearer Id', 'count'),
}

# Average each per-user statistic over all users, as a plain float rounded to
# two decimals.
mean_values = {
    label: round(float(sessions[column_key].mean()), 2)
    for label, column_key in mean_sources.items()
}

print(mean_values)



In [None]:
# Traffic columns to total per user.
# NOTE(review): `columns_media` was used here without being defined in any
# earlier cell, so a fresh "Restart & Run All" failed with NameError. It is
# derived here from the column names: every "... DL (Bytes)" / "... UL (Bytes)"
# counter except the overall "Total ..." columns. Confirm this matches the
# intended column list, or replace it with an explicit list.
columns_media = [
    col for col in df.columns
    if str(col).endswith(('DL (Bytes)', 'UL (Bytes)'))
    and not str(col).startswith('Total')
]

# Aggregates per user: one row per 'MSISDN/Number'.
aggregates_per_users = df.groupby('MSISDN/Number').agg({
    # Total session duration
    'Dur.(s)': 'sum',

    # Sum every column listed in columns_media
    **{col: 'sum' for col in columns_media}
}).reset_index()
                 

In [None]:
# Columns from columns_media whose name mentions DL or UL; their row-wise sum
# becomes each user's total data volume.
volume_columns = [col for col in columns_media if 'DL' in col or 'UL' in col]
aggregates_per_users['Total Data Volume (Bytes)'] = (
    aggregates_per_users[volume_columns].sum(axis=1)
)

# Show the first rows of the aggregated DataFrame
aggregates_per_users.head()

## Decile top 5

In [None]:
# Delegate to UserOverview.segment_and_compute_decile() and keep the result;
# the plotting cell below reads its 'Decile', 'Total DL (Bytes)',
# 'Total UL (Bytes)' and 'Total Data (Bytes)' entries.
decile_aggregates = user_eda.segment_and_compute_decile()
# print() shows the plain-text repr rather than the notebook's rich display.
print(decile_aggregates)

In [None]:
# Grouped bar chart: three side-by-side bars (DL, UL, total) per decile row.
bar_width = 0.25
opacity = 0.8

# One group per row of decile_aggregates, anchored at integer x positions.
positions = range(len(decile_aggregates))

# (column, colour) for each bar series, left to right within a group.
series = [
    ('Total DL (Bytes)', 'b'),
    ('Total UL (Bytes)', 'g'),
    ('Total Data (Bytes)', 'r'),
]

fig, ax = plt.subplots(figsize=(10, 6))
for slot, (column, colour) in enumerate(series):
    # Shift each series right by one bar width so the bars do not overlap.
    ax.bar(
        [pos + slot * bar_width for pos in positions],
        decile_aggregates[column],
        bar_width,
        alpha=opacity,
        color=colour,
        label=column,
    )

ax.set_xlabel('Decile')
ax.set_ylabel('Bytes')
ax.set_title('Total Bytes by Decile')
# Centre each tick under the middle bar of its group.
ax.set_xticks([pos + bar_width for pos in positions])
ax.set_xticklabels(decile_aggregates['Decile'])
ax.legend()

fig.tight_layout()
plt.show()

### Univariate Analysis Non-Graphical

In [None]:
# Delegate to UserOverview.univariate_analysis(); the result is kept in
# `uvt_analysis` and shown as this cell's output.
uvt_analysis = user_eda.univariate_analysis()
uvt_analysis

### Univariate Analysis Graphical

In [None]:
# Presumably draws the per-variable plots; the implementation lives in
# src/user_eda.py (not shown here) — confirm.
user_eda.graphical_univariate_analysis()

### Bivariate Analysis

In [None]:
# Delegate to UserOverview.bivariate_analysis(); the result is kept in
# `bvt_analysis` and shown as this cell's output.
bvt_analysis = user_eda.bivariate_analysis()
bvt_analysis

### Correlation Matrix

In [None]:
# Keep whatever UserOverview.plot_correlation_matrix() returns in `corr_matrix`.
# Because the call is assigned, the cell does not echo the return value.
corr_matrix = user_eda.plot_correlation_matrix()

### PCA Analysis

In [None]:
# Delegate to UserOverview.pca_analysis(); the result is kept in
# `pca_analysis` and shown as this cell's output.
pca_analysis = user_eda.pca_analysis()
pca_analysis