# Data Analysis
##### Analyzing the Pre-Processed Binary Encoded Data

Using standard Python and pandas methods to examine and analyze the dataset.

#### Imports

In [1]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

import pandas as pd
import matplotlib.pyplot as plt
# from mlxtend.frequent_patterns import apriori, association_rules

#### Data Loading

In [2]:
# Load the pre-processed, binary-encoded transactions.
# Replace with the actual path to your CSV.
raw_df = pd.read_csv(r"data/processed_data.csv")

# `astype(bool)` on its own does not *ensure* binary data: it silently maps NaN
# and every non-zero value to True, which would inflate the item supports,
# basket sizes and density computed from `df`. Check that every cell is
# already 0/1 (True/False compare equal to 1/0) so malformed input fails loudly.
is_binary = raw_df.eq(0) | raw_df.eq(1)
invalid_columns = is_binary.columns[~is_binary.all().to_numpy()].tolist()
if invalid_columns:
    raise ValueError(
        f"Non-binary or missing values found in columns: {invalid_columns}"
    )

# Safe to cast now; the untouched input stays available as `raw_df`.
df = raw_df.astype(bool)

print("Dataset shape:", df.shape)
print("Sample transactions:")
df.head()

Dataset shape: (500, 71)
Sample transactions:


Unnamed: 0,Age_Teen,Age_Young_Adult,Age_Adult,Age_Mid_Adult,Location_India,Location_US,Location_Other,Gender_Female,Gender_Male,Gender_Other,...,Spend_100-500,Spend_500-1000,Spend_1000plus,Spend_Unknown,Reason_Fun,Reason_Stress_Relief,Reason_Skills_Competition,Reason_Socialize,Reason_Learning,Reason_Other
0,True,False,False,False,True,False,False,False,True,False,...,False,False,False,False,True,True,False,False,False,False
1,False,True,False,False,True,False,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
2,False,True,False,False,True,False,False,False,True,False,...,True,False,False,False,True,False,False,False,False,False
3,True,False,False,False,False,False,True,True,False,False,...,False,False,False,False,True,False,False,False,False,False
4,True,False,False,False,True,False,False,True,False,False,...,False,False,False,False,True,True,True,True,False,False


#### Let:
- $ I = \{ i_1, i_2, ... , i_m \} $ be the set of all items
- $ D = \{ T_1, T_2, ... , T_N \} $ be the transaction database
- $ N = | D | $ be the number of transactions

## Analysis

### Item supports

$ \text{count}(i) = \sum_{t=1}^{N} \mathbf{1}(i \in T_t) $

$ \text{support}(i) = \frac{\text{count}(i)}{N} $

In [3]:
# Per-item frequency (column sum of the boolean matrix) and support (column
# mean), ordered from the most to the least frequent item.
freq_support = pd.DataFrame(
    {"Frequency": df.sum(), "Support": df.mean()}
).sort_values("Frequency", ascending=False)

# Take each end of the ranking and move the item names out of the index into
# a labelled column ("index" is the name reset_index gives an unnamed index).
top_10 = (
    freq_support
    .head(10)
    .reset_index()
    .rename(columns={"index": "Top 10 Items"})
)
bottom_10 = (
    freq_support
    .tail(10)
    .reset_index()
    .rename(columns={"index": "Bottom 10 Items"})
)

# Place the two tables side by side; after reset_index both carry a fresh
# positional index, so rows line up by rank position.
combined = pd.concat([top_10, bottom_10], axis="columns")

combined

Unnamed: 0,Top 10 Items,Frequency,Support,Bottom 10 Items,Frequency.1,Support.1
0,Reason_Fun,334,0.668,Favorite_Game_chess,20,0.04
1,Genre_Action/Adventure,298,0.596,Favorite_Game_clash_of_clans,20,0.04
2,Device_Mobile,294,0.588,Reason_Other,15,0.03
3,Gender_Female,268,0.536,Reason_Learning,12,0.024
4,Location_India,265,0.53,Gaming_Hours_0-1_hour,12,0.024
5,Reason_Stress_Relief,264,0.528,Device_Handheld,10,0.02
6,Discovery_Social_Media,263,0.526,Gender_Other,0,0.0
7,Game_Mode_Both,263,0.526,Favorite_Game_unknown,0,0.0
8,Genre_FPS,238,0.476,Game_Mode_Unknown,0,0.0
9,Gender_Male,232,0.464,Spend_Unknown,0,0.0


### Transaction lengths

$ \text{transaction length}(T_t) = \sum_{j=1}^{m} X_{tj} $, where $ X_{tj} = 1 $ if item $ i_j \in T_t $ and $ 0 $ otherwise

In [4]:
# Row-wise sum of the boolean matrix: the number of items set in each transaction.
transaction_lengths = df.sum(axis="columns")

# Summary statistics of those lengths, shown as a one-column table.
basket_size = (
    transaction_lengths
    .describe()
    .to_frame(name="Transaction Length/Basket Size")
)

basket_size

Unnamed: 0,Transaction Length/Basket Size
count,500.0
mean,15.668
std,2.552412
min,12.0
25%,14.0
50%,15.0
75%,17.0
max,24.0


### Sparsity/Density of the Dataset

$ \text{density} = \frac{\text{total ones}}{N \times m} $

In [5]:
# Density = share of cells in the transaction-by-item matrix that are set.
total_entries = df.size  # rows * columns
total_ones = df.to_numpy().sum()

density = total_ones / total_entries
print("Density:", density)

Density: 0.22067605633802817
