In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools

# Apply matplotlib's built-in 'ggplot' style to every figure drawn below.
plt.style.use('ggplot')

# Path of the input CSV handed to load_and_preprocess(). As a bare filename it
# is resolved relative to the current working directory.
FILENAME = 'mobile_game_inapp_purchases.csv'

In [None]:
def load_and_preprocess(filepath):
    """Load the purchases CSV, clean it, and build one attribute basket per row.

    Processing steps:
      1. Read ``filepath`` with ``pd.read_csv``.
      2. Drop rows missing any of Age, AverageSessionLength, Gender, Device
         or SpendingSegment.
      3. Derive ``Age_Group`` (fixed brackets: <18, 18-24, 25-34, 35+) and
         ``Session_Bin`` (terciles of AverageSessionLength).
      4. Convert every row into a list of ``"<column>=<value>"`` strings,
         skipping columns whose value is missing for that row.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file.

    Returns
    -------
    tuple
        ``(df_clean, transactions)`` where ``df_clean`` is the cleaned
        DataFrame and ``transactions`` is a list of baskets (lists of str).
        ``(None, None)`` if the file does not exist. If no row survives
        cleaning, ``(df_clean, [])`` is returned and the derived columns are
        not added.
    """
    print("--- Step 1 & 2: Loading and Preprocessing ---")

    try:
        df = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"Error: '{filepath}' not found.")
        return None, None
    print(f"Original Dataset Shape: {df.shape}")

    # .copy() makes df_clean an independent frame so the column assignments
    # below do not trigger SettingWithCopyWarning.
    required_cols = ['Age', 'AverageSessionLength', 'Gender', 'Device', 'SpendingSegment']
    df_clean = df.dropna(subset=required_cols).copy()
    print(f"Shape after cleaning: {df_clean.shape}")

    if df_clean.empty:
        # Tercile binning needs data: with no rows every quantile edge is NaN
        # and pd.qcut raises. Return an empty basket list instead.
        print("No complete rows left after cleaning.")
        return df_clean, []

    # Feature Engineering
    # right=False makes the bins left-closed: [0,18), [18,25), [25,35), [35,inf),
    # which is what the labels promise. The default right-closed bins would put
    # an 18-year-old in 'Age:<18'. The open upper edge keeps ages above 100 in
    # 'Age:35+' instead of turning them into NaN.
    df_clean['Age_Group'] = pd.cut(
        df_clean['Age'],
        bins=[0, 18, 25, 35, np.inf],
        right=False,
        labels=['Age:<18', 'Age:18-24', 'Age:25-34', 'Age:35+'],
    )

    df_clean['Session_Bin'] = pd.qcut(
        df_clean['AverageSessionLength'],
        q=3,
        labels=['Session:Short', 'Session:Medium', 'Session:Long'],
    )

    # Create Transaction Baskets
    basket_cols = ['Device', 'Gender', 'SpendingSegment', 'PaymentMethod', 'GameGenre', 'Age_Group', 'Session_Bin']

    # itertuples yields plain value tuples, avoiding the per-row Series that
    # iterrows() builds. Missing values are skipped rather than emitted as
    # "col=nan" items.
    transactions = [
        [f"{col}={value}" for col, value in zip(basket_cols, values) if pd.notna(value)]
        for values in df_clean[basket_cols].itertuples(index=False, name=None)
    ]

    print(f"Dataset loaded. Total Transactions: {len(transactions)}")
    return df_clean, transactions

In [None]:
def perform_eda(transactions):
    """Draw three exploratory plots for a list of item baskets.

    1. Horizontal bar chart of the 15 most frequent items.
    2. Histogram of basket sizes (items per basket).
    3. Heatmap of pairwise co-occurrence counts among the 10 most frequent
       items. The diagonal stays 0 unless a basket repeats an item, because
       only pairs of distinct basket positions are counted.

    Parameters
    ----------
    transactions : list of list of str
        One basket of item labels per user.

    Returns
    -------
    None
        Figures are shown with ``plt.show()``. If there are no items at all
        a message is printed and nothing is plotted.
    """
    print("\n--- Step 3: Data Exploration (EDA) ---")

    all_items = [item for sublist in transactions for item in sublist]
    if not all_items:
        # Without this guard, min()/max() of an empty list and plotting an
        # empty Series would raise further down.
        print("No items to explore.")
        return

    item_counts = pd.Series(all_items).value_counts()

    # Plot 1: Item Frequency
    plt.figure(figsize=(10, 6))
    # sort_values() puts the most frequent item at the top of the barh chart.
    item_counts.head(15).sort_values().plot(kind='barh', color='teal')
    plt.title('Top 15 Most Frequent Attributes')
    plt.xlabel('Frequency')
    plt.tight_layout()
    plt.show()

    # Plot 2: Basket Size Distribution
    transaction_sizes = [len(t) for t in transactions]
    plt.figure(figsize=(8, 5))
    # One integer-wide bin per basket size; "+ 2" so the largest size gets its
    # own bin (range() excludes its end, and hist needs a closing edge).
    plt.hist(transaction_sizes, bins=range(min(transaction_sizes), max(transaction_sizes) + 2),
             align='left', rwidth=0.8, color='orange')
    plt.title('Distribution of Basket Sizes')
    plt.xlabel('Number of Items per User')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.show()

    # Plot 3: Co-occurrence Heatmap
    print("Generating Heatmap...")
    top_10 = item_counts.head(10).index.tolist()

    # Accumulate in a plain integer array and wrap it in a DataFrame once;
    # per-pair DataFrame.loc writes are far slower and give the same counts.
    position = {item: idx for idx, item in enumerate(top_10)}
    counts = np.zeros((len(top_10), len(top_10)), dtype=int)
    for basket in transactions:
        hits = [position[item] for item in basket if item in position]
        for i, j in itertools.combinations(hits, 2):
            counts[i, j] += 1
            counts[j, i] += 1  # keep the matrix symmetric
    matrix = pd.DataFrame(counts, index=top_10, columns=top_10)

    plt.figure(figsize=(10, 8))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
    plt.title('Co-occurrence Heatmap (Top 10 Items)')
    plt.tight_layout()
    plt.show()

In [None]:
def generate_association_rules(transactions, min_support=0.03, min_confidence=0.20,
                               output_path='association_rules.csv'):
    """Mine single-antecedent association rules (A -> B) from item baskets.

    Frequent single items and then frequent pairs are counted (two-level
    Apriori-style pruning). For every frequent pair both directions are
    evaluated and kept when confidence reaches ``min_confidence``.

    Metrics per rule, each rounded to 4 decimals:
      * Support    = P(A and B)
      * Confidence = P(B | A)
      * Lift       = Confidence / P(B)
      * Leverage   = P(A and B) - P(A) * P(B)
      * Conviction = (1 - P(B)) / (1 - Confidence), ``inf`` when Confidence is 1

    Parameters
    ----------
    transactions : list of list of hashable
        One basket per transaction. Repeated items inside a basket are
        counted once.
    min_support : float, default 0.03
        Minimum fraction of transactions an item or pair must appear in.
    min_confidence : float, default 0.20
        Minimum confidence for a rule to be reported.
    output_path : str, path-like or None, default 'association_rules.csv'
        Where the rules are written as CSV. ``None`` skips writing.

    Returns
    -------
    pandas.DataFrame
        Rules sorted by Lift, descending. Empty (but with the metric
        columns) when no rule qualifies.
    """
    print("\n--- Step 4: Association Rule Mining ---")

    columns = ['Antecedent', 'Consequent', 'Support', 'Confidence', 'Lift', 'Leverage', 'Conviction']
    N = len(transactions)

    # Support is "share of transactions containing the item", so a repeated
    # item must count once per basket. dict.fromkeys de-duplicates while
    # keeping first-seen order, which keeps the output deterministic.
    baskets = [list(dict.fromkeys(basket)) for basket in transactions]

    # Frequent 1-Itemsets
    item_counts = {}
    for basket in baskets:
        for item in basket:
            item_counts[item] = item_counts.get(item, 0) + 1
    frequent_items = {k: v for k, v in item_counts.items() if v / N >= min_support}

    # Frequent 2-Itemsets. Only frequent items can form a frequent pair.
    # Sorting gives each pair a single canonical (A, B) key.
    pair_counts = {}
    for basket in baskets:
        basket_freq = sorted(item for item in basket if item in frequent_items)
        for pair in itertools.combinations(basket_freq, 2):
            pair_counts[pair] = pair_counts.get(pair, 0) + 1
    frequent_pairs = {k: v for k, v in pair_counts.items() if v / N >= min_support}

    # Generate Rules & Metrics
    rules = []
    for (item_A, item_B), count_AB in frequent_pairs.items():
        support_AB = count_AB / N
        for antecedent, consequent in ((item_A, item_B), (item_B, item_A)):
            count_ant = frequent_items[antecedent]
            supp_ant = count_ant / N
            supp_cons = frequent_items[consequent] / N

            # Divide the integer counts directly: a ratio of two already
            # rounded floats can land a hair off the true value and flip the
            # threshold comparison.
            confidence = count_AB / count_ant
            if confidence < min_confidence:
                continue

            lift = confidence / supp_cons
            leverage = support_AB - (supp_ant * supp_cons)
            # Confidence is exactly 1 when every antecedent basket also holds
            # the consequent; conviction's denominator would then be zero.
            if count_AB == count_ant:
                conviction = float('inf')
            else:
                conviction = (1 - supp_cons) / (1 - confidence)

            rules.append({
                'Antecedent': antecedent,
                'Consequent': consequent,
                'Support': round(support_AB, 4),
                'Confidence': round(confidence, 4),
                'Lift': round(lift, 4),
                'Leverage': round(leverage, 4),
                'Conviction': round(conviction, 4)
            })

    if not rules:
        print("No rules found. Try lowering thresholds.")
        # Keep the column names so callers can still select the metric columns.
        return pd.DataFrame(columns=columns)

    rules_df = pd.DataFrame(rules, columns=columns).sort_values(by='Lift', ascending=False)

    if output_path is not None:
        rules_df.to_csv(output_path, index=False)
        print(f"Success: {len(rules_df)} rules found. Saved to '{output_path}'.")
    else:
        print(f"Success: {len(rules_df)} rules found.")
    return rules_df

In [None]:
def plot_metrics(rules_df):
    """Scatter-plot the rules: support on x, confidence on y, lift as colour and marker size.

    Parameters
    ----------
    rules_df : pandas.DataFrame
        Must provide 'Support', 'Confidence' and 'Lift' columns.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    print("\n--- Step 5: Evaluation Visualization ---")

    # Lift drives both visual encodings, so strong rules are both brighter and larger.
    scatter_options = dict(
        x="Support",
        y="Confidence",
        hue="Lift",
        size="Lift",
        sizes=(20, 200),
        palette="viridis",
    )

    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=rules_df, **scatter_options)

    plt.xlabel('Support (Frequency)')
    plt.ylabel('Confidence (Reliability)')
    plt.title('Association Rules: Support vs Confidence (Color = Lift)')

    # Anchor the legend outside the axes, to the right, so it does not cover points.
    plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1))
    plt.tight_layout()
    plt.show()

In [None]:
# Stage 1: read the CSV and build one attribute basket per row.
df, transactions = load_and_preprocess(FILENAME)

# load_and_preprocess returns (None, None) when the file is missing; a falsy
# `transactions` skips every remaining stage.
if transactions:
    # Stage 2: exploratory plots.
    perform_eda(transactions)

    # Stage 3: mine pairwise association rules.
    rules_df = generate_association_rules(
        transactions,
        min_support=0.03,
        min_confidence=0.20,
    )

    # Stage 4: chart the rules and list the strongest ones, if any were found.
    if not rules_df.empty:
        plot_metrics(rules_df)
        print("\nTop 10 Rules by Lift:")
        print(
            rules_df
            .head(10)
            .to_string(index=False)
        )