<a href="https://colab.research.google.com/github/Subbu-chowdary/ShopSift/blob/main/ShopSift.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary packages
!pip install numpy pandas faker plotly scikit-learn tqdm

# Now you can run your script
import numpy as np
import pandas as pd
from faker import Faker
import random
import plotly.express as px
from itertools import combinations
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import warnings
from tqdm import tqdm

# Hide DeprecationWarning messages so they do not clutter the cell output.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Seed NumPy's global RNG and the standard-library RNG with a fixed value.
np.random.seed(32)
random.seed(32)

# Shared Faker instance; generate_data() uses it to invent product names.
# NOTE(review): Faker is not seeded here, so the generated names presumably
# vary between runs even though the two RNGs above are seeded — confirm.
fake = Faker()

def generate_data(num_products=10, num_customers=100, num_transactions=500):
    """Build a synthetic customer-by-product purchase matrix.

    Product names are drawn from the module-level ``fake`` instance. Each
    transaction picks a random customer id in ``1..num_customers`` and a
    basket of up to five entries sampled from the product list.

    Args:
        num_products: Number of product names to generate.
        num_customers: Upper bound (inclusive) for the random customer ids.
        num_transactions: Number of baskets to simulate.

    Returns:
        A DataFrame indexed by ``customer_id`` with one column per product
        name. A cell is 1 when the customer has at least one basket holding
        that product and 0 otherwise. Customers that never appear in a
        transaction have no row.
    """
    # NOTE(review): names are not de-duplicated; if fake.word() repeats a
    # word the matrix will have fewer than num_products columns.
    products = [fake.word() for _ in range(num_products)]

    # random.sample() raises ValueError when asked for more items than the
    # population holds, so never request a basket larger than the catalogue.
    max_basket_size = min(5, len(products))

    transactions = []
    for _ in range(num_transactions):
        # Keep the draw order (customer, size, basket) so seeded runs with
        # five or more products reproduce the same data as before.
        customer_id = random.randint(1, num_customers)
        basket_size = random.randint(1, max_basket_size)
        basket = random.sample(products, basket_size)
        transactions.append({
            'customer_id': customer_id,
            'products': basket
        })

    df = pd.DataFrame(transactions)
    # One row per (customer, product) pair, then pivot to a 0/1 matrix: the
    # constant aggfunc marks any purchase as 1, fill_value marks absence as 0.
    df_encoded = df.explode('products').pivot_table(
        index='customer_id',
        columns='products',
        aggfunc=lambda x: 1,
        fill_value=0
    )
    return df_encoded

def simple_apriori(df, min_support=0.1, min_confidence=0.5, max_rules=10):
    """Mine association rules from a one-hot purchase matrix.

    Item sets are enumerated by increasing size; every item set whose
    support reaches ``min_support`` is split into antecedent/consequent
    pairs, and pairs whose confidence reaches ``min_confidence`` become
    rules.

    Args:
        df: DataFrame with one row per basket/customer and one 0/1 column
            per item. Column labels must be strings (they are joined with
            commas in the output).
        min_support: Minimum fraction of rows that must contain every item
            of an item set.
        min_confidence: Minimum ``support(item_set) / support(antecedent)``.
        max_rules: Stop after collecting this many rules (in enumeration
            order, before sorting). ``None`` disables the cap.

    Returns:
        A DataFrame with columns ``antecedent``, ``consequent``, ``support``,
        ``confidence`` and ``lift``, sorted by ``lift`` descending. The frame
        is empty (but still has those columns) when no rule qualifies.
    """
    rule_columns = ['antecedent', 'consequent', 'support', 'confidence', 'lift']

    # Use column order rather than a set so that enumeration order, the
    # rules kept under the cap, and the joined names are the same every run.
    items = list(dict.fromkeys(df.columns))
    support_cache = {}

    def support(item_set):
        """Return the fraction of rows in which every item of the set is 1."""
        key = frozenset(item_set)
        if key not in support_cache:
            columns = list(key)
            support_cache[key] = (df[columns].sum(axis=1) == len(columns)).mean()
        return support_cache[key]

    def candidate_rules():
        """Yield qualifying rules lazily, smallest item sets first."""
        for k in range(2, len(items) + 1):
            frequent = [s for s in combinations(items, k) if support(s) >= min_support]
            if not frequent:
                # With 0/1 data a superset is never more frequent than its
                # subsets, so larger sizes cannot produce anything either.
                return
            for item_set in frequent:
                set_support = support(item_set)
                for size in range(1, k):
                    for antecedent in combinations(item_set, size):
                        antecedent_support = support(antecedent)
                        if not antecedent_support > 0:
                            # Confidence is undefined; such a rule could
                            # never pass the threshold anyway.
                            continue
                        confidence = set_support / antecedent_support
                        if not confidence >= min_confidence:
                            continue
                        consequent = tuple(i for i in item_set if i not in antecedent)
                        yield {
                            'antecedent': ','.join(antecedent),
                            'consequent': ','.join(consequent),
                            'support': set_support,
                            'confidence': confidence,
                            'lift': confidence / support(consequent),
                        }

    rules = []
    for rule in candidate_rules():
        # A single exit point enforces the cap exactly, regardless of how
        # deeply nested the enumeration is.
        if max_rules is not None and len(rules) >= max_rules:
            break
        rules.append(rule)

    # Passing the column names keeps 'lift' present even with zero rules, so
    # sorting an empty result does not raise KeyError.
    return pd.DataFrame(rules, columns=rule_columns).sort_values('lift', ascending=False)

def perform_kmeans_with_progress(df, n_clusters=3, update_interval=5):
    """Run k-means one iteration at a time, yielding labels along the way.

    The features are standardised, then the model is advanced a single
    iteration per step by re-seeding each step with the previous step's
    cluster centres. This replaces refitting the whole model from scratch
    on every pass, which made every "intermediate" result identical to the
    final one.

    Args:
        df: Feature matrix (rows are samples) accepted by StandardScaler.
        n_clusters: Number of clusters to fit.
        update_interval: Yield the current labels every this many
            iterations (iteration 0 included). Must be at least 1.

    Yields:
        The label array after iterations 0, ``update_interval``,
        ``2 * update_interval``, ... and, last of all, the final labels if
        they were not already yielded on the stopping iteration. The last
        yielded value is therefore always the final assignment.

    Returns:
        The final label array (available as ``StopIteration.value`` or via
        ``yield from``).

    Raises:
        ValueError: If ``update_interval`` is smaller than 1 (raised when
            the generator is first advanced).
    """
    if update_interval < 1:
        raise ValueError("update_interval must be a positive integer")

    max_iter = 100
    features = StandardScaler().fit_transform(df)

    # First step uses k-means++ seeding; later steps start from the centres
    # reached so far, so consecutive steps form one continuous run.
    init = "k-means++"
    labels = None
    final_yielded = False

    with tqdm(total=max_iter, desc="k-means Clustering") as pbar:
        for i in range(max_iter):
            step = KMeans(
                n_clusters=n_clusters,
                init=init,
                n_init=1,
                max_iter=1,
                random_state=32,
            )
            step.fit(features)
            pbar.update(1)

            # Unchanged assignments mean the centres will not move again.
            converged = labels is not None and np.array_equal(step.labels_, labels)
            labels = step.labels_
            init = step.cluster_centers_

            final_yielded = i % update_interval == 0
            if final_yielded:
                yield labels
            if converged:
                break

    # A generator's return value is invisible to a plain for-loop, so make
    # sure the final assignment is also delivered as a yielded value.
    if not final_yielded:
        yield labels
    return labels

def visualize_apriori_rules(rules, top_n=10):
    """Plot the first ``top_n`` rules in 3-D and save the plot as HTML.

    Support, confidence and lift form the three axes; lift also drives the
    marker colour and support the marker size. Hovering shows the rule's
    antecedent and consequent.

    Args:
        rules: DataFrame with ``antecedent``, ``consequent``, ``support``,
            ``confidence`` and ``lift`` columns.
        top_n: Number of leading rows of ``rules`` to plot.

    Returns:
        The Plotly figure, which is also written to ``apriori3d.html``.
    """
    axis_labels = {"support": "Support", "confidence": "Confidence", "lift": "Lift"}
    chart_title = f"Top {top_n} Association Rules"

    figure = px.scatter_3d(
        rules.head(top_n),
        x="support",
        y="confidence",
        z="lift",
        color="lift",
        size="support",
        hover_name="antecedent",
        hover_data={"consequent": True},
        labels=axis_labels,
        title=chart_title,
    )

    print("Generating Apriori HTML visualization...")
    figure.write_html("apriori3d.html")
    print("Apriori rules visualization saved as 'apriori3d.html'.")
    return figure

def visualize_kmeans_clusters(df, cluster_labels):
    """Plot samples in 3-D PCA space, coloured by cluster label.

    Args:
        df: Feature matrix (rows are samples) to project with PCA.
        cluster_labels: One label per row of ``df``, used as marker colour.

    Returns:
        A Plotly 3-D scatter figure titled "Customer Cluster Visualization".
    """
    # PCA cannot extract more components than min(n_samples, n_features),
    # so clamp the request instead of failing on small inputs.
    n_components = min(3, *np.asarray(df).shape)
    pca_result = PCA(n_components=n_components).fit_transform(df)

    if n_components < 3:
        # Fill the missing axes with zeros so the 3-D plot still works.
        pca_result = np.pad(pca_result, ((0, 0), (0, 3 - n_components)))

    fig = px.scatter_3d(
        x=pca_result[:, 0], y=pca_result[:, 1], z=pca_result[:, 2],
        color=cluster_labels,
        title="Customer Cluster Visualization"
    )
    return fig

def main():
    """Run the demo: generate data, mine rules, cluster, write HTML plots.

    Writes ``apriori3d.html`` (when rules are found), one
    ``customer_clusters_3d_step_<i>.html`` per label update received from
    the clustering generator, and ``customer_cluster_3d_final.html``.
    """
    print("Gathering synthetic data....")
    df_encoded = generate_data(num_products=10, num_customers=100, num_transactions=500)
    print(f"Dataset shape: {df_encoded.shape}")

    print("Performing Apriori algorithm...")
    rules = simple_apriori(df_encoded, min_support=0.1, min_confidence=0.5)

    if not rules.empty:
        print(f"Apriori algorithm complete, found {len(rules)} rules")
        visualize_apriori_rules(rules)
    else:
        print("No rules found with the given support and confidence levels.")

    print("Performing k-means clustering...")
    # Single source of truth for the interval: it is passed to the generator
    # and used in the progress message, so the two cannot drift apart.
    update_interval = 5
    kmeans_generator = perform_kmeans_with_progress(
        df_encoded, n_clusters=3, update_interval=update_interval
    )

    final_labels = None
    for i, labels in enumerate(kmeans_generator):
        print(f"k-means iteration {i * update_interval}")
        viz = visualize_kmeans_clusters(df_encoded, labels)
        html_filename = f"customer_clusters_3d_step_{i}.html"
        viz.write_html(html_filename)
        print(f"Intermediate visualization saved as '{html_filename}'")
        # The most recent update is the best available final assignment.
        final_labels = labels

    print("k-means clustering completed.")
    if final_labels is None:
        # Nothing was yielded, so there is nothing to plot; avoid a NameError.
        print("k-means clustering produced no labels; skipping final visualization.")
    else:
        final_viz = visualize_kmeans_clusters(df_encoded, final_labels)
        final_viz.write_html("customer_cluster_3d_final.html")
        print("Final customer cluster visualization saved as 'customer_cluster_3d_final.html'.")
    print("Analysis completed.")

# Run the demo only when this module is the entry point, not when imported.
if __name__ == "__main__":
    main()


Collecting faker
  Downloading Faker-28.1.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-28.1.0-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-28.1.0
Gathering synthetic data....
Dataset shape: (99, 10)
Performing Apriori algorithm...
Apriori algorithm complete, found 50 rules
Generating Apriori HTML visualization...
Apriori rules visualization saved as 'apriori3d.html'.
Performing k-means clustering...




k-means Clustering:   1%|          | 1/100 [00:00<00:15,  6.50it/s]

k-means iteration 0










k-means Clustering:   5%|▌         | 5/100 [00:00<00:09, 10.47it/s]

Intermediate visualization saved as 'customer_clusters_3d_step_0.html'
k-means clustering completed.





Final customer cluster visualization saved as 'customer_cluster_3d_final.html'.
Analysis completed.
