In [None]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.43.2-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.43.2-py2.py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler ,MinMaxScaler
from datetime import datetime
import streamlit as st


In [None]:
# Colab-only step: prompt for a file upload so the CSV is available to the
# cells below (load_data reads "data.csv" from the working directory).
from google.colab import files
# `uploaded` holds whatever files.upload() returns; it is not used again in
# the cells visible here.
uploaded = files.upload()


Saving data.csv to data.csv


In [None]:
# Load Data
def load_data(path="data.csv"):
    """Read the transactions CSV and keep only usable rows.

    Rows are kept when they have a non-null ``CustomerID`` and a strictly
    positive ``Quantity``.

    Parameters
    ----------
    path : str, default "data.csv"
        Location of the CSV file. It is decoded as latin1.

    Returns
    -------
    pandas.DataFrame
        The filtered rows with their original index labels, as an
        independent copy so callers can add columns without triggering
        pandas' chained-assignment (SettingWithCopy) warning.
    """
    df = pd.read_csv(path, encoding='latin1')
    usable = df['CustomerID'].notnull() & (df['Quantity'] > 0)
    # .copy() detaches the result from the frame it was sliced from.
    return df.loc[usable].copy()

In [None]:
# Preprocess Data
def preprocess_data(df):
    """Add line totals, parse invoice dates and derive the snapshot date.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Quantity``, ``UnitPrice`` and ``InvoiceDate`` columns.
        The frame passed in is left unmodified.

    Returns
    -------
    tuple
        ``(processed, snapshot_date)`` where ``processed`` is a new frame with
        a ``TotalPrice`` column (``Quantity * UnitPrice``) and ``InvoiceDate``
        converted with ``pd.to_datetime``, and ``snapshot_date`` is one day
        after the latest invoice date.
    """
    # assign() builds a new frame, so the caller's data (possibly a filtered
    # slice of a larger frame) is never written to.
    processed = df.assign(
        TotalPrice=df['Quantity'] * df['UnitPrice'],
        InvoiceDate=pd.to_datetime(df['InvoiceDate']),
    )
    snapshot_date = processed['InvoiceDate'].max() + pd.DateOffset(days=1)
    return processed, snapshot_date


In [None]:
# Calculate RFM Scores
def calculate_rfm(df, snapshot_date):
    """Aggregate transactions into one Recency/Frequency/Monetary row per customer.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``CustomerID``, ``InvoiceDate`` (datetime), ``InvoiceNo`` and
        ``TotalPrice`` columns.
    snapshot_date : datetime-like
        Reference date that recency is measured from.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``CustomerID`` with columns ``Recency`` (whole days between
        ``snapshot_date`` and the customer's latest invoice), ``Frequency``
        (number of distinct invoice numbers) and ``Monetary`` (sum of
        ``TotalPrice``).
    """
    def days_since_last_invoice(invoice_dates):
        return (snapshot_date - invoice_dates.max()).days

    per_customer = df.groupby('CustomerID')
    return per_customer.agg(
        Recency=('InvoiceDate', days_since_last_invoice),
        Frequency=('InvoiceNo', 'nunique'),
        Monetary=('TotalPrice', 'sum'),
    )

In [None]:
# Function to compute RFM scores (1-5 scale)
def _quintile_score(values, labels):
    """Bin ``values`` into five quantile buckets labelled with ``labels``.

    The values are cut directly when possible. If heavy ties make two quantile
    edges coincide, ``pd.qcut`` raises ``ValueError``; in that case the values
    are ranked with ``method="first"`` (the same approach the Frequency score
    uses) so the edges are distinct, at the cost of splitting tied values
    across neighbouring buckets by row order.
    """
    try:
        return pd.qcut(values, 5, labels=labels)
    except ValueError:
        return pd.qcut(values.rank(method="first"), 5, labels=labels)


def compute_rfm_scores(df):
    """Attach 1-5 quintile scores for Recency, Frequency and Monetary.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Recency``, ``Frequency`` and ``Monetary`` columns.
        The frame is modified in place (four columns are added).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``Recency_Score`` (lowest recency bucket
        gets 5), ``Frequency_Score`` and ``Monetary_Score`` (highest bucket
        gets 5) as categoricals, plus ``RFM_Score``, the integer sum of the
        three scores.

    Raises
    ------
    ValueError
        If even the ranked fallback cannot produce five distinct bin edges
        (for example a single-row frame).
    """
    ascending = [1, 2, 3, 4, 5]
    descending = [5, 4, 3, 2, 1]

    df["Recency_Score"] = _quintile_score(df["Recency"], descending)
    # Frequency is dominated by ties, so it is always binned on its rank.
    df["Frequency_Score"] = pd.qcut(df["Frequency"].rank(method="first"), 5, labels=ascending)
    df["Monetary_Score"] = _quintile_score(df["Monetary"], ascending)

    df["RFM_Score"] = (
        df["Recency_Score"].astype(int)
        + df["Frequency_Score"].astype(int)
        + df["Monetary_Score"].astype(int)
    )
    return df


In [None]:
# Function to classify customers into segments
def categorize_rfm_segments(df):
    """Label each customer with a named segment based on its RFM scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Recency_Score``, ``Frequency_Score``, ``Monetary_Score``
        and ``RFM_Score``. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with a ``Segment`` column. Rules are evaluated
        top to bottom and the first matching rule wins; rows matching no rule
        get ``"Others"``.
    """
    # The score columns may be ordered categoricals. Recency_Score is built
    # with labels [5, 4, 3, 2, 1], so its category order is 5 < 4 < ... < 1 and
    # a direct `>= 4` would compare by category position, not by value.
    # Casting to int makes every threshold below a plain numeric comparison.
    recency = df["Recency_Score"].astype(int)
    frequency = df["Frequency_Score"].astype(int)
    monetary = df["Monetary_Score"].astype(int)
    total = df["RFM_Score"]

    # NOTE(review): "At-Risk Customers" is a strict subset of the earlier
    # "About To Sleep" rule, so it can never be selected; the last two rules
    # also look shadowed when RFM_Score is the sum of three 1-5 scores.
    # Order is kept as written - confirm the intended priority before
    # reordering.
    rules = [
        ("Champions", total >= 12),
        ("Loyal Customers", (total >= 10) & (recency >= 4)),
        ("Potential Loyalist", total >= 9),
        ("Recent Customers", (recency >= 4) & (frequency <= 2)),
        ("Promising Customers", (recency >= 3) & (monetary <= 2)),
        ("Needs Attention", total >= 6),
        ("About To Sleep", (recency <= 3) & (frequency <= 3)),
        ("At-Risk Customers", (recency <= 2) & (frequency <= 2)),
        ("Can’t Lose Them", recency == 1),
        ("Hibernating", total <= 4),
    ]
    choices = [segment for segment, _ in rules]
    conditions = [mask for _, mask in rules]

    df["Segment"] = np.select(conditions, choices, default="Others")
    return df




In [None]:
# Normalize Data
import pickle
def normalize_rfm(rfm):
    """Min-max scale the RFM table and persist the fitted scaler.

    A ``MinMaxScaler`` is fitted on ``rfm`` and pickled to ``scaler.pkl`` in
    the working directory.

    Parameters
    ----------
    rfm : array-like or pandas.DataFrame
        The values to scale.

    Returns
    -------
    The scaled values as returned by ``MinMaxScaler.fit_transform``; any
    index or column labels on ``rfm`` are not carried over by this function.
    """
    min_max = MinMaxScaler()
    scaled_values = min_max.fit_transform(rfm)

    # Persist the fitted scaler next to the notebook.
    with open("scaler.pkl", "wb") as scaler_file:
        pickle.dump(min_max, scaler_file)

    return scaled_values


In [None]:
# Determine Optimal Clusters using Elbow Method
def find_optimal_clusters(rfm_scaled):
    """Plot K-Means inertia for k = 1..10 so the elbow can be read off.

    One ``KMeans`` model (``random_state=42``) is fitted per candidate k on
    ``rfm_scaled``; the inertias are drawn as a line chart. Nothing is
    returned.
    """
    candidate_ks = range(1, 11)
    inertia = [
        KMeans(n_clusters=k, random_state=42).fit(rfm_scaled).inertia_
        for k in candidate_ks
    ]

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(candidate_ks, inertia, marker='o')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Inertia')
    ax.set_title('Elbow Method')
    plt.show()

In [None]:
# Apply K-Means Clustering
def apply_kmeans(rfm_scaled, num_clusters=4):
    """Cluster the scaled RFM rows with K-Means and return the labels.

    Parameters
    ----------
    rfm_scaled : array-like
        Feature matrix to cluster.
    num_clusters : int, default 4
        Number of clusters to fit.

    Returns
    -------
    The cluster label for each row, as produced by ``KMeans.fit_predict``
    with ``random_state=42``.
    """
    model = KMeans(random_state=42, n_clusters=num_clusters)
    return model.fit_predict(rfm_scaled)


In [None]:

# Save processed RFM data
def save_rfm_data(rfm):
    """Write ``rfm`` to ``rfm_data.csv`` in the working directory, index included."""
    output_path = "rfm_data.csv"
    rfm.to_csv(output_path, index=True)
# Run the functions
df = load_data()
df, snapshot_date = preprocess_data(df)
rfm = calculate_rfm(df, snapshot_date)
rfm_scaled = normalize_rfm(rfm)
# normalize_rfm returns a bare array, so reattach the CustomerID index here;
# without it the score/segment CSVs would be keyed by 0..n-1 and could not be
# joined back to customers.
rfm_scaled = pd.DataFrame(
    rfm_scaled,
    columns=['Recency', 'Frequency', 'Monetary'],
    index=rfm.index,
)

# Compute RFM scores (quantile-based, computed on the scaled values)
rfm_scores = compute_rfm_scores(rfm_scaled)

# Categorize RFM segments
# NOTE(review): categorize_rfm_segments adds "Segment" to the frame it is given
# and returns that same frame, so rfm_scores and rfm_segments appear to be one
# object and the two CSVs below end up with identical content - confirm whether
# a scores-only file is wanted.
rfm_segments = categorize_rfm_segments(rfm_scores)

# Save RFM data
save_rfm_data(rfm)

# Save RFM scores and segments (optional)
rfm_scores.to_csv("rfm_scores.csv", index=True)
rfm_segments.to_csv("rfm_segments.csv", index=True)



# Download the files from Colab
from google.colab import files
files.download("rfm_data.csv")
files.download("rfm_scores.csv")
files.download("rfm_segments.csv")
files.download("scaler.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>