<a href="https://colab.research.google.com/github/PallaviVangari/DataMiningAssignment3/blob/main/CRISP_DM_Online_Retail.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# # Mount Google Drive (you'll need to authenticate)
# from google.colab import drive
# drive.mount('/content/drive')

# Location of the workbook inside the Colab runtime. Adjust this path if the
# file is stored elsewhere (for example under a mounted Google Drive folder).
dataset_path = '/content/online_retail_II.xlsx'

# Load the data
retail_data = pd.read_excel(dataset_path)

# Data Preparation

# Fill missing descriptions with 'Unknown'.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form emits a FutureWarning in
# pandas 2.x and does not modify the frame once Copy-on-Write is enabled.
retail_data['Description'] = retail_data['Description'].fillna('Unknown')

# Drop rows where 'Customer ID' is missing
retail_data.dropna(subset=['Customer ID'], inplace=True)

# Convert Customer ID to integer type. This has to follow the dropna above,
# because casting a column that still holds NaN to int would fail.
retail_data['Customer ID'] = retail_data['Customer ID'].astype(int)

# Keep only rows with a positive quantity.
# The explicit copy makes the filtered result an independent frame, so columns
# can be added to it later without triggering SettingWithCopyWarning.
retail_data = retail_data[retail_data['Quantity'] > 0].copy()

# Feature Engineering for Modeling
#
# Builds one row per customer with purchase frequency, total spend, recency,
# and the average quantity and unit price over that customer's rows.

# Cost of each row (quantity times unit price).
# assign() returns a new frame with the extra column, so nothing is written
# into a frame that may itself be a filtered selection of another one (the
# pattern that raises SettingWithCopyWarning).
retail_data = retail_data.assign(
    TotalCost=retail_data['Quantity'] * retail_data['Price']
)

# One grouping object reused by every per-customer aggregate below
per_customer = retail_data.groupby('Customer ID')

# Total number of purchases (frequency): distinct invoices per customer
frequency = per_customer['Invoice'].nunique()

# Total amount spent (monetary value)
monetary = per_customer['TotalCost'].sum()

# Duration since the last purchase (recency), in whole days.
# The reference point is one day after the newest invoice in the data, so the
# most recent customer gets a recency of at least 1 rather than 0.
latest_purchase = retail_data['InvoiceDate'].max() + pd.Timedelta(days=1)
recency = (latest_purchase - per_customer['InvoiceDate'].max()).dt.days

# Average quantity of products bought
avg_quantity = per_customer['Quantity'].mean()

# Average price of products bought
avg_price = per_customer['Price'].mean()

# Combine features into a single DataFrame (aligned on the Customer ID index)
customer_data = pd.DataFrame({
    'Frequency': frequency,
    'MonetaryValue': monetary,
    'Recency': recency,
    'AvgQuantity': avg_quantity,
    'AvgPrice': avg_price,
})

# Scaling the features
# The model inputs are named explicitly so that only these columns are scaled
# and clustered, even if customer_data already carries other columns (such as
# a 'Cluster' column from a previous run of this section).
feature_columns = [
    'Frequency',
    'MonetaryValue',
    'Recency',
    'AvgQuantity',
    'AvgPrice',
]
scaler = StandardScaler()
scaled_data = scaler.fit_transform(customer_data[feature_columns])

# Applying KMeans clustering
# n_init is pinned because its default depends on the scikit-learn version
# (10 before 1.4, 'auto' from 1.4 on). Together with random_state this keeps
# the cluster assignments reproducible across library upgrades.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
clusters = kmeans.fit_predict(scaled_data)

# Add cluster assignments to the data
customer_data['Cluster'] = clusters

# Display the results
print(customer_data.head())

# You can then analyze the segments as we did above.




             Frequency  MonetaryValue  Recency  AvgQuantity  AvgPrice  Cluster
Customer ID                                                                   
12346               11         372.86      165     2.121212  6.253333        3
12347                2        1323.32        3    11.661972  2.295070        0
12348                1         222.16       74    18.650000  0.719500        0
12349                3        2671.14       43     9.735294  8.581765        0
12351                1         300.93       11    12.428571  2.355238        0
