In [8]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score

# Load the dataset
file_path = 'C:/Users/DELL/OneDrive/Documents/retail data set.csv'  # Update the path if necessary
df = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("Dataset preview:")
print(df.head())

# Display all columns to identify potential features for clustering
print("\nColumn Names in the Dataset:")
print(df.columns)

# Basic data information and missing values check
print("\nData Summary:")
print(df.info())
print("\nMissing Values:")
print(df.isnull().sum())

# Step 1: Data Preprocessing
# Dropping rows with missing values for simplicity; you may use imputation techniques if necessary
df_cleaned = df.dropna()

# Display a sample of the cleaned data
print("\nCleaned Data Sample:")
print(df_cleaned.head())

# Step 2: Feature Selection
# Modify the feature selection based on your dataset's actual column names
# Let's assume we found relevant columns like 'TotalSpent' and 'TotalPurchases' in the previous output.
# Change the feature names accordingly to your dataset's actual columns.

# Example: Adjust this based on your actual column names
# Replace 'TotalSpent' and 'TotalPurchases' with appropriate column names from your dataset
features = ['TotalSpent', 'TotalPurchases']  # Modify based on your dataset

# Check if the specified features exist in the dataset
missing_features = [feature for feature in features if feature not in df_cleaned.columns]

if missing_features:
    print(f"Error: The following features are not in the dataset: {missing_features}")
else:
    # Step 3: Selecting Features for Clustering
    X = df_cleaned[features]
    
    # Step 4: Feature Scaling
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Step 5: K-means Clustering
    # Define a function to determine the optimal number of clusters using the Elbow Method
    def optimal_kmeans(X_scaled):
        wcss = []
        for i in range(1, 11):
            kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
            kmeans.fit(X_scaled)
            wcss.append(kmeans.inertia_)
        plt.plot(range(1, 11), wcss)
        plt.title('Elbow Method to Determine Optimal Number of Clusters')
        plt.xlabel('Number of clusters')
        plt.ylabel('WCSS')
        plt.show()

    # Plot the Elbow Curve to determine the optimal number of clusters
    optimal_kmeans(X_scaled)

    # Fit the K-means model with the optimal number of clusters
    optimal_clusters = 3  # Set the number of clusters based on the elbow method (modify based on your data)
    kmeans = KMeans(n_clusters=optimal_clusters, init='k-means++', random_state=42)
    kmeans_labels = kmeans.fit_predict(X_scaled)

    # Step 6: Analyze K-means Cluster Characteristics
    df_cleaned['KMeans_Cluster'] = kmeans_labels
    print("\nK-Means Cluster Counts:")
    print(df_cleaned['KMeans_Cluster'].value_counts())

    # Step 7: DBSCAN Clustering
    # Choose eps and min_samples based on your data, adjust accordingly
    dbscan = DBSCAN(eps=0.5, min_samples=5)
    dbscan_labels = dbscan.fit_predict(X_scaled)

    # Step 8: Analyze DBSCAN Cluster Characteristics
    df_cleaned['DBSCAN_Cluster'] = dbscan_labels
    print("\nDBSCAN Cluster Counts:")
    print(df_cleaned['DBSCAN_Cluster'].value_counts())

    # Step 9: Evaluate Clustering Results
    # Silhouette Score for K-means
    kmeans_silhouette = silhouette_score(X_scaled, kmeans_labels)
    print(f"K-means Silhouette Score: {kmeans_silhouette}")

    # Silhouette Score for DBSCAN
    dbscan_silhouette = silhouette_score(X_scaled, dbscan_labels) if len(set(dbscan_labels)) > 1 else -1
    print(f"DBSCAN Silhouette Score: {dbscan_silhouette}")

    # Step 10: Visualize the Clusters (K-means)
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=X_scaled[:, 0], y=X_scaled[:, 1], hue=kmeans_labels, palette='Set1', s=100)
    plt.title('Customer Segments (K-means Clustering)')
    plt.xlabel('Feature 1: TotalSpent')
    plt.ylabel('Feature 2: TotalPurchases')
    plt.show()

    # Step 11: Visualize the Clusters (DBSCAN)
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=X_scaled[:, 0], y=X_scaled[:, 1], hue=dbscan_labels, palette='Set2', s=100)
    plt.title('Customer Segments (DBSCAN Clustering)')
    plt.xlabel('Feature 1: TotalSpent')
    plt.ylabel('Feature 2: TotalPurchases')
    plt.show()

    # Optional: Save the results to a new CSV
    df_cleaned.to_csv('C:/Users/DELL/OneDrive/Documents/customer_segmentation_output.csv', index=False)


  df = pd.read_csv(file_path)


Dataset preview:
   Store        Date  Temperature  Fuel_Price  MarkDown1  MarkDown2  \
0    1.0  05-02-2010        42.31       2.572        NaN        NaN   
1    1.0  12-02-2010        38.51       2.548        NaN        NaN   
2    1.0  19-02-2010        39.93       2.514        NaN        NaN   
3    1.0  26-02-2010        46.63       2.561        NaN        NaN   
4    1.0  05-03-2010        46.50       2.625        NaN        NaN   

   MarkDown3  MarkDown4  MarkDown5         CPI  Unemployment IsHoliday  Dept  \
0        NaN        NaN        NaN  211.096358         8.106     False     1   
1        NaN        NaN        NaN  211.242170         8.106      True     1   
2        NaN        NaN        NaN  211.289143         8.106     False     1   
3        NaN        NaN        NaN  211.319643         8.106     False     1   
4        NaN        NaN        NaN  211.350143         8.106     False     1   

   Weekly_Sales  IsHoliday.1 Type      Size  
0      24924.50        False 