In [1]:
'''
dataset of customer information for an online clothing store,
including age, gender, income, and purchase history. 
We want to group these customers into different clusters
based on their purchasing behavior.
'''

'\ndataset of customer information for an online clothing store,\nincluding age, gender, income, and purchase history. \nWe want to group these customers into different clusters\nbased on their purchasing behavior.\n'

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from scipy.interpolate import make_interp_spline, BSpline
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


In [None]:
# Generate synthetic customer data, then inject deliberate errors to practice data cleaning
N_CUSTOMERS = 40000
SEED = 42  # fixed seed so Restart & Run All regenerates exactly the same dataset
rng = np.random.default_rng(SEED)

data = pd.DataFrame({
    'Age': rng.choice([25, 30, 35, 40, 45, 50, 55, 60, 65, 70], N_CUSTOMERS),
    'Income': rng.choice([25000, 35000, 45000, 55000, 65000, 75000, 85000, 95000], N_CUSTOMERS),
    'SpendingScore': rng.choice([10, 20, 30, 40, 50, 60, 70, 80, 90, 100], N_CUSTOMERS)
})

# Widen the column dtypes *before* writing incompatible values into them:
# 'Income' will hold strings and 'SpendingScore' will hold NaN. Relying on pandas to
# upcast an int64 column implicitly during assignment is deprecated in newer releases.
data = data.astype({'Income': object, 'SpendingScore': float})

# Introduce deliberate errors (.loc slices are label-based and inclusive: 11 rows each)
data.loc[10:20, 'Age'] = -1  # Invalid negative age values
data.loc[30:40, 'Income'] = 'unknown'  # Invalid string values for income
data.loc[50:60, 'SpendingScore'] = np.nan  # Missing values for spending score

# Save the dataset as a CSV file (written to the current working directory)
data.to_csv('customer_data.csv', index=False)

In [None]:
# Clean and preprocess the data
# Handling missing data (NaN):
# the simplest options are to drop the affected rows or to fill the gaps with a constant such as 0.
def handle_missing_values(data, strategy='drop', fill_value=0, delete_unknown=True, delete_negative_one=True):
    """Clean a DataFrame of missing values and sentinel "invalid" markers.

    Missing values (NaN) are handled first, according to ``strategy``; rows
    containing sentinel values are removed afterwards. The input frame is
    not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean.
    strategy : {'drop', 'fill', 'mean'}, default 'drop'
        'drop' removes every row that has at least one NaN;
        'fill' replaces NaN with ``fill_value``;
        'mean' replaces NaN in numeric columns with that column's mean
        (non-numeric columns are left untouched).
    fill_value : scalar, default 0
        Replacement used when ``strategy='fill'``; must not be None.
    delete_unknown : bool, default True
        Remove rows in which any cell is 'unknown', 'Unknown' or 'UNKNOWN'.
    delete_negative_one : bool, default True
        Remove rows in which any cell equals -1.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame; surviving rows keep their original index labels.

    Raises
    ------
    ValueError
        If ``strategy`` is not recognised, or if ``strategy='fill'`` and
        ``fill_value`` is None.
    """
    if strategy == 'drop':
        # Drop rows with any missing values
        data_cleaned = data.dropna()
    elif strategy == 'fill':
        if fill_value is None:
            raise ValueError("fill_value must be provided when strategy is 'fill'")
        # Fill missing values with the specified fill_value
        data_cleaned = data.fillna(fill_value)
    elif strategy == 'mean':
        # numeric_only=True: a frame that also holds strings (e.g. 'unknown')
        # would otherwise make mean() raise instead of skipping that column.
        data_cleaned = data.fillna(data.mean(numeric_only=True))
    else:
        raise ValueError("Invalid strategy. Options: 'drop', 'fill', 'mean'")

    # Build row-wise masks with .any(axis=1) so whole rows are removed.
    # Indexing with a boolean *DataFrame* would only blank the matching cells
    # to NaN (and upcast integer columns to float) rather than delete rows.
    if delete_unknown:
        has_unknown = data_cleaned.isin(['unknown', 'Unknown', 'UNKNOWN']).any(axis=1)
        data_cleaned = data_cleaned[~has_unknown]

    if delete_negative_one:
        has_negative_one = (data_cleaned == -1).any(axis=1)
        data_cleaned = data_cleaned[~has_negative_one]

    return data_cleaned


# Alternatively, missing values could be predicted with a model (e.g. regression or KNN-based imputation from scikit-learn).

# Removing Duplicates: Duplicated data can be identified and removed to ensure data integrity. 
def remove_duplicates(data, columns=None, keep='first'):
    """Return a copy of ``data`` with duplicated rows removed.

    ``columns`` restricts the comparison to the given column label(s); None
    compares all columns. ``keep`` is forwarded to
    ``DataFrame.drop_duplicates`` ('first', 'last' or False).
    """
    dedup_options = {'keep': keep, 'subset': columns}
    return data.drop_duplicates(**dedup_options)


# Clean the raw frame using the function's default arguments, then display the result
datacleaned = handle_missing_values(data)
datacleaned


In [None]:
# a data quality tester to determine the quality of the cleaning function
def test_data_quality(data):
    # Check for NaN values
    nan_count = data.isna().sum().sum()
    if nan_count > 0:
        print(f"Test failed: {nan_count} NaN values found in the DataFrame.")
        return

    # Check for unknown values
    unknown_values = ['unknown', 'Unknown', 'UNKNOWN']
    unknown_count = data.isin(unknown_values).sum().sum()
    if unknown_count > 0:
        print(f"Test failed: {unknown_count} unknown values found in the DataFrame.")
        return

    # Check for -1 values
    negative_one_count = (data == -1).sum().sum()
    if negative_one_count > 0:
        print(f"Test failed: {negative_one_count} -1 values found in the DataFrame.")
        return

    print("Data quality test passed successfully!")
    
    
    
# Run the quality checks on the cleaned frame; the verdict is printed, not returned
test_data_quality(datacleaned)

In [32]:
# Extract the features used for clustering and standardize them so each has
# a mean of 0 and a standard deviation of 1. Feature scaling keeps features on
# similar scales, so no single feature dominates the distance computation
# purely because of its magnitude (needed for the clustering step later).
feature_columns = ['Age', 'Income', 'SpendingScore']

# Convert explicitly to float: 'Income' is an object-dtype column (it held
# 'unknown' strings before cleaning), so a plain .values would give an object array.
features_raw = datacleaned[feature_columns].to_numpy(dtype=float)

# Keep the fitted scaler so scaled values (e.g. cluster centres) can be mapped
# back to original units with scaler.inverse_transform.
scaler = StandardScaler()
X = scaler.fit_transform(features_raw)

In [None]:
# Then, we'll use the k-means algorithm to cluster the data into a specified
# number of clusters (in this case, 5).
# random_state fixes the centroid initialization so the cluster assignments are
# reproducible between runs; n_init is set explicitly because its default value
# differs between scikit-learn versions.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans.fit(X)


In [None]:
# Read the results off the fitted model in one step:
#   labels    -> the cluster assignment of each data point
#   centroids -> the coordinates of the cluster centers
labels, centroids = kmeans.labels_, kmeans.cluster_centers_


In [None]:
# Visualize the clusters using a scatter plot of the first two feature columns
# (Age vs. Income). X holds standardized values, so both axes are in z-score
# units rather than years / currency; the labels say so explicitly.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=labels, cmap='rainbow')
# Cluster centres drawn on top as black crosses
ax.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=200, linewidths=3, color='black')
ax.set_xlabel('Age (standardized)')
ax.set_ylabel('Income (standardized)')
ax.set_title('K-means customer clusters')
plt.show()
