# Loading the data

In [1]:
import pandas as pd

#Read the Data
df = pd.read_csv('/content/Orders_Analysis.csv')

#Inspect the data
df.head()

Unnamed: 0,product_title,product_type,variant_title,variant_sku,variant_id,customer_id,order_id,day,net_quantity,gross_sales,discounts,returns,net_sales,taxes,total_sales,returned_item_quantity,ordered_item_quantity
0,DPR,DPR,100,AD-982-708-895-F-6C894FB,52039657,1312378,83290718932496,04/12/2018,2,200.0,-200.0,0.0,0.0,0.0,0.0,0,2
1,RJF,Product P,28 / A / MTM,83-490-E49-8C8-8-3B100BC,56914686,3715657,36253792848113,01/04/2019,2,190.0,-190.0,0.0,0.0,0.0,0.0,0,2
2,CLH,Product B,32 / B / FtO,68-ECA-BC7-3B2-A-E73DE1B,24064862,9533448,73094559597229,05/11/2018,0,164.8,-156.56,-8.24,0.0,0.0,0.0,-2,2
3,NMA,Product F,40 / B / FtO,6C-1F1-226-1B3-2-3542B41,43823868,4121004,53616575668264,19/02/2019,1,119.0,-119.0,0.0,0.0,0.0,0.0,0,1
4,NMA,Product F,40 / B / FtO,6C-1F1-226-1B3-2-3542B41,43823868,4121004,29263220319421,19/02/2019,1,119.0,-119.0,0.0,0.0,0.0,0.0,0,1


# Filtering and cleaning the datarows

In [2]:
#Filter the data
df = df[df['ordered_item_quantity'] > 0]

#Inspect the data
df.head()

Unnamed: 0,product_title,product_type,variant_title,variant_sku,variant_id,customer_id,order_id,day,net_quantity,gross_sales,discounts,returns,net_sales,taxes,total_sales,returned_item_quantity,ordered_item_quantity
0,DPR,DPR,100,AD-982-708-895-F-6C894FB,52039657,1312378,83290718932496,04/12/2018,2,200.0,-200.0,0.0,0.0,0.0,0.0,0,2
1,RJF,Product P,28 / A / MTM,83-490-E49-8C8-8-3B100BC,56914686,3715657,36253792848113,01/04/2019,2,190.0,-190.0,0.0,0.0,0.0,0.0,0,2
2,CLH,Product B,32 / B / FtO,68-ECA-BC7-3B2-A-E73DE1B,24064862,9533448,73094559597229,05/11/2018,0,164.8,-156.56,-8.24,0.0,0.0,0.0,-2,2
3,NMA,Product F,40 / B / FtO,6C-1F1-226-1B3-2-3542B41,43823868,4121004,53616575668264,19/02/2019,1,119.0,-119.0,0.0,0.0,0.0,0.0,0,1
4,NMA,Product F,40 / B / FtO,6C-1F1-226-1B3-2-3542B41,43823868,4121004,29263220319421,19/02/2019,1,119.0,-119.0,0.0,0.0,0.0,0.0,0,1


# Aggregating and Encoding Data.

In [5]:
def encode_column(column):
    if column > 0:
        return 1
    if column <= 0:
        return 0

columns_list = ['customer_id','product_type']

aggregated_dataframe = df.groupby(columns_list).ordered_item_quantity.count().reset_index()

# Make sure 'ordered_item_quantity' exists in the DataFrame
aggregated_dataframe['products_ordered'] = aggregated_dataframe['ordered_item_quantity'].apply(encode_column)

# Group by 'customer_id' and sum the 'products_ordered' values
customers_orders = aggregated_dataframe.groupby(columns_list[0]).products_ordered.sum().reset_index()

# Inspect data
customers_orders.head()

Unnamed: 0,customer_id,products_ordered
0,1000661,1
1,1001914,1
2,1002167,3
3,1002387,1
4,1002419,2



# Calculating Average Return Rate by Customer Order

In [6]:
ordered_sum_by_customer_order = df.groupby(['customer_id', 'order_id']).ordered_item_quantity.sum().reset_index()
returned_sum_by_customer_order = df.groupby(['customer_id', 'order_id']).returned_item_quantity.sum().reset_index()

# Merge DataFrames using 'inner' join
ordered_returned_sums = pd.merge(ordered_sum_by_customer_order, returned_sum_by_customer_order)

# Calculate 'average_return_rate'
ordered_returned_sums['average_return_rate'] = -ordered_returned_sums['returned_item_quantity'] / ordered_returned_sums['ordered_item_quantity']

#Inspect Data
ordered_returned_sums.head()

Unnamed: 0,customer_id,order_id,ordered_item_quantity,returned_item_quantity,average_return_rate
0,1000661,99119989117212,3,0,0.0
1,1001914,79758569034715,1,0,0.0
2,1002167,38156088848638,1,0,0.0
3,1002167,57440147820257,1,0,0.0
4,1002167,58825523953710,1,0,0.0


# Analyzing Customer Return Rates.

In [7]:
# Task 1: Calculate 'customer_return_rate' using .groupby() method
customer_return_rate = ordered_returned_sums.groupby('customer_id')['average_return_rate'].mean().reset_index()

# Task 2: Create 'return_rates' DataFrame using .value_counts() method on the specific column
return_rates = customer_return_rate['average_return_rate'].value_counts().reset_index()

# Task 3: Rename columns in 'return_rates' DataFrame
return_rates.rename(columns={"count": "count of unit return rate", "average_return_rate": "average return rate"}, inplace=True)

# Task 4: Merge 'customers_orders' with 'customer_return_rate' based on "customer_id" column
customers = pd.merge(customers_orders, customer_return_rate, on='customer_id')

# Display the first few rows of 'return_rates'
customers.head()

Unnamed: 0,customer_id,products_ordered,average_return_rate
0,1000661,1,0.0
1,1001914,1,0.0
2,1002167,3,0.0
3,1002387,1,0.0
4,1002419,2,0.0


# Calculating Customer Total Spending.

In [8]:
customer_total_spending = df.groupby('customer_id').total_sales.sum().reset_index()
customer_total_spending.rename(columns={'total_sales': 'total_spending'}, inplace=True)
customer_total_spending.head()

Unnamed: 0,customer_id,total_spending
0,1000661,260.0
1,1001914,79.2
2,1002167,234.2
3,1002387,89.0
4,1002419,103.0


# Merging and Cleaning Customer Data.

In [9]:
customers = customers.merge(customer_total_spending,on="customer_id")

customers.drop(columns="customer_id",inplace=True)

customers

Unnamed: 0,products_ordered,average_return_rate,total_spending
0,1,0.0,260.0
1,1,0.0,79.2
2,3,0.0,234.2
3,1,0.0,89.0
4,2,0.0,103.0
...,...,...,...
24869,2,0.0,259.0
24870,2,0.0,242.5
24871,1,0.0,89.0
24872,2,0.0,267.0


# Transforming and Enhancing Customer Data.

In [12]:
import numpy as np
columns = ["products_ordered", "average_return_rate", "total_spending"]
for column in columns:
    transformed_column = np.log1p(customers[column])
    rounded_column = transformed_column.round(2)  # Round to 2 decimal places
    customers["log_" + column] = rounded_column
customers

Unnamed: 0,products_ordered,average_return_rate,total_spending,log_products_ordered,log_average_return_rate,log_total_spending
0,1,0.0,260.0,0.69,0.0,5.56
1,1,0.0,79.2,0.69,0.0,4.38
2,3,0.0,234.2,1.39,0.0,5.46
3,1,0.0,89.0,0.69,0.0,4.50
4,2,0.0,103.0,1.10,0.0,4.64
...,...,...,...,...,...,...
24869,2,0.0,259.0,1.10,0.0,5.56
24870,2,0.0,242.5,1.10,0.0,5.50
24871,1,0.0,89.0,0.69,0.0,4.50
24872,2,0.0,267.0,1.10,0.0,5.59


#K-Means Clustering and Scoring.

In [13]:
# Module 3 - Task 1
from sklearn.cluster import KMeans

kmeans_model = KMeans(init='k-means++', max_iter=500, random_state=42, n_init=10)

kmeans_model.fit(customers.iloc[:, 3:])

kmeans_score = round(kmeans_model.inertia_, 2)
kmeans_score

1068.82

#Determining the Number of Clusters (K) for K-Means.

In [14]:
dataframe = customers.iloc[:, 3:]
K = 15
cluster_values = list(range(1, K+1))
inertia_values=[]

for c in cluster_values:
    model = KMeans(n_clusters = c, init='k-means++', max_iter=500, random_state=42, n_init=10)
    model.fit(dataframe)
    inertia_values.append(round(model.inertia_, 2))

inertia_values

[14774.21,
 5397.28,
 3019.6,
 2336.14,
 1926.13,
 1570.12,
 1351.41,
 1068.82,
 924.6,
 812.63,
 695.21,
 620.87,
 555.25,
 497.73,
 443.45]

#Applying K-Means Clustering with Optimized K

In [16]:
updated_kmeans_model = KMeans(n_clusters=4, init='k-means++', max_iter=500, random_state=42, n_init=10)

res = updated_kmeans_model.fit_predict(customers.iloc[:, 3:])
res

array([2, 1, 0, ..., 1, 0, 1], dtype=int32)

#Calculating Cluster Centers and Creating a Final Customer DataFrame.

In [17]:
cluster_centers = updated_kmeans_model.cluster_centers_
customers["clusters"] = updated_kmeans_model.labels_


actual_data = np.expm1(cluster_centers)
add_points = np.append(actual_data, cluster_centers, axis=1)

add_points = np.append(add_points, [[0], [1], [2], [3]], axis=1)



centers_df = pd.DataFrame(data=add_points, columns=["products_ordered",
                                                    "average_return_rate",
                                                    "total_spending",
                                                    "log_products_ordered",
                                                    "log_average_return_rate",
                                                    "log_total_spending",
                                                    "clusters"])

centers_df["clusters"] = centers_df["clusters"].astype("int")

# Round the values in the centers_df DataFrame to 2 decimal places
rounded_centers_df = centers_df.round(2)

customers_final = customers.copy()

# Combining Customer Data with Cluster Centers.

In [18]:
customers["is_center"] = 0
rounded_centers_df["is_center"] = 1

# Append the contents of rounded_centers_df to customers with ignore_index=True
customers = pd.concat([customers_final, rounded_centers_df], ignore_index=True)

# The customers DataFrame now contains the combined data from both DataFrames
customers

Unnamed: 0,products_ordered,average_return_rate,total_spending,log_products_ordered,log_average_return_rate,log_total_spending,clusters,is_center
0,1.00,0.0,260.00,0.69,0.0,5.56,2,
1,1.00,0.0,79.20,0.69,0.0,4.38,1,
2,3.00,0.0,234.20,1.39,0.0,5.46,0,
3,1.00,0.0,89.00,0.69,0.0,4.50,1,
4,2.00,0.0,103.00,1.10,0.0,4.64,2,
...,...,...,...,...,...,...,...,...
24873,1.00,0.0,64.00,0.69,0.0,4.17,1,
24874,2.40,0.0,283.73,1.22,0.0,5.65,0,1.0
24875,1.01,0.0,76.45,0.70,0.0,4.35,1,1.0
24876,1.53,0.0,159.92,0.93,0.0,5.08,2,1.0


In [19]:
customers["cluster_name"] = customers["clusters"].astype(str)
final_result = customers.cluster_name.value_counts().reset_index()

final_result.rename(columns={"cluster_name": "Customer Groups", "count": "Customer Group Magnitude"}, inplace=True)

final_result

Unnamed: 0,index,Customer Groups
0,1,10468
1,2,7236
2,0,5098
3,3,2076
