In [9]:
import sys
import os

# Add parent directory (../) to Python's search path
sys.path.append(os.path.abspath(".."))

In [10]:
import numpy as np
import pandas as pd


DATA IMPORT

In [11]:
raw_data = pd.read_excel('/workspaces/GEN-AI-DATA-ANALYST/data/e_commerce.xlsx')

In [12]:
raw_data.head()

Unnamed: 0,index,Order ID,Cust ID,Gender,Age,Date,Status,Channel,SKU,Category,Size,Qty,currency,Amount,ship-city,ship-postal-code,ship-country,B2B
0,1,171-1029312-3038738,1029312,Women,44,2022-12-04,Delivered,Myntra,JNE1233-BLUE-KR-031-XXL,kurta,XXL,1,INR,376,MOHALI,140301,IN,False
1,2,405-2183842-2225946,2183842,Women,29,2022-12-04,Delivered,Ajio,SET414-KR-NP-L,Set,L,1,INR,1449,GURUGRAM,122002,IN,False
2,3,171-1641533-8921966,1641533,Women,67,2022-12-04,Delivered,Myntra,SET261-KR-PP-S,Set,S,1,INR,453,KOLKATA,700029,IN,False
3,4,404-7490807-6300351,7490807,Women,20,2022-12-04,Delivered,Amazon,SET110-KR-PP-M,Set,M,1,INR,729,THANJAVUR,613007,IN,False
4,5,403-9293516-4577154,9293516,Women,62,2022-12-04,Delivered,Myntra,JNE2294-KR-A-XXL,kurta,XXL,1,INR,544,GURUGRAM,122001,IN,False


In [13]:
def scalable_data_profiler(df, sample_size=10000, top_n=5):
    print("📊 BASIC INFO")
    print(f"- Shape: {df.shape}")
    print(f"- Columns: {list(df.columns[:10])}...")  # show only first 10
    print(f"- Memory Usage: ~{df.memory_usage(deep=False).sum() / 1024**2:.2f} MB\n")

    print("🔍 MISSING VALUES (%):")
    missing = df.isnull().mean() * 100
    print(missing[missing > 0].sort_values(ascending=False).head(10), "\n")

    print("🧮 UNIQUE VALUES (Top 10 Columns):")
    unique_counts = df.nunique().sort_values(ascending=False).head(10)
    print(unique_counts, "\n")

    print("📈 NUMERICAL STATS (Sampled if > sample_size):")
    df_sample = df.sample(sample_size) if len(df) > sample_size else df
    print(df_sample.describe(include=[np.number]).T, "\n")

    print("🗂️ CATEGORICAL PREVIEW:")
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols[:10]:  # process only first 10 for speed
        print(f"\n🔹 Column: {col}")
        print(f" - Unique: {df[col].nunique()}")
        print(f" - Top {top_n}:\n{df[col].astype(str).value_counts(dropna=False).head(top_n)}")

    print("\n📊 CORRELATION MATRIX (Top Pairs Only):")
    num_cols = df.select_dtypes(include=np.number)
    if num_cols.shape[1] >= 2:
        corr = num_cols.corr().abs().unstack().sort_values(ascending=False)
        corr = corr[corr < 1.0].drop_duplicates().head(10)
        print(corr)
    else:
        print(" - Not enough numerical columns for correlation.")

In [14]:
scalable_data_profiler(raw_data)

📊 BASIC INFO
- Shape: (31047, 18)
- Columns: ['index', 'Order ID', 'Cust ID', 'Gender', 'Age', 'Date', 'Status', 'Channel ', 'SKU', 'Category']...
- Memory Usage: ~4.06 MB

🔍 MISSING VALUES (%):
Series([], dtype: float64) 

🧮 UNIQUE VALUES (Top 10 Columns):
index               31047
Order ID            28471
Cust ID             28437
SKU                  5287
ship-postal-code     4958
ship-city            2603
Amount                769
Age                    61
Date                   36
Size                   11
dtype: int64 

📈 NUMERICAL STATS (Sampled if > sample_size):
                    count          mean           std       min         25%  \
index             10000.0  1.554422e+04  8.967173e+03       1.0     7764.25   
Cust ID           10000.0  4.909274e+06  2.903489e+06    1387.0  2364661.00   
Age               10000.0  3.936750e+01  1.508739e+01      18.0       27.00   
Amount            10000.0  6.780895e+02  2.647286e+02     229.0      480.00   
ship-postal-code  10000.0 

2.0 DATA PREPROCESSING

In [15]:
from genai_dataanalyst.assistant import AnalystAssistant
assistant = AnalystAssistant()

2.1 Solves Categorical Problem

If You standardized the Gender column to ensure it contains only 'Men' or 'Women' by replacing abbreviations: 'W' was changed to 'Women' and 'M' to 'Men'. This makes gender data clean and consistent for analysis.

In [16]:
new_data = assistant.clean(raw_data, prompt="Standardize the 'Gender' column so that it only contains 'Men' or 'Women'. Replace 'W' with 'Women' and 'M' with 'Men'.")

[INFO] [CLEAN] Prompt: Standardize the 'Gender' column so that it only contains 'Men' or 'Women'. Replace 'W' with 'Women' and 'M' with 'Men'.


[DEBUG] [CLEAN] Generated Code:
 df['Gender'] = df['Gender'].replace({'W': 'Women', 'M': 'Men'})


In [17]:
new_data['Gender'].unique() # New data

array(['Women', 'Men'], dtype=object)

In [18]:
raw_data['Gender'].unique()  # Old data

array(['Women', 'Men', 'W', 'M'], dtype=object)

If You cleaned the Qty column by converting text values like 'One' and 'Two' into their numeric forms 1 and 2, so the entire column now contains only numbers. This makes quantity data consistent and ready for calculations

In [19]:
new_data = assistant.clean(new_data, prompt="Standadize all textual quantity values in the Qty column (e.g., 'One', 'Two') to their numeric equivalents (e.g., 1, 2). Ensure the column is entirely numeric after conversion so datatype should be int ")

[INFO] [CLEAN] Prompt: Standadize all textual quantity values in the Qty column (e.g., 'One', 'Two') to their numeric equivalents (e.g., 1, 2). Ensure the column is entirely numeric after conversion so datatype should be int 


[DEBUG] [CLEAN] Generated Code:
 qty_dict = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5, 'Six': 6, 'Seven': 7, 'Eight': 8, 'Nine': 9, 'Ten': 10}
df['Qty'] = df['Qty'].map(qty_dict).fillna(df['Qty']).astype(int)




In [20]:
new_data['Qty'].unique() # New data


array([1, 2, 4, 3, 5])

In [21]:
raw_data['Qty'].unique()  # Old data

array([1, 'One', 2, 4, 3, 'Two', 5], dtype=object)

In [22]:
new_data.dtypes

index                        int64
Order ID                    object
Cust ID                      int64
Gender                      object
Age                          int64
Date                datetime64[ns]
Status                      object
Channel                     object
SKU                         object
Category                    object
Size                        object
Qty                          int64
currency                    object
Amount                       int64
ship-city                   object
ship-postal-code             int64
ship-country                object
B2B                           bool
dtype: object

3.0 DATA TRANSFORMATION


3.1 Playing With Columns.

I can extract the month and year from the Date column to create two new columns: Order_Month and Order_Year. This makes it easier to analyze sales trends over time, like comparing orders across different months or years.

In [23]:
transformed_data = assistant.clean(new_data, prompt="From the Date column, extract the month and year into two new columns: Order_Month (as full month name) and Order_Year (as 4-digit year).Finally place both the newly created columns after Date column")

[INFO] [CLEAN] Prompt: From the Date column, extract the month and year into two new columns: Order_Month (as full month name) and Order_Year (as 4-digit year).Finally place both the newly created columns after Date column


[DEBUG] [CLEAN] Generated Code:
 df['Order_Month'] = pd.to_datetime(df['Date']).dt.strftime('%B')
df['Order_Year'] = pd.to_datetime(df['Date']).dt.year
cols = df.columns.tolist()
cols.insert(6, cols.pop(cols.index('Order_Month')))
cols.insert(6, cols.pop(cols.index('Order_Year')))
df = df[cols]


In [24]:
transformed_data.head(3)

Unnamed: 0,index,Order ID,Cust ID,Gender,Age,Date,Order_Year,Order_Month,Status,Channel,SKU,Category,Size,Qty,currency,Amount,ship-city,ship-postal-code,ship-country,B2B
0,1,171-1029312-3038738,1029312,Women,44,2022-12-04,2022,December,Delivered,Myntra,JNE1233-BLUE-KR-031-XXL,kurta,XXL,1,INR,376,MOHALI,140301,IN,False
1,2,405-2183842-2225946,2183842,Women,29,2022-12-04,2022,December,Delivered,Ajio,SET414-KR-NP-L,Set,L,1,INR,1449,GURUGRAM,122002,IN,False
2,3,171-1641533-8921966,1641533,Women,67,2022-12-04,2022,December,Delivered,Myntra,SET261-KR-PP-S,Set,S,1,INR,453,KOLKATA,700029,IN,False



Suppose i want to create a new column called Age_Group that classifies each person based on their age: anyone under 20 is labeled a Teenager, those between 20 and 49 are labeled Adult, and those 50 or older are labeled Senior. This helps group customers into age based categories for better analysis.

In [25]:
transformed_data = assistant.clean(transformed_data, prompt="Create a new column of 'Age' column named 'Age_Group'.Place the 'Age_Group' column besides 'Age' column  based on the Age column: If Age < 20 → Teenager , If Age ≥ 20 and < 50 → Adult , If Age ≥ 50 → Senior")

[INFO] [CLEAN] Prompt: Create a new column of 'Age' column named 'Age_Group'.Place the 'Age_Group' column besides 'Age' column  based on the Age column: If Age < 20 → Teenager , If Age ≥ 20 and < 50 → Adult , If Age ≥ 50 → Senior
[DEBUG] [CLEAN] Generated Code:
 df['Age_Group'] = pd.cut(df['Age'], bins=[0, 20, 50, np.inf], labels=['Teenager', 'Adult', 'Senior'])


In [26]:
transformed_data.head(3)

Unnamed: 0,index,Order ID,Cust ID,Gender,Age,Date,Order_Year,Order_Month,Status,Channel,...,Category,Size,Qty,currency,Amount,ship-city,ship-postal-code,ship-country,B2B,Age_Group
0,1,171-1029312-3038738,1029312,Women,44,2022-12-04,2022,December,Delivered,Myntra,...,kurta,XXL,1,INR,376,MOHALI,140301,IN,False,Adult
1,2,405-2183842-2225946,2183842,Women,29,2022-12-04,2022,December,Delivered,Ajio,...,Set,L,1,INR,1449,GURUGRAM,122002,IN,False,Adult
2,3,171-1641533-8921966,1641533,Women,67,2022-12-04,2022,December,Delivered,Myntra,...,Set,S,1,INR,453,KOLKATA,700029,IN,False,Senior


In [27]:
transformed_data = assistant.clean(transformed_data, prompt="Drop the following columns from the dataset currency, ship-country, index")

[INFO] [CLEAN] Prompt: Drop the following columns from the dataset currency, ship-country, index
[DEBUG] [CLEAN] Generated Code:
 df.drop(['currency', 'ship-country', 'index'], axis=1, inplace=True)


In [28]:
transformed_data.head(2)

Unnamed: 0,Order ID,Cust ID,Gender,Age,Date,Order_Year,Order_Month,Status,Channel,SKU,Category,Size,Qty,Amount,ship-city,ship-postal-code,B2B,Age_Group
0,171-1029312-3038738,1029312,Women,44,2022-12-04,2022,December,Delivered,Myntra,JNE1233-BLUE-KR-031-XXL,kurta,XXL,1,376,MOHALI,140301,False,Adult
1,405-2183842-2225946,2183842,Women,29,2022-12-04,2022,December,Delivered,Ajio,SET414-KR-NP-L,Set,L,1,1449,GURUGRAM,122002,False,Adult


In [29]:
scalable_data_profiler(transformed_data)


📊 BASIC INFO
- Shape: (31047, 18)
- Columns: ['Order ID', 'Cust ID', 'Gender', 'Age', 'Date', 'Order_Year', 'Order_Month', 'Status', 'Channel ', 'SKU']...
- Memory Usage: ~3.73 MB

🔍 MISSING VALUES (%):
Series([], dtype: float64) 

🧮 UNIQUE VALUES (Top 10 Columns):
Order ID            28471
Cust ID             28437
SKU                  5287
ship-postal-code     4958
ship-city            2603
Amount                769
Age                    61
Date                   36
Order_Month            12
Size                   11
dtype: int64 

📈 NUMERICAL STATS (Sampled if > sample_size):
                    count          mean           std       min         25%  \
Cust ID           10000.0  4.937902e+06  2.921340e+06    3290.0  2384306.00   
Age               10000.0  3.929980e+01  1.494150e+01      18.0       27.00   
Order_Year        10000.0  2.022000e+03  0.000000e+00    2022.0     2022.00   
Qty               10000.0  1.007400e+00  1.026952e-01       1.0        1.00   
Amount            

BY USING transform()

In [30]:
transformed_data = assistant.transform(transformed_data, prompt="Encode the 'gender' column using LabelEncoder")

[INFO] [TRANSFORM] Prompt: Encode the 'gender' column using LabelEncoder


[DEBUG] [TRANSFORM] Generated Code:
 from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])


In [31]:
transformed_data.head(2)
transformed_data['Gender'].unique()

array([1, 0])

ADVANCE DATA TRANSFORMATION SOLVED

1) How can we identify customers who might churn based on inactivity and declining purchase behavior?

To anticipate churn, identify customers who:

Placed their last order more than 90 days ago, AND

Have a decreasing order frequency compared to their historical pattern.

Flag these customers with a new column called Churn_Risk = True

In [32]:
transformed_data = assistant.transform(transformed_data, prompt="Create a new column 'Churn_Risk'.For each 'Cust ID' , check the number of days since their last order using the Date column. If their last order was more than 90 days ago, and their monthly order frequency is declining (e.g., more orders earlier in the year than recent months), set Churn_Risk = True.")

[INFO] [TRANSFORM] Prompt: Create a new column 'Churn_Risk'.For each 'Cust ID' , check the number of days since their last order using the Date column. If their last order was more than 90 days ago, and their monthly order frequency is declining (e.g., more orders earlier in the year than recent months), set Churn_Risk = True.
[DEBUG] [TRANSFORM] Generated Code:
 df['Date'] = pd.to_datetime(df['Date'])
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year
df['Order_Count'] = df.groupby(['Cust ID', 'Year', 'Month']).cumcount() + 1
df['Last_Order_Date'] = df.groupby('Cust ID')['Date'].transform('max')
df['Days_Since_Last_Order'] = (pd.to_datetime('today') - df['Last_Order_Date']).dt.days
df['Monthly_Order_Frequency'] = df.groupby(['Cust ID', 'Year', 'Month'])['Order ID'].transform('count')
df['Churn_Risk'] = (df['Days_Since_Last_Order'] > 90) & (df.groupby('Cust ID')['Monthly_Order_Frequency'].transform(lambda x: x.rolling(window=3).mean().shift().fillna(x.mean()) > x))


In [33]:
transformed_data.head(2)

Unnamed: 0,Order ID,Cust ID,Gender,Age,Date,Order_Year,Order_Month,Status,Channel,SKU,...,ship-postal-code,B2B,Age_Group,Month,Year,Order_Count,Last_Order_Date,Days_Since_Last_Order,Monthly_Order_Frequency,Churn_Risk
0,171-1029312-3038738,1029312,1,44,2022-12-04,2022,December,Delivered,Myntra,JNE1233-BLUE-KR-031-XXL,...,140301,False,Adult,12,2022,1,2022-12-04,935,1,False
1,405-2183842-2225946,2183842,1,29,2022-12-04,2022,December,Delivered,Ajio,SET414-KR-NP-L,...,122002,False,Adult,12,2022,1,2022-12-04,935,1,False


2 How can we identify repeat customers and understand how frequently they place orders?



Problem Statement:

For each customer (Cust ID) in the dataset:
Determine whether they are a repeat customer (i.e., placed more than one order).
For repeat customers, calculate the average number of days between their purchases using the Date column.

In [34]:
transformed_data = assistant.transform(transformed_data, prompt="Create a new column _Repeat_Customer by checking if each Cust ID appears more than once in the dataset. Then, create another column called Avg_Purchase_Interval that contains the average number of days between purchases for each customer using the Date column. For customers with only one order, set Avg_Purchase_Interval to NaN.")

[INFO] [TRANSFORM] Prompt: Create a new column _Repeat_Customer by checking if each Cust ID appears more than once in the dataset. Then, create another column called Avg_Purchase_Interval that contains the average number of days between purchases for each customer using the Date column. For customers with only one order, set Avg_Purchase_Interval to NaN.


[DEBUG] [TRANSFORM] Generated Code:
 df['_Repeat_Customer'] = df.groupby('Cust ID')['Cust ID'].transform('count') > 1
df['Avg_Purchase_Interval'] = df.groupby('Cust ID')['Date'].transform(lambda x: x.diff().mean() if len(x) > 1 else np.nan)
[ERROR] Code execution failed: name 'np' is not defined
[DEBUG] Failed code was:
 df['_Repeat_Customer'] = df.groupby('Cust ID')['Cust ID'].transform('count') > 1
df['Avg_Purchase_Interval'] = df.groupby('Cust ID')['Date'].transform(lambda x: x.diff().mean() if len(x) > 1 else np.nan)


In [35]:
scalable_data_profiler(transformed_data)

📊 BASIC INFO
- Shape: (31047, 25)
- Columns: ['Order ID', 'Cust ID', 'Gender', 'Age', 'Date', 'Order_Year', 'Order_Month', 'Status', 'Channel ', 'SKU']...
- Memory Usage: ~4.94 MB

🔍 MISSING VALUES (%):
Series([], dtype: float64) 

🧮 UNIQUE VALUES (Top 10 Columns):
Order ID                 28471
Cust ID                  28437
SKU                       5287
ship-postal-code          4958
ship-city                 2603
Amount                     769
Age                         61
Date                        36
Last_Order_Date             36
Days_Since_Last_Order       36
dtype: int64 

📈 NUMERICAL STATS (Sampled if > sample_size):
                           count          mean           std       min  \
Cust ID                  10000.0  4.946073e+06  2.895061e+06     895.0   
Gender                   10000.0  6.898000e-01  4.625985e-01       0.0   
Age                      10000.0  3.964220e+01  1.511247e+01      18.0   
Order_Year               10000.0  2.022000e+03  0.000000e+00    202

DATA VISUALIZATION

How does the number of orders vary across different months?

Problem Statement
You want to analyze the monthly trend of orders to understand business seasonality, peak sales periods, or dips in activity. This helps in forecasting, campaign planning, and resource allocation.

Plot a bar chart showing the total number of orders per Order_Month.

In [49]:
assistant.visualize(transformed_data, prompt="Create a bar chart showing the number of orders per Order_Month. Use the Order_Month column on the x-axis and the count of Order ID on the y-axis.Sort the months chronologically (not alphabetically), and label the axes and title clearly.", name="revenue_bar")

[INFO] [VISUALIZE] Prompt: Create a bar chart showing the number of orders per Order_Month. Use the Order_Month column on the x-axis and the count of Order ID on the y-axis.Sort the months chronologically (not alphabetically), and label the axes and title clearly.
[DEBUG] [VISUALIZE] Code:
 fig = go.Figure(data=[go.Bar(x=df['Order_Month'].unique(), y=df['Order_Month'].value_counts().sort_index())])
fig.update_layout(title='Number of Orders per Month', xaxis_title='Month', yaxis_title='Count of Orders', xaxis=dict(type='category', categoryorder='array', categoryarray=list(range(1, 13))))


In [50]:
len(assistant._charts)


1

In [51]:
for chart in assistant._charts:
    print("Chart Name:", chart["name"])
    print("Prompt Used:", chart["prompt"])
    print("---")


Chart Name: revenue_bar
Prompt Used: Create a bar chart showing the number of orders per Order_Month. Use the Order_Month column on the x-axis and the count of Order ID on the y-axis.Sort the months chronologically (not alphabetically), and label the axes and title clearly.
---


In [None]:
del assistant._charts[0:1:2]  # Deletes the first, second and third saved chart


In [48]:
for chart in assistant._charts:
    print("Chart Name:", chart["name"])
    print("Prompt Used:", chart["prompt"])
    print("---")
