In [1]:
import sys
import os

# Add parent directory (../) to Python's search path
sys.path.append(os.path.abspath(".."))

In [2]:
import numpy as np
import pandas as pd


DATA IMPORT

In [3]:
raw_data = pd.read_excel('/workspaces/GEN-AI-DATA-ANALYST/data/e_commerce.xlsx')

In [4]:
raw_data.head()

Unnamed: 0,index,Order ID,Cust ID,Gender,Age,Date,Status,Channel,SKU,Category,Size,Qty,currency,Amount,ship-city,ship-postal-code,ship-country,B2B
0,1,171-1029312-3038738,1029312,Women,44,2022-12-04,Delivered,Myntra,JNE1233-BLUE-KR-031-XXL,kurta,XXL,1,INR,376,MOHALI,140301,IN,False
1,2,405-2183842-2225946,2183842,Women,29,2022-12-04,Delivered,Ajio,SET414-KR-NP-L,Set,L,1,INR,1449,GURUGRAM,122002,IN,False
2,3,171-1641533-8921966,1641533,Women,67,2022-12-04,Delivered,Myntra,SET261-KR-PP-S,Set,S,1,INR,453,KOLKATA,700029,IN,False
3,4,404-7490807-6300351,7490807,Women,20,2022-12-04,Delivered,Amazon,SET110-KR-PP-M,Set,M,1,INR,729,THANJAVUR,613007,IN,False
4,5,403-9293516-4577154,9293516,Women,62,2022-12-04,Delivered,Myntra,JNE2294-KR-A-XXL,kurta,XXL,1,INR,544,GURUGRAM,122001,IN,False


In [5]:
def scalable_data_profiler(df, sample_size=10000, top_n=5):
    print("📊 BASIC INFO")
    print(f"- Shape: {df.shape}")
    print(f"- Columns: {list(df.columns[:10])}...")  # show only first 10
    print(f"- Memory Usage: ~{df.memory_usage(deep=False).sum() / 1024**2:.2f} MB\n")

    print("🔍 MISSING VALUES (%):")
    missing = df.isnull().mean() * 100
    print(missing[missing > 0].sort_values(ascending=False).head(10), "\n")

    print("🧮 UNIQUE VALUES (Top 10 Columns):")
    unique_counts = df.nunique().sort_values(ascending=False).head(10)
    print(unique_counts, "\n")

    print("📈 NUMERICAL STATS (Sampled if > sample_size):")
    df_sample = df.sample(sample_size) if len(df) > sample_size else df
    print(df_sample.describe(include=[np.number]).T, "\n")

    print("🗂️ CATEGORICAL PREVIEW:")
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols[:10]:  # process only first 10 for speed
        print(f"\n🔹 Column: {col}")
        print(f" - Unique: {df[col].nunique()}")
        print(f" - Top {top_n}:\n{df[col].astype(str).value_counts(dropna=False).head(top_n)}")

    print("\n📊 CORRELATION MATRIX (Top Pairs Only):")
    num_cols = df.select_dtypes(include=np.number)
    if num_cols.shape[1] >= 2:
        corr = num_cols.corr().abs().unstack().sort_values(ascending=False)
        corr = corr[corr < 1.0].drop_duplicates().head(10)
        print(corr)
    else:
        print(" - Not enough numerical columns for correlation.")

In [6]:
scalable_data_profiler(raw_data)

📊 BASIC INFO
- Shape: (31047, 18)
- Columns: ['index', 'Order ID', 'Cust ID', 'Gender', 'Age', 'Date', 'Status', 'Channel ', 'SKU', 'Category']...
- Memory Usage: ~4.06 MB

🔍 MISSING VALUES (%):
Series([], dtype: float64) 

🧮 UNIQUE VALUES (Top 10 Columns):
index               31047
Order ID            28471
Cust ID             28437
SKU                  5287
ship-postal-code     4958
ship-city            2603
Amount                769
Age                    61
Date                   36
Size                   11
dtype: int64 

📈 NUMERICAL STATS (Sampled if > sample_size):
                    count          mean           std       min         25%  \
index             10000.0  1.561892e+04  9.001360e+03       4.0     7837.75   
Cust ID           10000.0  4.964314e+06  2.899815e+06     895.0  2447515.00   
Age               10000.0  3.936940e+01  1.511619e+01      18.0       27.00   
Amount            10000.0  6.841969e+02  2.677823e+02     229.0      487.00   
ship-postal-code  10000.0 

In [7]:
category_summary = {}

for col in raw_data.columns:
    unique_vals = raw_data[col].dropna().unique()
    num_unique = len(unique_vals)
    category_summary[col] = {
        "Num_Unique": num_unique,
        "Sample_Values": unique_vals[:10]  # show only first 10 for brevity
    }

# Display the results
for col, info in category_summary.items():
    print(f"\nColumn: {col}")
    print(f" - Number of Unique Categories: {info['Num_Unique']}")
    print(f" - Sample Categories: {info['Sample_Values']}")


Column: index
 - Number of Unique Categories: 31047
 - Sample Categories: [ 1  2  3  4  5  6  7  8  9 10]

Column: Order ID
 - Number of Unique Categories: 28471
 - Sample Categories: ['171-1029312-3038738' '405-2183842-2225946' '171-1641533-8921966'
 '404-7490807-6300351' '403-9293516-4577154' '407-1298130-0368305'
 '171-5561216-3398711' '408-2935263-2935550' '404-2648970-9042715'
 '408-0265357-4939534']

Column: Cust ID
 - Number of Unique Categories: 28437
 - Sample Categories: [1029312 2183842 1641533 7490807 9293516 1298130 5561216 2935263 2648970
  265357]

Column: Gender
 - Number of Unique Categories: 4
 - Sample Categories: ['Women' 'Men' 'W' 'M']

Column: Age
 - Number of Unique Categories: 61
 - Sample Categories: [44 29 67 20 62 49 23 70 75 43]

Column: Date
 - Number of Unique Categories: 36
 - Sample Categories: <DatetimeArray>
['2022-12-04 00:00:00', '2022-11-04 00:00:00', '2022-10-04 00:00:00',
 '2022-09-04 00:00:00', '2022-08-04 00:00:00', '2022-07-04 00:00:00',
 '202

2.0 DATA PREPROCESSING

In [8]:
from genai_dataanalyst.assistant import AnalystAssistant
assistant = AnalystAssistant()

2.1 Solves Categorical Problem

If You standardized the Gender column to ensure it contains only 'Men' or 'Women' by replacing abbreviations: 'W' was changed to 'Women' and 'M' to 'Men'. This makes gender data clean and consistent for analysis.

In [9]:
new_data = assistant.clean(raw_data, prompt="Standardize the 'Gender' column so that it only contains 'Men' or 'Women'. Replace 'W' with 'Women' and 'M' with 'Men'.")

[INFO] [CLEAN] Prompt: Standardize the 'Gender' column so that it only contains 'Men' or 'Women'. Replace 'W' with 'Women' and 'M' with 'Men'.
[DEBUG] [CLEAN] Generated Code:
 df['Gender'] = df['Gender'].replace({'W': 'Women', 'M': 'Men'})


In [10]:
new_data['Gender'].unique() # New data

array(['Women', 'Men'], dtype=object)

In [11]:
raw_data['Gender'].unique()  # Old data

array(['Women', 'Men', 'W', 'M'], dtype=object)

If You cleaned the Qty column by converting text values like 'One' and 'Two' into their numeric forms 1 and 2, so the entire column now contains only numbers. This makes quantity data consistent and ready for calculations

In [12]:
new_data = assistant.clean(new_data, prompt="Standadize all textual quantity values in the Qty column (e.g., 'One', 'Two') to their numeric equivalents (e.g., 1, 2). Ensure the column is entirely numeric after conversion so datatype should be int ")

[INFO] [CLEAN] Prompt: Standadize all textual quantity values in the Qty column (e.g., 'One', 'Two') to their numeric equivalents (e.g., 1, 2). Ensure the column is entirely numeric after conversion so datatype should be int 
[DEBUG] [CLEAN] Generated Code:
 qty_dict = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5, 'Six': 6, 'Seven': 7, 'Eight': 8, 'Nine': 9, 'Ten': 10}
df['Qty'] = df['Qty'].map(qty_dict).fillna(df['Qty']).astype(int)




In [13]:
new_data['Qty'].unique() # New data


array([1, 2, 4, 3, 5])

In [14]:
raw_data['Qty'].unique()  # Old data

array([1, 'One', 2, 4, 3, 'Two', 5], dtype=object)

In [15]:
new_data.dtypes

index                        int64
Order ID                    object
Cust ID                      int64
Gender                      object
Age                          int64
Date                datetime64[ns]
Status                      object
Channel                     object
SKU                         object
Category                    object
Size                        object
Qty                          int64
currency                    object
Amount                       int64
ship-city                   object
ship-postal-code             int64
ship-country                object
B2B                           bool
dtype: object

3.0 DATA TRANSFORMATION


3.1 Playing With Columns.

I can extract the month and year from the Date column to create two new columns: Order_Month and Order_Year. This makes it easier to analyze sales trends over time, like comparing orders across different months or years.

In [16]:
transformed_data = assistant.clean(new_data, prompt="From the Date column, extract the month and year into two new columns: Order_Month (as full month name) and Order_Year (as 4-digit year).Finally place both the newly created columns after Date column")

[INFO] [CLEAN] Prompt: From the Date column, extract the month and year into two new columns: Order_Month (as full month name) and Order_Year (as 4-digit year).Finally place both the newly created columns after Date column
[DEBUG] [CLEAN] Generated Code:
 df[['Order_Month', 'Order_Year']] = df['Date'].dt.to_period('M').dt.strftime('%B %Y').str.split(' ', expand=True)
df = df[['index', 'Order ID', 'Cust ID', 'Gender', 'Age', 'Date', 'Order_Month', 'Order_Year', 'Status', 'Channel ', 'SKU', 'Category', 'Size', 'Qty', 'currency', 'Amount', 'ship-city', 'ship-postal-code', 'ship-country', 'B2B']]


In [17]:
transformed_data.head(3)

Unnamed: 0,index,Order ID,Cust ID,Gender,Age,Date,Order_Month,Order_Year,Status,Channel,SKU,Category,Size,Qty,currency,Amount,ship-city,ship-postal-code,ship-country,B2B
0,1,171-1029312-3038738,1029312,Women,44,2022-12-04,December,2022,Delivered,Myntra,JNE1233-BLUE-KR-031-XXL,kurta,XXL,1,INR,376,MOHALI,140301,IN,False
1,2,405-2183842-2225946,2183842,Women,29,2022-12-04,December,2022,Delivered,Ajio,SET414-KR-NP-L,Set,L,1,INR,1449,GURUGRAM,122002,IN,False
2,3,171-1641533-8921966,1641533,Women,67,2022-12-04,December,2022,Delivered,Myntra,SET261-KR-PP-S,Set,S,1,INR,453,KOLKATA,700029,IN,False



Suppose i want to create a new column called Age_Group that classifies each person based on their age: anyone under 20 is labeled a Teenager, those between 20 and 49 are labeled Adult, and those 50 or older are labeled Senior. This helps group customers into age based categories for better analysis.

In [18]:
transformed_data = assistant.clean(transformed_data, prompt="Create a new column of 'Age' column named 'Age_Group'.Place the 'Age_Group' column besides 'Age' column  based on the Age column: If Age < 20 → Teenager , If Age ≥ 20 and < 50 → Adult , If Age ≥ 50 → Senior")

[INFO] [CLEAN] Prompt: Create a new column of 'Age' column named 'Age_Group'.Place the 'Age_Group' column besides 'Age' column  based on the Age column: If Age < 20 → Teenager , If Age ≥ 20 and < 50 → Adult , If Age ≥ 50 → Senior
[DEBUG] [CLEAN] Generated Code:
 df['Age_Group'] = np.select([df['Age'] < 20, (df['Age'] >= 20) & (df['Age'] < 50), df['Age'] >= 50], ['Teenager', 'Adult', 'Senior'])
cols = df.columns.tolist()
cols.insert(cols.index('Age')+1, 'Age_Group')
df = df[cols]
[ERROR] Code execution failed: Choicelist and default value do not have a common dtype: The DType <class 'numpy.dtypes._PyLongDType'> could not be promoted by <class 'numpy.dtypes.StrDType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.StrDType'>, <class 'numpy.dtypes.StrDType'>, <class 'numpy.dtypes.StrDType'>, <class 'numpy.dtypes._PyLongDType'>)
[DEBUG] Failed c

In [19]:
transformed_data.head(3)

Unnamed: 0,index,Order ID,Cust ID,Gender,Age,Date,Order_Month,Order_Year,Status,Channel,SKU,Category,Size,Qty,currency,Amount,ship-city,ship-postal-code,ship-country,B2B
0,1,171-1029312-3038738,1029312,Women,44,2022-12-04,December,2022,Delivered,Myntra,JNE1233-BLUE-KR-031-XXL,kurta,XXL,1,INR,376,MOHALI,140301,IN,False
1,2,405-2183842-2225946,2183842,Women,29,2022-12-04,December,2022,Delivered,Ajio,SET414-KR-NP-L,Set,L,1,INR,1449,GURUGRAM,122002,IN,False
2,3,171-1641533-8921966,1641533,Women,67,2022-12-04,December,2022,Delivered,Myntra,SET261-KR-PP-S,Set,S,1,INR,453,KOLKATA,700029,IN,False


In [20]:
transformed_data = assistant.clean(transformed_data, prompt="Drop the following columns from the dataset currency, ship-country, index")

[INFO] [CLEAN] Prompt: Drop the following columns from the dataset currency, ship-country, index
[DEBUG] [CLEAN] Generated Code:
 df.drop(['currency', 'ship-country', 'index'], axis=1, inplace=True)


In [21]:
transformed_data.head(2)

Unnamed: 0,Order ID,Cust ID,Gender,Age,Date,Order_Month,Order_Year,Status,Channel,SKU,Category,Size,Qty,Amount,ship-city,ship-postal-code,B2B
0,171-1029312-3038738,1029312,Women,44,2022-12-04,December,2022,Delivered,Myntra,JNE1233-BLUE-KR-031-XXL,kurta,XXL,1,376,MOHALI,140301,False
1,405-2183842-2225946,2183842,Women,29,2022-12-04,December,2022,Delivered,Ajio,SET414-KR-NP-L,Set,L,1,1449,GURUGRAM,122002,False


In [22]:
category_summary_transformed = {}

for col in transformed_data.columns:
    unique_vals = transformed_data[col].dropna().unique()
    num_unique = len(unique_vals)
    category_summary_transformed[col] = {
        "Num_Unique": num_unique,
        "Sample_Values": unique_vals[:10]  # show only first 10 for brevity
    }

# Display the results
for col, info in category_summary_transformed.items():
    print(f"\nColumn: {col}")
    print(f" - Number of Unique Categories: {info['Num_Unique']}")
    print(f" - Sample Categories: {info['Sample_Values']}")



Column: Order ID
 - Number of Unique Categories: 28471
 - Sample Categories: ['171-1029312-3038738' '405-2183842-2225946' '171-1641533-8921966'
 '404-7490807-6300351' '403-9293516-4577154' '407-1298130-0368305'
 '171-5561216-3398711' '408-2935263-2935550' '404-2648970-9042715'
 '408-0265357-4939534']

Column: Cust ID
 - Number of Unique Categories: 28437
 - Sample Categories: [1029312 2183842 1641533 7490807 9293516 1298130 5561216 2935263 2648970
  265357]

Column: Gender
 - Number of Unique Categories: 2
 - Sample Categories: ['Women' 'Men']

Column: Age
 - Number of Unique Categories: 61
 - Sample Categories: [44 29 67 20 62 49 23 70 75 43]

Column: Date
 - Number of Unique Categories: 36
 - Sample Categories: <DatetimeArray>
['2022-12-04 00:00:00', '2022-11-04 00:00:00', '2022-10-04 00:00:00',
 '2022-09-04 00:00:00', '2022-08-04 00:00:00', '2022-07-04 00:00:00',
 '2022-06-04 00:00:00', '2022-05-04 00:00:00', '2022-04-04 00:00:00',
 '2022-03-04 00:00:00']
Length: 10, dtype: datetim

In [23]:
transformed_data = assistant.transform(transformed_data, prompt="Encode the 'gender' column using LabelEncoder")

[INFO] [TRANSFORM] Prompt: Encode the 'gender' column using LabelEncoder
[DEBUG] [TRANSFORM] Generated Code:
 from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])


In [28]:
transformed_data.head(2)
transformed_data['Gender'].unique()

array([1, 0])

Complex Transformation Problem

You calculated how much each customer spent in total, then used that to create a new column called CLV_Tier that classifies customers as Low, Medium, High, or VIP. This helps identify your most valuable customers for marketing and sales strategies.

##### Why Complex?
Requires grouping and aggregation by Cust ID.

Involves merging back the tiered result into the original dataset.

Prepares data for segmented marketing, loyalty programs, and revenue prediction models.