Task 1:Create a pandas DataFrame from the CSV and from Python lists/dicts.

In [1]:
import pandas as pd
import random
import numpy as np


df_sales = pd.read_csv(r"D:\python files\pandas\sample.csv")


# 2. Extract unique values 

store_ids = df_sales["store_id"].unique().tolist()
store_names = df_sales["store_name"].unique().tolist()
product_ids = df_sales["product_id"].unique().tolist()
product_names = df_sales["product_name"].unique().tolist()

promo_codes = df_sales["promo_code"].fillna("NONE").unique().tolist()


# 3. Generate 1000 new synthetic rows

new_rows = []

start_txn = df_sales["transaction_id"].max() + 1

for i in range(1000):
    
    # Pick random store
    sid = random.choice(store_ids)
    
    # Pick random product
    pname = random.choice(product_names)
    pid = df_sales[df_sales["product_name"] == pname]["product_id"].iloc[0]
    
    qty = random.randint(1, 10)
    price = round(random.uniform(0.50, 15.00), 2)
    
    # Create a new random row following your schema
    row = {
        "transaction_id": start_txn + i,
        "store_id": sid,
        "store_name": random.choice(store_names),
        "date": f"2025-10-{random.randint(1, 28):02d}",
        "product_id": pid,
        "product_name": pname,
        "quantity": qty,
        "unit_price": price,
        "total": round(qty * price, 2),
        "customer_id": random.randint(2001, 5000),
        "promo_code": random.choice(promo_codes)
    }
    
    new_rows.append(row)


df_random = pd.DataFrame(new_rows)


# 5. Combine old + new rows

df_final = pd.concat([df_sales, df_random], ignore_index=True)


# 6. Save expanded dataset

df_final.to_csv(r"D:\python files\pandas\sample_expanded.csv", index=False)

print("Original rows:", len(df_sales))
print("New rows added: 1000")
print("Final dataset rows:", len(df_final))


Original rows: 10
New rows added: 1000
Final dataset rows: 1010


In [2]:
df_sales=df_final
df_sales["date"] = pd.to_datetime(df_sales["date"], errors="coerce")




In [3]:
sales_hyd = df_sales[df_sales["store_name"] == "Downtown"]
print(sales_hyd.head())

    transaction_id  store_id store_name       date  product_id product_name  \
0             1001         1   Downtown 2025-10-01         501     Notebook   
1             1002         1   Downtown 2025-10-01         502          Pen   
4             1005         1   Downtown 2025-10-03         504       Folder   
7             1008         1   Downtown 2025-10-04         502          Pen   
12            1013         1   Downtown 2025-10-20         501     Notebook   

    quantity  unit_price  total  customer_id promo_code  
0          2        3.50   7.00       2001.0        NaN  
1          5        0.80   4.00       2002.0     DISC10  
4          3        1.20   3.60       2004.0     DISC10  
7          2        0.80   1.60       2002.0        NaN  
12         7        1.71  11.97       4989.0      DISC5  


In [4]:
sales_store_1001 = df_sales[df_sales["store_id"] == 1]
print(sales_store_1001)


      transaction_id  store_id store_name       date  product_id product_name  \
0               1001         1   Downtown 2025-10-01         501     Notebook   
1               1002         1   Downtown 2025-10-01         502          Pen   
4               1005         1   Downtown 2025-10-03         504       Folder   
7               1008         1   Downtown 2025-10-04         502          Pen   
12              1013         1   Downtown 2025-10-20         501     Notebook   
...              ...       ...        ...        ...         ...          ...   
988             1989         1   Downtown 2025-10-19         501     Notebook   
995             1996         1   Downtown 2025-10-18         506  Highlighter   
1001            2002         1     Suburb 2025-10-23         501     Notebook   
1002            2003         1   Downtown 2025-10-24         505       Marker   
1008            2009         1     Suburb 2025-10-16         503      Stapler   

      quantity  unit_price 

In [5]:
sales_date_range = df_sales[
    (df_sales["date"] >= "2025-10-02") &
    (df_sales["date"] <= "2025-10-05")
]

print(sales_date_range.head())

   transaction_id  store_id store_name       date  product_id product_name  \
2            1003         2     Suburb 2025-10-02         501     Notebook   
3            1004         2     Suburb 2025-10-02         503      Stapler   
4            1005         1   Downtown 2025-10-03         504       Folder   
5            1006         3    Airport 2025-10-03         502          Pen   
6            1007         3    Airport 2025-10-04         505       Marker   

   quantity  unit_price  total  customer_id promo_code  
2         1        3.50   3.50       2003.0        NaN  
3         1        5.25   5.25          NaN        NaN  
4         3        1.20   3.60       2004.0     DISC10  
5        10        0.80   8.00       2005.0        NaN  
6         2        1.75   3.50       2006.0        NaN  


In [6]:
high_value_sales = df_sales[df_sales["total"] > 5.00]
print(high_value_sales)


      transaction_id  store_id store_name       date  product_id product_name  \
0               1001         1   Downtown 2025-10-01         501     Notebook   
3               1004         2     Suburb 2025-10-02         503      Stapler   
5               1006         3    Airport 2025-10-03         502          Pen   
10              1011         2     Suburb 2025-10-19         503      Stapler   
11              1012         2    Airport 2025-10-12         504       Folder   
...              ...       ...        ...        ...         ...          ...   
1005            2006         3   Downtown 2025-10-23         503      Stapler   
1006            2007         2    Airport 2025-10-23         505       Marker   
1007            2008         2    Airport 2025-10-01         501     Notebook   
1008            2009         1     Suburb 2025-10-16         503      Stapler   
1009            2010         2    Airport 2025-10-11         504       Folder   

      quantity  unit_price 

In [7]:
stats_summary = df_sales.describe()
print(stats_summary)


       transaction_id     store_id                           date  \
count     1010.000000  1010.000000                           1010   
mean      1505.500000     2.000000  2025-10-14 14:15:26.732673024   
min       1001.000000     1.000000            2025-10-01 00:00:00   
25%       1253.250000     1.000000            2025-10-08 00:00:00   
50%       1505.500000     2.000000            2025-10-15 00:00:00   
75%       1757.750000     3.000000            2025-10-21 00:00:00   
max       2010.000000     3.000000            2025-10-28 00:00:00   
std        291.706188     0.809997                            NaN   

        product_id     quantity   unit_price        total  customer_id  
count  1010.000000  1010.000000  1010.000000  1010.000000  1009.000000  
mean    503.388119     5.533663     7.494426    41.618673  3562.295342  
min     501.000000     1.000000     0.530000     0.530000  2001.000000  
25%     502.000000     3.000000     3.742500    13.435000  2805.000000  
50%     503.0

In [8]:
#Total Sales Amount by Store (Store-wise Revenue)
store_totals = df_sales.groupby("store_name")["total"].sum().reset_index()
print(store_totals)


  store_name     total
0    Airport  14752.36
1   Downtown  14327.69
2     Suburb  12954.81


In [9]:
#Average Basket Value (average spend per transaction)

#Basket value = total amount spent in one transaction

avg_basket = df_sales["total"].mean()
print("Average basket value:", avg_basket)

Average basket value: 41.618673267326734


In [10]:
#Average Basket Value by Store
avg_basket_store = df_sales.groupby("store_name")["total"].mean().reset_index()
print(avg_basket_store)

  store_name      total
0    Airport  43.389294
1   Downtown  42.641935
2     Suburb  38.786856


Task4: Perform simple cleaning (handle missing values, fix data types, drop duplicates)

In [11]:
#convert date to datetime
df_sales["date"] = pd.to_datetime(df_sales["date"], errors="coerce")

In [12]:
#Convert numeric fields correctly
df_sales["quantity"] = pd.to_numeric(df_sales["quantity"], errors="coerce")
df_sales["unit_price"] = pd.to_numeric(df_sales["unit_price"], errors="coerce")
df_sales["total"] = pd.to_numeric(df_sales["total"], errors="coerce")
df_sales["customer_id"] = pd.to_numeric(df_sales["customer_id"], errors="coerce")

In [13]:
#Missing quantity → assume minimum purchase = 1
df_sales["quantity"] = df_sales["quantity"].fillna(1)

In [14]:
#Missing unit_price → replace with median
df_sales["unit_price"] = df_sales["unit_price"].fillna(df_sales["unit_price"].median())

In [15]:
#Missing totals → recompute using quantity × unit_price
df_sales["total"] = df_sales["quantity"] * df_sales["unit_price"]

In [16]:
#Missing promo_code → replace with “NONE”
df_sales["promo_code"] = df_sales["promo_code"].fillna("NONE")

In [17]:
#Missing product_name / store_name → replace with “Unknown”
df_sales["product_name"] = df_sales["product_name"].fillna("Unknown Product")
df_sales["store_name"] = df_sales["store_name"].fillna("Unknown Store")

In [18]:
#Drop duplicates based on transaction_id:
df_sales.drop_duplicates(subset=["transaction_id"], inplace=True)


In [19]:
#Reset Index After Cleaning
df_sales.reset_index(drop=True, inplace=True)

In [20]:
#Print Summary After Cleaning
print("Cleaned dataset shape:", df_sales.shape)
print(df_sales.head())

Cleaned dataset shape: (1010, 11)
   transaction_id  store_id store_name       date  product_id product_name  \
0            1001         1   Downtown 2025-10-01         501     Notebook   
1            1002         1   Downtown 2025-10-01         502          Pen   
2            1003         2     Suburb 2025-10-02         501     Notebook   
3            1004         2     Suburb 2025-10-02         503      Stapler   
4            1005         1   Downtown 2025-10-03         504       Folder   

   quantity  unit_price  total  customer_id promo_code  
0         2        3.50   7.00       2001.0       NONE  
1         5        0.80   4.00       2002.0     DISC10  
2         1        3.50   3.50       2003.0       NONE  
3         1        5.25   5.25          NaN       NONE  
4         3        1.20   3.60       2004.0     DISC10  


Task 5:Build a mini ETL pipeline: read CSV → clean & transform → output JSON

In [21]:
def extract(csv_path):
    df = pd.read_csv(csv_path)
    return df




In [22]:
def transform(df):
    df = df.copy()

    # Fix data types
    df["date"] = pd.to_datetime(df["date"], errors="coerce")

    numeric_cols = ["quantity", "unit_price", "total", "customer_id"]
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # Handle missing values
    df["quantity"] = df["quantity"].fillna(1)
    df["unit_price"] = df["unit_price"].fillna(df["unit_price"].median())
    df["promo_code"] = df["promo_code"].fillna("NONE")
    df["product_name"] = df["product_name"].fillna("Unknown Product")
    df["store_name"] = df["store_name"].fillna("Unknown Store")

    # Recompute totals
    df["computed_total"] = df["quantity"] * df["unit_price"]

    # High-value tag
    df["is_high_value"] = df["computed_total"] > 1000

    # Drop duplicates
    df.drop_duplicates(subset=["transaction_id"], inplace=True)

    df.reset_index(drop=True, inplace=True)

    return df




In [23]:
def load(df, output_path="clean_sales.json"):
    df.to_json(output_path, orient="records", indent=4, date_format="iso")
    print(f"JSON file created: {output_path}")


In [24]:
def run_etl():
    # Step 1: Extract
    df = extract("D:\python files\pandas\sample_expanded.csv")

    # Step 2: Transform
    clean_df = transform(df)

    # Step 3: Load
    load(clean_df, "D:\python files\pandas\clean_sales_extended.json")

    return clean_df

# Run the complete pipeline
final_df = run_etl()


JSON file created: D:\python files\pandas\clean_sales_extended.json


Task6:Complete a short hands-on test and an assignment to extend the pipeline.

In [25]:
#Which product was sold the most (total quantity)?
df_sales.groupby("product_name")["quantity"].sum().sort_values(ascending=False).head(1)


product_name
Stapler    1041
Name: quantity, dtype: int64

In [26]:
#How many transactions used a promo code?
df_sales[df_sales["promo_code"] != "NONE"].shape[0]

660

In [36]:
# Create the promo flag
df_sales["promo_used_flag"] = np.where(df_sales["promo_code"] != "NONE", 1, 0)

# Create the unique Customer Engagement Score(created by me-random creation)
df_sales["engagement_score"] = (
    df_sales["quantity"] * 0.4 +
    df_sales["total"] * 0.5 +
    df_sales["promo_used_flag"] * 2
).round(2)

print(" DataFrame with new Customer Engagement Score column:")
print(df_sales[["quantity", "total", "promo_code", "engagement_score"]].head())

# Save the updated file
df_sales.to_csv(r"D:\python files\pandas\sample_extended_with_CES.csv", index=False)

print("\n Saved: sample_extended_with_CES.csv")


 DataFrame with new Customer Engagement Score column:
   quantity  total promo_code  engagement_score
0         2   7.00       NONE              4.30
1         5   4.00     DISC10              6.00
2         1   3.50       NONE              2.15
3         1   5.25       NONE              3.02
4         3   3.60     DISC10              5.00

 Saved: sample_extended_with_CES.csv
