# 1️⃣ Column Renaming & Restructuring



In [None]:
import pandas as pd
df = pd.read_csv('/content/customer_transactions.csv')

# convert all column names to snake case
df.columns = df.columns.str.lower().str.replace(' ($)', '').str.replace(' (%)', '').str.replace(' ', '_')
df.columns

Index(['transaction_id', 'customer_id', 'product_category', 'product_name',
       'quantity', 'unit_price', 'total_price', 'discount_applied',
       'purchase_date', 'payment_method'],
      dtype='object')

In [None]:
# converting the date column to datetime format
df['purchase_date'] = pd.to_datetime(
    df['purchase_date'], format='mixed', dayfirst= False
    )

# converted payment methods to snakecase
df['payment_method'] = df['payment_method'].str.lower().str.replace(' ', '_')
df['payment_method'].unique()

array(['bank_transfer', 'credit_card', 'paypal'], dtype=object)

# 2️⃣ Creating & Modifying Columns

In [None]:
# Fill missing values in Total_Price by multiplying Quantity × Unit_Price.
new_total_price = df['quantity'] * df['unit_price']
df['total_price'].fillna(new_total_price, inplace=True)

# Fill missing values in Discount_Applied with 0%.
df['discount_applied'].fillna('0%', inplace=True)
df['discount_applied'] = df['discount_applied'].str.replace('%', '').astype(float)/100

# Create a new column Final_Price that applies the discount
df['final_price'] = df['total_price'] * (1-df['discount_applied'])

df['purchase_year'] = df['purchase_date'].dt.year
df['purchase_month'] = df['purchase_date'].dt.month_name()



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['total_price'].fillna(new_total_price, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['discount_applied'].fillna('0%', inplace=True)


# 3️⃣ Using `.apply()` & `lambda` Functions

In [None]:
# creating a purchase type column
df['purchase_type'] = df['quantity'].apply(
    lambda x: 'bulk' if x > 5 else 'single item'
    )

# categorizing unit price
df['price_category'] = df['unit_price'].apply(
    lambda price: 'Low' if price < 20 else ('Medium' if price <= 100 else 'High')
    )

# standardizing product category
df['product_category'] = df['product_category'].apply(lambda x: x.capitalize())

# flagging high value transactions
df['transaction_flag'] = df['final_price'].apply(lambda x: 'High Value' if x > 100 else 'Low Value')

In [None]:
df.head()

Unnamed: 0,transaction_id,customer_id,product_category,product_name,quantity,unit_price,total_price,discount_applied,purchase_date,payment_method,final_price,purchase_year,purchase_month,purchase_type,price_category
0,10001,2103,Toys,Action Figure,9,58.76,528.84,0.1,2023-07-16,bank_transfer,475.956,2023,July,bulk,Medium
1,10002,2436,Books,Non-Fiction,10,239.03,2390.3,0.05,2023-06-18,credit_card,2270.785,2023,June,bulk,High
2,10003,2861,Toys,Lego,5,340.18,1700.9,0.1,2024-02-07,bank_transfer,1530.81,2024,February,single item,High
3,10004,2271,Beauty,Skincare Set,7,437.88,3065.16,0.0,2024-11-12,bank_transfer,3065.16,2024,November,bulk,High
4,10005,2107,Beauty,Foundation,9,375.86,3382.74,0.0,2023-09-18,paypal,3382.74,2023,September,bulk,High


4️⃣ Reshaping Data (Pivot, Melt, Stack, Unstack)

In [86]:
# using the melt method
var_columns = ['unit_price', 'final_price', 'total_price']
id_columns = [col for col in df.columns if col not in var_columns]
melted_df = df.melt(
    id_vars=id_columns, value_vars=var_columns,
    var_name='price_type', value_name='price_value'
    )
melted_df.head()

Unnamed: 0,transaction_id,customer_id,product_category,product_name,quantity,discount_applied,purchase_date,payment_method,purchase_year,purchase_month,purchase_type,price_category,transaction_flag,price_type,price_value
0,10001,2103,Toys,Action Figure,9,0.1,2023-07-16,bank_transfer,2023,July,bulk,Medium,High Value,unit_price,58.76
1,10002,2436,Books,Non-Fiction,10,0.05,2023-06-18,credit_card,2023,June,bulk,High,High Value,unit_price,239.03
2,10003,2861,Toys,Lego,5,0.1,2024-02-07,bank_transfer,2024,February,single item,High,High Value,unit_price,340.18
3,10004,2271,Beauty,Skincare Set,7,0.0,2024-11-12,bank_transfer,2024,November,bulk,High,High Value,unit_price,437.88
4,10005,2107,Beauty,Foundation,9,0.0,2023-09-18,paypal,2023,September,bulk,High,High Value,unit_price,375.86


In [87]:
df.isna().sum()

Unnamed: 0,0
transaction_id,0
customer_id,0
product_category,0
product_name,0
quantity,0
unit_price,0
total_price,0
discount_applied,0
purchase_date,0
payment_method,0
