
Step 1.1 focused on understanding the dataset structure and validating experiment-related fields. We confirmed that each row represents a user-level observation, verified the presence of only two test groups (Ad vs PSA), ensured conversion was binary, and standardized data types for accurate downstream analysis.

In [2]:
import os

import numpy as np
import pandas as pd


# Location of the experiment CSV. Set the MARKETING_AB_CSV environment variable
# to point at your own copy of the file; when it is unset, the original local
# path below is used, so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "MARKETING_AB_CSV",
    "/Users/parveenkumarsharma/Documents/Ecom_project/A_B_testing/marketing_AB.csv",
)

# Read the raw experiment data into the notebook-wide frame `df`.
df = pd.read_csv(DATA_PATH)


In [3]:
# ================================================================
# STEP 1.1 — DATA STRUCTURE UNDERSTANDING
# Objective:
#   - Understand dataset shape
#   - Validate schema & data types
#   - Confirm experiment structure
#   - Ensure dataset is ready for validation checks
# ================================================================


# -------------------------------
# 1️⃣ Import Required Libraries
# -------------------------------

import pandas as pd
import numpy as np

# Notebook-wide pandas display preferences:
#   - display.max_columns = None -> never elide columns when a frame is shown
#   - display.float_format       -> render floats with four decimal places
_DISPLAY_OPTIONS = {
    'display.max_columns': None,
    'display.float_format': '{:.4f}'.format,
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)


# -------------------------------
# 2️⃣ Load Dataset
# -------------------------------
# Nothing is read here: `df` is created by the pd.read_csv call in the
# preceding cell. This section only prints a status line.

load_status_message = "Dataset Loaded Successfully ‚úÖ"
print(load_status_message)


# -------------------------------
# 3️⃣ Basic Dataset Overview
# -------------------------------
# Report the frame's dimensions, its column labels, and a preview of the
# first five rows.

row_count, column_count = df.shape

print("\n================ DATASET SHAPE ================")
print(f"Number of rows (observations): {row_count}")
print(f"Number of columns (features): {column_count}")

column_names = df.columns.tolist()
print("\n================ COLUMN NAMES ================")
print(column_names)

preview_rows = df.head()  # head() defaults to the first 5 rows
print("\n================ FIRST 5 ROWS ================")
print(preview_rows)


# -------------------------------------------------------
# 4️⃣ Data Types & Non-Null Count (Schema Validation)
# -------------------------------------------------------

print("\n================ DATA TYPES & INFO ================")
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called on its own; wrapping it in print() would append a stray
# "None" line after the report.
df.info()


# -------------------------------------------------------
# 5️⃣ Unique Values in Key Experiment Columns
# -------------------------------------------------------
# List the distinct values found in the group-assignment column and in the
# outcome column.

print("\n================ UNIQUE VALUES CHECK ================")

# (label printed in the heading, column name in the raw frame)
key_columns = (
    ("Test Group", 'test group'),
    ("Converted", 'converted'),
)
for heading, column_name in key_columns:
    print(f"\n{heading} Unique Values:")
    print(df[column_name].unique())


# -------------------------------------------------------
# 6️⃣ Basic Statistical Summary
# -------------------------------------------------------
# describe() with default arguments.

numeric_summary = df.describe()

print("\n================ NUMERICAL SUMMARY ================")
print(numeric_summary)


# -------------------------------------------------------
# 7️⃣ Quick Logical Sanity Checks
# -------------------------------------------------------

print("\n================ QUICK SANITY CHECKS ================")

ads_per_user = df['total ads']
group_labels = df['test group']

# Rows whose ad count is below zero.
is_negative = ads_per_user < 0
negative_ads = df[is_negative]
print(f"Users with negative ad exposure: {len(negative_ads)}")

# Rows strictly above the 99th-percentile ad count (potential outliers).
p99_threshold = ads_per_user.quantile(0.99)
is_above_p99 = ads_per_user > p99_threshold
high_ads = df[is_above_p99]
print(f"Users above 99th percentile ad exposure: {len(high_ads)}")

# Size of each test group, as absolute counts and then as percentages.
group_counts = group_labels.value_counts()
print("\nTest Group Distribution:")
print(group_counts)

group_fractions = group_labels.value_counts(normalize=True)
print("\nTest Group Percentage Distribution:")
print(group_fractions * 100)


# -------------------------------------------------------
# 8️⃣ Rename Columns for Cleaner Analysis (Best Practice)
# -------------------------------------------------------

# Map each raw header (as reported by df.columns before renaming) to a
# snake_case name. Renaming by name rather than by position means a change in
# the CSV's column order can never silently attach the wrong label to a column.
COLUMN_RENAMES = {
    "Unnamed: 0": "index",
    "user id": "user_id",
    "test group": "test_group",
    "converted": "converted",
    "total ads": "total_ads",
    "most ads day": "most_ads_day",
    "most ads hour": "most_ads_hour",
}

# errors="raise" turns a missing or misspelled source column into a KeyError
# instead of leaving it un-renamed without notice. Column order is preserved.
df = df.rename(columns=COLUMN_RENAMES, errors="raise")

print("\nColumns Renamed for Clean Analysis ‚úÖ")


# -------------------------------------------------------
# 9️⃣ Convert Data Types Properly
# -------------------------------------------------------

# Target dtype per column:
#   test_group -> pandas categorical
#   converted  -> integer
target_dtypes = {
    "test_group": "category",
    "converted": int,
}
for column_name, target_dtype in target_dtypes.items():
    df[column_name] = df[column_name].astype(target_dtype)

print("\nData Types After Conversion:")
print(df.dtypes)


# -------------------------------------------------------
# 🔟 Final Structure Confirmation
# -------------------------------------------------------
# Show the frame once more after the renaming and dtype conversion above.

print("\n================ FINAL STRUCTURE CHECK ================")
print(df.head())
# info() prints its own report and returns None, so it is called bare;
# print(df.info()) would add a trailing "None" line to the output.
df.info()

Dataset Loaded Successfully ‚úÖ

Number of rows (observations): 588101
Number of columns (features): 7

['Unnamed: 0', 'user id', 'test group', 'converted', 'total ads', 'most ads day', 'most ads hour']

   Unnamed: 0  user id test group  converted  total ads most ads day  \
0           0  1069124         ad      False        130       Monday   
1           1  1119715         ad      False         93      Tuesday   
2           2  1144181         ad      False         21      Tuesday   
3           3  1435133         ad      False        355      Tuesday   
4           4  1015700         ad      False        276       Friday   

   most ads hour  
0             20  
1             22  
2             18  
3             10  
4             14  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 588101 entries, 0 to 588100
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Unnamed: 0     588101 non-null  int64 
 1   u