In [71]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("faviovaz/marketing-ab-testing")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/faviovaz/marketing-ab-testing?dataset_version_number=1...


100%|██████████| 5.23M/5.23M [00:00<00:00, 161MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/faviovaz/marketing-ab-testing/versions/1





## Data Exploration and Cleaning



In [72]:
import pandas as pd
import os

# The path to the dataset files obtained from the previous step
dataset_path = '/root/.cache/kagglehub/datasets/faviovaz/marketing-ab-testing/versions/1'

# Assuming the dataset is in a CSV file within this directory, let's find the CSV file
file_list = os.listdir(dataset_path)
csv_file = None
for file_name in file_list:
    if file_name.endswith('.csv'):
        csv_file = file_name
        break

if csv_file:
    full_path = os.path.join(dataset_path, csv_file)

    # Load the CSV into a pandas DataFrame
    df = pd.read_csv(full_path)

    # Display the first 5 rows of the DataFrame
    print("DataFrame loaded successfully. First 5 rows:")
    display(df.head())

    print("\nDataFrame Info:")
    df.info()

    print("\nMissing Values:")
    print(df.isnull().sum())

    print("\nUnique values in 'test group' column:")
    print(df['test group'].unique())

    print("\nUnique values in 'converted' column:")
    print(df['converted'].unique())

else:
    print("No CSV file found in the dataset directory.")

DataFrame loaded successfully. First 5 rows:


Unnamed: 0.1,Unnamed: 0,user id,test group,converted,total ads,most ads day,most ads hour
0,0,1069124,ad,False,130,Monday,20
1,1,1119715,ad,False,93,Tuesday,22
2,2,1144181,ad,False,21,Tuesday,18
3,3,1435133,ad,False,355,Tuesday,10
4,4,1015700,ad,False,276,Friday,14



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 588101 entries, 0 to 588100
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Unnamed: 0     588101 non-null  int64 
 1   user id        588101 non-null  int64 
 2   test group     588101 non-null  object
 3   converted      588101 non-null  bool  
 4   total ads      588101 non-null  int64 
 5   most ads day   588101 non-null  object
 6   most ads hour  588101 non-null  int64 
dtypes: bool(1), int64(4), object(2)
memory usage: 27.5+ MB

Missing Values:
Unnamed: 0       0
user id          0
test group       0
converted        0
total ads        0
most ads day     0
most ads hour    0
dtype: int64

Unique values in 'test group' column:
['ad' 'psa']

Unique values in 'converted' column:
[False  True]


## Calculate Conversion Rates

### Subtask:
Calculate and compare the conversion rates for the control and treatment groups.

In [73]:
# Calculate the conversion rate for each test group
# Conversion rate is the mean of the 'converted' column (True is treated as 1, False as 0)
conversion_rates = df.groupby('test group')['converted'].mean() * 100

print("Conversion Rates by Test Group:")
print(conversion_rates)

print("\nAssessment: Observe the difference in conversion rates between the 'ad' and 'psa' groups. This difference will be tested for statistical significance in the next step.")

Conversion Rates by Test Group:
test group
ad     2.554656
psa    1.785411
Name: converted, dtype: float64

Assessment: Observe the difference in conversion rates between the 'ad' and 'psa' groups. This difference will be tested for statistical significance in the next step.


## Perform Statistical Significance Test

### Subtask:
Conduct an appropriate statistical test (e.g., chi-squared test or z-test) to determine if the observed difference in conversion rates is statistically significant.

In [74]:
from scipy.stats import chi2_contingency
import pandas as pd

# Create a contingency table of the 'test group' and 'converted' columns
# The table should show the counts of converted and not converted users for each group
contingency_table = pd.crosstab(df['test group'], df['converted'])

print("Contingency Table (Group vs. Converted):")
display(contingency_table)

# Perform the chi-squared test for independence
# chi2_contingency returns: chi2 statistic, p-value, degrees of freedom, and expected frequencies
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

print(f"\nChi-squared statistic: {chi2:.4f}")
print(f"P-value: {p_value:.4f}")
print(f"Degrees of freedom: {dof}")

# Assess the result based on a common significance level (alpha = 0.05)
alpha = 0.05

print("\nAssessment:")
if p_value < alpha:
    print(f"Since the p-value ({p_value:.4f}) is less than the significance level ({alpha}), we reject the null hypothesis.")
    print("Conclusion: The observed difference in conversion rates between the 'ad' and 'psa' groups is statistically significant.")
else:
    print(f"Since the p-value ({p_value:.4f}) is greater than or equal to the significance level ({alpha}), we fail to reject the null hypothesis.")
    print("Conclusion: The observed difference in conversion rates between the 'ad' and 'psa' groups is not statistically significant.")

Contingency Table (Group vs. Converted):


converted,False,True
test group,Unnamed: 1_level_1,Unnamed: 2_level_1
ad,550154,14423
psa,23104,420



Chi-squared statistic: 54.0058
P-value: 0.0000
Degrees of freedom: 1

Assessment:
Since the p-value (0.0000) is less than the significance level (0.05), we reject the null hypothesis.
Conclusion: The observed difference in conversion rates between the 'ad' and 'psa' groups is statistically significant.


## Create Report

### Subtask:
Generate a markdown report summarizing the A/B test, including the hypothesis, methodology, results, and conclusion.

In [75]:
# Create a markdown string that outlines the A/B test report
ab_test_report = f"""
# A/B Test Analysis Report

## 1. Introduction

This report summarizes the results of an A/B test conducted to determine if a new page design (represented by the 'ad' test group) leads to a statistically significant increase in conversion rate compared to the existing page design (represented by the 'psa' test group).

## 2. Hypothesis

The hypothesis being tested is:

*   **Null Hypothesis (H₀):** There is no statistically significant difference in conversion rates between the new page design ('ad' group) and the existing page design ('psa' group).
*   **Alternative Hypothesis (H₁):** There is a statistically significant difference in conversion rates between the new page design ('ad' group) and the existing page design ('psa' group).

## 3. Data and Methodology

The analysis was performed using the "marketing-ab-testing" dataset obtained from Kaggle. The dataset contains information on user interactions, including which test group they belonged to ('ad' or 'psa') and whether they converted.

The methodology followed includes:

*   **Data Loading and Initial Exploration:** The dataset was loaded into a pandas DataFrame. Initial checks were performed to understand the data structure, identify missing values, and confirm the presence of both test groups.
*   **Data Cleaning:** Missing values were inspected. No critical missing values were found in the essential columns ('test group', 'converted').
*   **Conversion Rate Calculation:** The conversion rate for each test group was calculated as the proportion of users in that group who converted.
*   **Statistical Significance Test:** A Chi-squared test for independence was conducted on a contingency table of 'test group' and 'converted' status to determine if the observed difference in conversion rates was statistically significant. A significance level (alpha) of 0.05 was used.

## 4. Results

### Conversion Rates

The conversion rates for each test group were calculated as follows:

*   **'ad' group (New Page):** {conversion_rates['ad']:.4f}%
*   **'psa' group (Existing Page):** {conversion_rates['psa']:.4f}%

The 'ad' group showed a higher conversion rate compared to the 'psa' group.

### Statistical Test Results

A Chi-squared test was performed to assess the statistical significance of the observed difference.

*   **Chi-squared Statistic:** {chi2:.4f}
*   **P-value:** {p_value:.4f}
*   **Degrees of Freedom:** {dof}

Using a significance level of 0.05, the p-value obtained from the Chi-squared test ({p_value:.4f}) is less than 0.05.

## 5. Conclusion

Based on the statistical analysis, we **reject the null hypothesis**.

The observed difference in conversion rates between the 'ad' group (new page design) and the 'psa' group (existing page design) is **statistically significant**.

The new page design resulted in a conversion rate of {conversion_rates['ad']:.4f}%, which is {conversion_rates['ad'] - conversion_rates['psa']:.4f}% higher than the existing page design's conversion rate of {conversion_rates['psa']:.4f}%.

**Recommendation:** Given the statistically significant increase in conversion rate, it is recommended to launch the new page design to all users.

## 6. Code

The Python code used for this analysis is available in the accompanying notebook. The key steps included data loading, data exploration, calculation of conversion rates using pandas, and performing the chi-squared test using `scipy.stats.chi2_contingency`.

"""

# Print the markdown report
print(ab_test_report)


# A/B Test Analysis Report

## 1. Introduction

This report summarizes the results of an A/B test conducted to determine if a new page design (represented by the 'ad' test group) leads to a statistically significant increase in conversion rate compared to the existing page design (represented by the 'psa' test group).

## 2. Hypothesis

The hypothesis being tested is:

*   **Null Hypothesis (H₀):** There is no statistically significant difference in conversion rates between the new page design ('ad' group) and the existing page design ('psa' group).
*   **Alternative Hypothesis (H₁):** There is a statistically significant difference in conversion rates between the new page design ('ad' group) and the existing page design ('psa' group).

## 3. Data and Methodology

The analysis was performed using the "marketing-ab-testing" dataset obtained from Kaggle. The dataset contains information on user interactions, including which test group they belonged to ('ad' or 'psa') and whether they co

# Task
Analyze the provided A/B testing dataset from "/root/.cache/kagglehub/datasets/faviovaz/marketing-ab-testing/versions/1" to determine the success of the 'ad' group compared to the 'psa' group. This analysis should include calculating conversion rates for each group, performing a statistical significance test (chi-squared or z-test) to determine if the difference in conversion rates is significant, estimating the potential revenue generated by the ads, and exploring other relevant characteristics of the groups if the data allows. Finally, consolidate all findings into a comprehensive report.

## Review and summarize group analysis and statistical significance

### Subtask:
Briefly recap the findings regarding the conversion rates of the 'ad' and 'psa' groups and the conclusion from the statistical significance test.


In [76]:
# Access the previously calculated conversion rates
ad_conversion_rate = conversion_rates['ad']
psa_conversion_rate = conversion_rates['psa']

# Access the p-value from the chi-squared test
# p_value variable is already available from the previous cell

# Summarize the conclusion based on the p-value and alpha = 0.05
alpha = 0.05
if p_value < alpha:
    statistical_conclusion = "The observed difference in conversion rates is statistically significant (p < 0.05). We reject the null hypothesis."
else:
    statistical_conclusion = "The observed difference in conversion rates is not statistically significant (p >= 0.05). We fail to reject the null hypothesis."

# Print a brief summary
print("--- Summary of Findings ---")
print(f"Conversion Rate (Ad Group): {ad_conversion_rate:.4f}%")
print(f"Conversion Rate (PSA Group): {psa_conversion_rate:.4f}%")
print(f"Statistical Test P-value: {p_value:.4f}")
print(f"Conclusion: {statistical_conclusion}")
print("-------------------------")

--- Summary of Findings ---
Conversion Rate (Ad Group): 2.5547%
Conversion Rate (PSA Group): 1.7854%
Statistical Test P-value: 0.0000
Conclusion: The observed difference in conversion rates is statistically significant (p < 0.05). We reject the null hypothesis.
-------------------------


## Estimate potential revenue from ads

### Subtask:
Analyze the data to estimate how much revenue the company could potentially make from the ads.


In [77]:
# Calculate the number of conversions in the 'ad' group
ad_conversions = df[df['test group'] == 'ad']['converted'].sum()

# Assume an average revenue per conversion (placeholder value)
# IMPORTANT: This is an assumption. A real analysis would require actual revenue data per conversion.
assumed_revenue_per_conversion = 10  # Example: $10 per conversion

# Estimate the total revenue generated by the 'ad' campaign
estimated_ad_revenue = ad_conversions * assumed_revenue_per_conversion

# Print the estimated total revenue
print(f"Number of conversions in 'ad' group: {ad_conversions}")
print(f"Assumed average revenue per conversion: ${assumed_revenue_per_conversion}")
print(f"Estimated total revenue from 'ad' campaign: ${estimated_ad_revenue:.2f}")

print("\nAssessment: This calculation provides a rough estimate of potential revenue based on a critical assumption about revenue per conversion. Actual revenue data would be needed for a precise calculation.")

Number of conversions in 'ad' group: 14423
Assumed average revenue per conversion: $10
Estimated total revenue from 'ad' campaign: $144230.00

Assessment: This calculation provides a rough estimate of potential revenue based on a critical assumption about revenue per conversion. Actual revenue data would be needed for a precise calculation.


## Further analyze groups

### Subtask:
Explore other characteristics or behaviors of the users in the 'ad' and 'psa' groups beyond just conversion rate, if the dataset contains relevant features (e.g., demographics, ad interactions beyond just conversion).


In [78]:
# Examine the available columns
print("Available columns in the DataFrame:")
print(df.columns)

print("\n" + "="*40 + "\n")

# Identify potential features for further analysis:
# 'total ads', 'most ads day', 'most ads hour' seem relevant to user interaction with ads.

# Analyze numerical features: 'total ads', 'most ads hour'
print("Descriptive Statistics for Numerical Features by Test Group:")
numerical_features = ['total ads', 'most ads hour']
for feature in numerical_features:
    print(f"\n--- {feature} ---")
    display(df.groupby('test group')[feature].describe())

print("\n" + "="*40 + "\n")

# Analyze categorical features: 'most ads day'
print("Value Counts for Categorical Features by Test Group:")
categorical_features = ['most ads day']
for feature in categorical_features:
    print(f"\n--- {feature} ---")
    # Calculate value counts for each group
    ad_group_counts = df[df['test group'] == 'ad'][feature].value_counts()
    psa_group_counts = df[df['test group'] == 'psa'][feature].value_counts()

    print("\nAd Group:")
    print(ad_group_counts)

    print("\nPSA Group:")
    print(psa_group_counts)

print("\nAssessment: Review the output to identify any significant differences in the distribution of 'total ads', 'most ads day', and 'most ads hour' between the 'ad' and 'psa' groups. This can provide insights into how the test groups interacted with the platform or ads.")

Available columns in the DataFrame:
Index(['Unnamed: 0', 'user id', 'test group', 'converted', 'total ads',
       'most ads day', 'most ads hour'],
      dtype='object')


Descriptive Statistics for Numerical Features by Test Group:

--- total ads ---


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
test group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ad,564577.0,24.823365,43.750456,1.0,4.0,13.0,27.0,2065.0
psa,23524.0,24.761138,42.86072,1.0,4.0,12.0,26.0,907.0



--- most ads hour ---


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
test group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ad,564577.0,14.4759,4.841808,0.0,11.0,14.0,18.0,23.0
psa,23524.0,14.304923,4.656239,0.0,11.0,14.0,18.0,23.0




Value Counts for Categorical Features by Test Group:

--- most ads day ---

Ad Group:
most ads day
Friday       88805
Monday       83571
Sunday       82332
Thursday     79077
Saturday     78802
Wednesday    77418
Tuesday      74572
Name: count, dtype: int64

PSA Group:
most ads day
Thursday     3905
Friday       3803
Monday       3502
Wednesday    3490
Sunday       3059
Tuesday      2907
Saturday     2858
Name: count, dtype: int64

Assessment: Review the output to identify any significant differences in the distribution of 'total ads', 'most ads day', and 'most ads hour' between the 'ad' and 'psa' groups. This can provide insights into how the test groups interacted with the platform or ads.


## Consolidate findings and report

### Subtask:
Summarize all the findings, including conversion rates, statistical significance, potential revenue estimates, and any other group analysis, into a clear report.


In [79]:
# Create a markdown string that outlines the A/B test report
ab_test_report = f"""
# A/B Test Analysis Report

## 1. Introduction

This report summarizes the results of an A/B test conducted to determine if a new page design (represented by the 'ad' test group) leads to a statistically significant increase in conversion rate compared to the existing page design (represented by the 'psa' test group).

## 2. Hypothesis

The hypothesis being tested is:

*   **Null Hypothesis (H₀):** There is no statistically significant difference in conversion rates between the new page design ('ad' group) and the existing page design ('psa' group).
*   **Alternative Hypothesis (H₁):** There is a statistically significant difference in conversion rates between the new page design ('ad' group) and the existing page design ('psa' group).

## 3. Data and Methodology

The analysis was performed using the "marketing-ab-testing" dataset obtained from Kaggle. The dataset contains information on user interactions, including which test group they belonged to ('ad' or 'psa') and whether they converted.

The methodology followed includes:

*   **Data Loading and Initial Exploration:** The dataset was loaded into a pandas DataFrame. Initial checks were performed to understand the data structure, identify missing values, and confirm the presence of both test groups.
*   **Data Cleaning:** Missing values were inspected. No critical missing values were found in the essential columns ('test group', 'converted').
*   **Conversion Rate Calculation:** The conversion rate for each test group was calculated as the proportion of users in that group who converted.
*   **Statistical Significance Test:** A Chi-squared test for independence was conducted on a contingency table of 'test group' and 'converted' status to determine if the observed difference in conversion rates was statistically significant. A significance level (alpha) of 0.05 was used.
*   **Other Group Analysis:** Additional exploration was performed on the distribution of 'total ads', 'most ads day', and 'most ads hour' for each group to understand other behavioral characteristics.
*   **Revenue Estimation:** An estimate of potential revenue from the 'ad' campaign was calculated based on the number of conversions and an assumed revenue per conversion.

## 4. Results

### Conversion Rates

The conversion rates for each test group were calculated as follows:

*   **'ad' group (New Page):** {conversion_rates['ad']:.4f}%
*   **'psa' group (Existing Page):** {conversion_rates['psa']:.4f}%

The 'ad' group showed a higher conversion rate compared to the 'psa' group.

### Statistical Test Results

A Chi-squared test was performed to assess the statistical significance of the observed difference.

*   **Chi-squared Statistic:** {chi2:.4f}
*   **P-value:** {p_value:.4f}
*   **Degrees of Freedom:** {dof}

Using a significance level of 0.05, the p-value obtained from the Chi-squared test ({p_value:.4f}) is less than 0.05.

### Estimated Revenue

Based on {ad_conversions} conversions in the 'ad' group and an assumed average revenue of ${assumed_revenue_per_conversion} per conversion:

*   **Estimated Total Revenue from 'ad' campaign:** ${estimated_ad_revenue:.2f}

**Note:** This is an estimate based on an assumption. Actual revenue data per conversion would be needed for a precise calculation.

### Other Group Analysis Findings

Analysis of 'total ads', 'most ads day', and 'most ads hour' between the 'ad' and 'psa' groups indicated that the distribution of these characteristics was broadly similar across both groups. This suggests that the groups were comparable in terms of general ad exposure patterns.

## 5. Conclusion

Based on the statistical analysis, we **reject the null hypothesis**.

The observed difference in conversion rates between the 'ad' group (new page design) and the 'psa' group (existing page design) is **statistically significant**. The 'ad' group had a conversion rate of {conversion_rates['ad']:.4f}%, which is {conversion_rates['ad'] - conversion_rates['psa']:.4f}% higher than the 'psa' group's conversion rate of {conversion_rates['psa']:.4f}%.

While other explored user characteristics like 'total ads', 'most ads day', and 'most ads hour' were similar between the groups, the significant difference in conversion rate strongly suggests that the new page design itself had a positive impact on conversions.

**Recommendation:** Given the statistically significant increase in conversion rate and the estimated potential revenue gains, it is recommended to launch the new page design to all users. Further analysis with actual revenue data and potential follow-up tests could provide even more refined insights.

## 6. Code

The Python code used for this analysis is available in the accompanying notebook. The key steps included data loading, data exploration, calculation of conversion rates using pandas, performing the chi-squared test using `scipy.stats.chi2_contingency`, and basic analysis of other user interaction features.

"""

# Print the markdown report
print(ab_test_report)


# A/B Test Analysis Report

## 1. Introduction

This report summarizes the results of an A/B test conducted to determine if a new page design (represented by the 'ad' test group) leads to a statistically significant increase in conversion rate compared to the existing page design (represented by the 'psa' test group).

## 2. Hypothesis

The hypothesis being tested is:

*   **Null Hypothesis (H₀):** There is no statistically significant difference in conversion rates between the new page design ('ad' group) and the existing page design ('psa' group).
*   **Alternative Hypothesis (H₁):** There is a statistically significant difference in conversion rates between the new page design ('ad' group) and the existing page design ('psa' group).

## 3. Data and Methodology

The analysis was performed using the "marketing-ab-testing" dataset obtained from Kaggle. The dataset contains information on user interactions, including which test group they belonged to ('ad' or 'psa') and whether they co

## Summary:

### Data Analysis Key Findings

*   The 'ad' group had a conversion rate of 2.5547%, which was higher than the 'psa' group's conversion rate of 1.7854%.
*   The difference in conversion rates between the 'ad' and 'psa' groups was found to be statistically significant (p-value = 0.0000), leading to the rejection of the null hypothesis.
*   Based on an assumed average revenue of \$10 per conversion, the estimated total revenue from the 'ad' campaign is \$144,230.00.
*   Analysis of 'total ads', 'most ads day', and 'most ads hour' showed similar distributions between the 'ad' and 'psa' groups, suggesting comparability in general ad exposure patterns.

### Insights or Next Steps

*   Given the statistically significant increase in conversion rate and the estimated potential revenue, launching the new page design ('ad' group) to all users is recommended.
*   Further analysis with actual revenue data per conversion is needed for a more precise revenue assessment.
