# Big Data Analytics - Homework - 1
  #Sushma Niveni Pindiga
    
    
    
#Exercise 1: Game Fun: Customer Acquisition through Digital Advertising




In [1]:
import pandas as pd
from scipy.stats import ttest_ind

# Load the data
data = pd.read_excel("GameFun.xlsx")

# Split the dataset into test and control groups
groups = data.groupby('test')

# Calculate mean values for selected columns
means = groups[['income', 'gender', 'gamer']].mean()

# Calculate the percentage difference in means
percentage_difference = (means.diff() / means.iloc[0]) * 100

# Conduct t-tests for selected variables and collect results
variables = ['income', 'gender', 'gamer']
ttest_results = {var: ttest_ind(data[data['test'] == 1][var], data[data['test'] == 0][var]) for var in variables}

# Print mean values and percentage differences
print("Mean Values:\n", means)
print("\nPercentage Difference:\n", percentage_difference.iloc[1])

# Check if the p-values indicate statistical significance and print results
for var, result in ttest_results.items():
    significance = "statistically significant" if result.pvalue < 0.05 else "not statistically significant"
    print(f"\nP-Value for {var}: {result.pvalue:.4f} - {var.capitalize()} differences are {significance}.")



Mean Values:
          income    gender     gamer
test                               
0     55.166012  0.647905  0.601823
1     54.938236  0.647289  0.601331

Percentage Difference:
 income   -0.412890
gender   -0.095049
gamer    -0.081720
Name: 1, dtype: float64

P-Value for income: 0.1284 - Income differences are not statistically significant.

P-Value for gender: 0.9060 - Gender differences are not statistically significant.

P-Value for gamer: 0.9267 - Gamer differences are not statistically significant.


In [17]:
def calculate_expected_revenue(data, revenue_per_purchase=37.5, group_by=None):
    # Assuming calculate_purchase_rates is a function that returns a DataFrame
    # with purchase rates for the specified groups or overall.
    purchase_rates = calculate_purchase_rates(data, group_by=group_by)
    
    if group_by:
        # If data is grouped, calculate expected revenue for each group.
        purchase_rates['Expected Revenue Test'] = purchase_rates[1] * revenue_per_purchase
        purchase_rates['Expected Revenue Control'] = purchase_rates[0] * revenue_per_purchase
    else:
        # For ungrouped data, calculate a single expected revenue value.
        purchase_rates['Expected Revenue'] = purchase_rates['Average Purchase Rate'] * revenue_per_purchase
        purchase_rates.drop('Average Purchase Rate', axis=1, inplace=True)
    
    return purchase_rates

# a. All customers
expected_revenue_all = calculate_expected_revenue(data)
print(expected_revenue_all)


   Expected Revenue
0          2.426151


In [10]:
print(data.columns)


Index(['id', 'test', 'purchase', 'site', 'impressions', 'income', 'gender',
       'gamer'],
      dtype='object')


In [23]:
def calculate_purchase_rates(data, group_by=None):
    if group_by:
        # Group by specified column and calculate the mean purchase rate
        purchase_rate = data.groupby(group_by).apply(lambda x: x['purchase'].mean())
        return pd.DataFrame({
            group_by: purchase_rate.index,
            'Test Purchase Rate': purchase_rate, 
            'Control Purchase Rate': purchase_rate 
        }).reset_index(drop=True)
    else:
        # Calculate the average purchase rate for the entire dataset
        average_purchase_rate = data['purchase'].mean()
        return pd.DataFrame({
            'Average Purchase Rate': [average_purchase_rate]
        })


In [24]:
def calculate_expected_revenue(data, revenue_per_purchase=37.5, group_by=None):
    purchase_rates = calculate_purchase_rates(data, group_by=group_by)
    
    if group_by:
        # Adjusted to calculate expected revenue correctly for grouped data
        purchase_rates['Expected Revenue Test'] = purchase_rates['Test Purchase Rate'] * revenue_per_purchase
        purchase_rates['Expected Revenue Control'] = purchase_rates['Control Purchase Rate'] * revenue_per_purchase
    else:
        # For ungrouped data, calculate a single expected revenue value
        purchase_rates['Expected Revenue'] = purchase_rates['Average Purchase Rate'] * revenue_per_purchase
    return purchase_rates


In [29]:
# Assuming 'gamer' is a column in 'data' where 1 indicates a gamer
data_gamers = data[data['gamer'] == 1]


#1.
# Calculate expected revenue for all customers
expected_revenue_all = calculate_expected_revenue(data)
print(expected_revenue_all)

   Average Purchase Rate  Expected Revenue
0               0.064697          2.426151


In [28]:
#2.
# Calculate expected revenue for female and male gamers, assuming 'gender' is the correct grouping column
expected_revenue_female_male_gamers = calculate_expected_revenue(data_gamers, group_by='gender')
print(expected_revenue_female_male_gamers)

   gender  Test Purchase Rate  Control Purchase Rate  Expected Revenue Test  \
0       0            0.086941               0.086941               3.260296   
1       1            0.082172               0.082172               3.081433   

   Expected Revenue Control  
0                  3.260296  
1                  3.081433  


In [33]:
import pandas as pd

def calculate_purchase_rates_optimized(data, group_by=None):
    if group_by:
        # Group by specified columns, calculate the mean purchase rate, and calculate the absolute difference
        grouped_data = data.groupby(group_by)['purchase'].mean().unstack()
        grouped_data['Absolute Difference'] = (grouped_data[1] - grouped_data[0]).abs()
        results = grouped_data
    else:
        # For ungrouped data, calculate the average purchase rate and the absolute difference
        test_group_means = data.groupby('test')['purchase'].mean()
        absolute_difference = abs(test_group_means[1] - test_group_means[0])
        results = pd.DataFrame(test_group_means, columns=['Average Purchase Rate'])
        results.loc['Absolute Difference', 'Average Purchase Rate'] = absolute_difference
    return results




In [32]:
 #a. All customers
all_customers = calculate_purchase_rates_optimized(data)

 #b. Male vs Female customers
male_female = calculate_purchase_rates_optimized(data, group_by=['gender', 'test'])

 #c. Gamers vs Non-Gamers Customers

gamers_non_gamers = calculate_purchase_rates_optimized(data, group_by=['gamer', 'test'])
data_gamers = data[data['gamer'] == 1]
 
    
#d. Female Gamers vs Male Gamers

female_male_gamers = calculate_purchase_rates_optimized(data_gamers, group_by=['gender', 'test'])

# Outputs
all_customers, male_female, gamers_non_gamers, female_male_gamers

(                    Average Purchase Rate
 Absolute Difference              0.040609,
 test           0         1  Absolute Difference
 gender                                         
 0       0.034442  0.080945             0.046503
 1       0.037176  0.074575             0.037399,
 test          0         1  Absolute Difference
 gamer                                         
 0      0.037387  0.035092             0.002295
 1      0.035436  0.104487             0.069051,
 test           0         1  Absolute Difference
 gender                                         
 0       0.032041  0.110092             0.078051
 1       0.037275  0.101404             0.064129)

In [36]:
#Exercise 2: Non-Compliance in Randomized Experiments
# Load  data
data_sd = pd.read_csv("sommer_deger.csv")

# Calculate the percentages
percent_died_offered = data_sd[data_sd['instrument'] == 1]['outcome'].mean() * 100
percent_died_not_offered = data_sd[data_sd['instrument'] == 0]['outcome'].mean() * 100

# Calculate the difference in mortality
mortality_difference = percent_died_offered - percent_died_not_offered

# Print the results
print(f"Percent of babies who died (mothers offered shots): {percent_died_offered:.2f}%")
print(f"Percent of babies who died (mothers not offered shots): {percent_died_not_offered:.2f}%")
print(f"Difference in mortality: {mortality_difference:.2f}%")


Percent of babies who died (mothers offered shots): 0.38%
Percent of babies who died (mothers not offered shots): 0.64%
Difference in mortality: -0.26%


In [38]:
# Calculate the percentages and difference in mortality 
percent_died_received = data_sd.loc[data_sd['treatment'] == 1, 'outcome'].mean() * 100
percent_died_not_received = data_sd.loc[data_sd['treatment'] == 0, 'outcome'].mean() * 100
mortality_difference_received = percent_died_received - percent_died_not_received

# Print the results 
print(f"Percent of babies who died (received shots): {percent_died_received:.2f}%")
print(f"Percent of babies who died (did not receive shots): {percent_died_not_received:.2f}%")
print(f"Difference in mortality: {mortality_difference_received:.2f}%")


Percent of babies who died (received shots): 0.12%
Percent of babies who died (did not receive shots): 0.77%
Difference in mortality: -0.65%


In [41]:
# Filtering for mothers offered Vitamin A shots
offered_subset = data_sd[data_sd['instrument'] == 1]

# Calculate the percentages
percent_died_received_offered = offered_subset[offered_subset['treatment'] == 1]['outcome'].mean() * 100
percent_died_not_received_offered = offered_subset[offered_subset['treatment'] == 0]['outcome'].mean() * 100

# Calculate the difference in mortality
mortality_difference_offered_subset = percent_died_received_offered - percent_died_not_received_offered

# Print the results
print(f"Percent of babies who died (received shots when offered): {percent_died_received_offered:.2f}%")
print(f"Percent of babies who died (offered but did not receive shots): {percent_died_not_received_offered:.2f}%")
print(f"Difference in mortality: {mortality_difference_offered_subset:.2f}%")


Percent of babies who died (received shots when offered): 0.12%
Percent of babies who died (offered but did not receive shots): 1.41%
Difference in mortality: -1.28%


In [40]:
import numpy as np

# The denominator likely involves the proportion of those who actually received the treatment among those offered
proportion_received_among_offered = data_sd[data_sd['instrument'] == 1]['treatment'].mean()

wald_estimator = (percent_died_offered - percent_died_not_offered) / proportion_received_among_offered

# Computing the intent-to-treat (ITT) effect for comparison
itt_effect = percent_died_offered - percent_died_not_offered

# Computing standard errors for offered and not offered groups to use in discussions of significance
n_offered = data_sd[data_sd['instrument'] == 1]['outcome'].count()
n_not_offered = data_sd[data_sd['instrument'] == 0]['outcome'].count()

std_err_offered = data_sd[data_sd['instrument'] == 1]['outcome'].std() / np.sqrt(n_offered)
std_err_not_offered = data_sd[data_sd['instrument'] == 0]['outcome'].std() / np.sqrt(n_not_offered)

# Print the results
print(f"Wald Estimator: {wald_estimator:.4f}")
print(f"ITT Effect: {itt_effect:.4f}%")
print(f"Standard Error (Offered Group): {std_err_offered:.4f}")
print(f"Standard Error (Not Offered Group): {std_err_not_offered:.4f}")


Wald Estimator: -0.3228
ITT Effect: -0.2582%
Standard Error (Offered Group): 0.0006
Standard Error (Not Offered Group): 0.0007
