In [6]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

In [12]:
# Hypothesis H1: Total Number of Transactions → Higher Reliability

# 0. Load data
df = pd.read_csv('final.csv')

# 1. Compute total number of transactions per user
df['num_transactions'] = df.groupby('user_id')['order_id'].transform('count')

# 2. Prepare response and predictor variables
y = df['is_reliable_user']   # binary flag: 1 if reliable, 0 otherwise
X = df[['num_transactions']]
X = sm.add_constant(X)

# 3. Fit the logistic regression model
model8 = sm.Logit(y, X).fit(disp=False)

# 4. One-sided test for H1: β_num_transactions > 0
beta   = model8.params['num_transactions']
se     = model8.bse['num_transactions']
z_stat = beta / se
p_two  = model8.pvalues['num_transactions']
# one-sided p-value for β > 0
p_one  = (p_two / 2) if beta > 0 else (1 - p_two / 2)

print("H1 Results:")
print(f"β̂ = {beta:.4f}, SE = {se:.4f}, z = {z_stat:.4f}, one-sided p (β>0) = {p_one:.4f}")


H1 Results:
β̂ = 0.0630, SE = 0.0003, z = 243.6608, one-sided p (β>0) = 0.0000


In [14]:
# Hypothesis H2: Transaction Error Rate → Lower Reliability


# 1. Load transaction errors and parse timestamps
df_err = pd.read_csv('sorted_users_with_ids.csv', parse_dates=['event_time'])

# 2. Aggregate per user:
#    - num_transactions: total orders
#    - error_count: orders with error_type != 0
user_error = (
    df_err
    .groupby('user_id')
    .agg(
        num_transactions = ('order_id', 'count'),
        error_count      = ('error_type', lambda x: (x != 0).sum())
    )
    .reset_index()
)
user_error['error_rate'] = user_error['error_count'] / user_error['num_transactions']

# 3. Load main dataset with reliability flag
df_main = pd.read_csv('final.csv')

# 4. Build user-level dataset with is_reliable_user and error_rate
df_user = (
    df_main[['user_id', 'is_reliable_user']]
    .drop_duplicates('user_id')
    .merge(user_error[['user_id', 'error_rate']], on='user_id', how='left')
)
# Fill missing error_rate values with zero
df_user['error_rate'] = df_user['error_rate'].fillna(0)

# 5. Fit logistic regression model for H2 (test β_error_rate < 0)
y = df_user['is_reliable_user']
X = add_constant(df_user[['error_rate']])
model2 = sm.Logit(y, X).fit(disp=False)

# 6. Calculate coefficient, SE, z-stat, and one-sided p-value
beta   = model2.params['error_rate']
se     = model2.bse['error_rate']
z_stat = beta / se
p_two  = model2.pvalues['error_rate']
# One-sided p: p_two/2 if beta<0, else 1 - p_two/2
p_one  = (p_two / 2) if beta < 0 else (1 - p_two / 2)

print("H2 Results:")
print(f"β̂ = {beta:.4f}, SE = {se:.4f}, z = {z_stat:.4f}, one-sided p (β<0) = {p_one:.4f}")


H2 Results:
β̂ = -26.7018, SE = 0.3911, z = -68.2668, one-sided p (β<0) = 0.0000


In [13]:
# Hypothesis H3: Night-time Transactions (00:00–05:59) → Lower Reliability

# 1. Parse timestamps and create a binary night_txn flag
df['event_time'] = pd.to_datetime(df['event_time'])
df['hour'] = df['event_time'].dt.hour
df['night_txn'] = ((df['hour'] >= 0) & (df['hour'] < 6)).astype(int)

# 2. Prepare response (y) and predictor (X)
y = df['is_reliable_user']               # binary reliability indicator
X = df[['night_txn']]
X = sm.add_constant(X)

# 3. Fit logistic regression and perform one-sided test H1: β_night_txn < 0
model10 = sm.Logit(y, X).fit(disp=False)

beta   = model10.params['night_txn']
se     = model10.bse['night_txn']
z_stat = beta / se
p_two  = model10.pvalues['night_txn']
# one-sided p-value for H1: β < 0
p_one  = (p_two / 2) if beta < 0 else (1 - p_two / 2)

print("\nH3 Results:")
print(f"β̂ = {beta:.4f}, SE = {se:.4f}, z = {z_stat:.4f}, one-sided p (β<0) = {p_one:.4f}")



H3 Results:
β̂ = -0.0490, SE = 0.0065, z = -7.5699, one-sided p (β<0) = 0.0000


In [None]:
# Hypothesis H4: Weekend Transactions (Friday & Sunday) → Lower Reliability

# 0. Parse timestamps
df['event_time'] = pd.to_datetime(df['event_time'])

# 1. Create boolean flags for Friday and Sunday transactions
df['day_Friday'] = (df['event_time'].dt.day_name() == 'Friday').astype(int)
df['day_Sunday'] = (df['event_time'].dt.day_name() == 'Sunday').astype(int)

# 2. Compute total number of transactions per user
df['num_transactions'] = df.groupby('user_id')['order_id'].transform('count')

# 3. Prepare response (y) and predictors (X)
y = df['is_reliable_user']  # binary reliability flag
X = df[['day_Friday', 'day_Sunday', 'price', 'avg_price', 'num_transactions']]
X = sm.add_constant(X)

# 4. Fit the logistic regression model
model = sm.Logit(y, X).fit(disp=False)

# 5. One-sided tests for each day (H1: coefficient < 0)
for day in ['day_Friday', 'day_Sunday']:
    beta   = model.params[day]
    se     = model.bse[day]
    z_stat = beta / se
    p_two  = model.pvalues[day]
    # For H1: β < 0, one-sided p = p_two/2 if beta<0, else = 1 - p_two/2
    p_one  = (p_two / 2) if beta < 0 else (1 - p_two / 2)
    print(f"{day}: β̂ = {beta:.4f}, SE = {se:.4f}, z = {z_stat:.4f}, one-sided p = {p_one:.4f}")

# 6. Joint Wald test for H0: β_day_Friday = 0 and β_day_Sunday = 0
wald = model.wald_test('day_Friday = 0, day_Sunday = 0')
print(f"\nJoint Wald χ² = {wald.statistic[0][0]:.2f}, p-value = {wald.pvalue:.4f}")


day_Friday: β̂ = -0.8618, SE = 0.0075, z = -114.7895, one-sided p = 0.0000
day_Sunday: β̂ = -0.7492, SE = 0.0088, z = -84.7146, one-sided p = 0.0000

Joint Wald χ² = 16236.60, p-value = 0.0000




In [15]:
# Hypothesis H5: Transaction Bursts (1-hour) → Lower Reliability

# 1. Parse timestamps and sort each user’s transactions
df['event_time'] = pd.to_datetime(df['event_time'])
df = df.sort_values(['user_id', 'event_time'])

# 2. Compute time difference between consecutive orders per user
df['time_diff'] = df.groupby('user_id')['event_time'].diff()

# 3. Flag any user who ever places two orders within one hour
user_burst = (df['time_diff'] <= pd.Timedelta(hours=1)).groupby(df['user_id']).any()
df['burst_1h'] = df['user_id'].map(user_burst).astype(int)

# 4. Prepare response (y) and predictor (X)
X = df[['burst_1h']]
y = df['is_reliable_user']
X = sm.add_constant(X)

# 5. Fit the logistic regression model
model5 = sm.Logit(y, X).fit(disp=False)

# 6. Output results and one-sided test for β_burst_1h < 0
beta   = model5.params['burst_1h']
se     = model5.bse['burst_1h']
z_stat = beta / se
p_two  = model5.pvalues['burst_1h']
# one-sided p-value for H1: β < 0
p_one  = (p_two / 2) if beta < 0 else (1 - p_two / 2)

print("H5 Results:")
print(f"β̂ = {beta:.4f}, SE = {se:.4f}, z = {z_stat:.4f}, one-sided p (β<0) = {p_one:.4f}")


H5 Results:
β̂ = -0.8539, SE = 0.0061, z = -139.6221, one-sided p (β<0) = 0.0000
