In [None]:
import pandas as pd
import statsmodels.api as sm
import numpy as np

In [None]:
# Load the transaction-level table and the per-event error log. Both files
# carry an `event_time` column that is parsed to datetime on read.
dfn = pd.read_csv('final.csv', parse_dates=['event_time'])
df_err = pd.read_csv('sorted_users_with_ids.csv', parse_dates=['event_time'])

# Per-user error rate: number of rows whose `error_type` differs from 0,
# divided by the number of non-null `order_id` values for that user.
# NOTE(review): a missing `error_type` (NaN) compares != 0 and is therefore
# counted as an error -- confirm that this is intended.
user_err = df_err.groupby('user_id').agg(
    num_tx=('order_id', 'count'),
    error_count=('error_type', lambda x: (x != 0).sum()),
)
user_err['error_rate'] = user_err['error_count'] / user_err['num_tx']
user_err = user_err.reset_index()[['user_id', 'error_rate']]

# Left join keeps every row of `dfn`. Rows whose user is absent from the error
# log (or whose rate is an undefined 0/0) come out as NaN and are set to 0.
dfn = dfn.merge(user_err, on='user_id', how='left')
dfn['error_rate'] = dfn['error_rate'].fillna(0)

# Put each user's rows in chronological order so the per-user differences
# below are taken between consecutive events.
dfn = dfn.sort_values(['user_id', 'event_time'])

# Count of non-null order ids per user, broadcast back onto every row.
dfn['num_transactions'] = dfn.groupby('user_id')['order_id'].transform('count')

# Calendar indicators derived from the event timestamp.
dfn['hour'] = dfn['event_time'].dt.hour
dfn['night_txn'] = dfn['hour'].between(0, 5).astype(int)               # hours 0..5 inclusive
dfn['day_Friday'] = dfn['event_time'].dt.dayofweek.eq(4).astype(int)   # Monday=0, so 4 is Friday
dfn['day_Sunday'] = dfn['event_time'].dt.dayofweek.eq(6).astype(int)   # 6 is Sunday

# Gap to the same user's previous event (NaT on each user's first row, which
# compares False below).
dfn['time_diff'] = dfn.groupby('user_id')['event_time'].diff()

# Running per-user maximum of "gap <= 1 hour": the flag turns on at the first
# such gap and stays on for all of that user's later rows.
burst_cum = (dfn['time_diff'] <= pd.Timedelta(hours=1)).groupby(dfn['user_id']).cummax()
dfn['burst_1h'] = burst_cum.astype(int)

# The helper columns are not model features.
dfn = dfn.drop(columns=['hour', 'time_diff'])

# Identifier / raw columns that must not enter the regression. Only the ones
# actually present in `dfn` are dropped.
cols_to_drop = ['order_id', 'user_id', 'event_time', 'card_country']
to_drop = [c for c in cols_to_drop if c in dfn.columns]

# Response and predictors.
y = dfn['is_reliable_user']
X = dfn.drop(columns=to_drop + ['is_reliable_user'])

# Map textual booleans to 0/1, then force every column to numeric; values that
# cannot be parsed become NaN (and their rows are removed by dropna below).
X = (
    X
    .replace({'TRUE': 1, 'FALSE': 0})
    .apply(pd.to_numeric, errors='coerce')
)

# Discard columns whose name ends in '.1'.
# NOTE(review): presumably duplicate columns renamed on CSV load -- confirm.
X = X.loc[:, ~X.columns.str.endswith('.1')]

# Real boolean columns become 0/1 integers.
bools = X.select_dtypes(include=['bool']).columns
X[bools] = X[bools].astype(int)

# Keep complete cases only, and align the response to the surviving rows.
X = X.dropna()
y = y.loc[X.index]

# Add the intercept column and fit the logistic regression by maximum likelihood.
X = sm.add_constant(X)

# The default cap of 35 iterations was exhausted without convergence (the
# summary reported `converged: False`), so the optimizer is given more room.
# Coefficients and standard errors from a non-converged fit should not be
# used for inference.
# NOTE(review): the very large |coef| on error_rate suggests near-separation;
# if the fit still does not converge, consider a penalized fit.
# `model` holds the fitted results object returned by .fit(), not the
# unfitted Logit model.
model = sm.Logit(y, X).fit(maxiter=200)
print(model.summary())


         Current function value: 0.028782
         Iterations: 35




                           Logit Regression Results                           
Dep. Variable:       is_reliable_user   No. Observations:               565315
Model:                          Logit   Df Residuals:                   565280
Method:                           MLE   Df Model:                           34
Date:                Mon, 05 May 2025   Pseudo R-squ.:                  0.9547
Time:                        19:02:34   Log-Likelihood:                -16271.
converged:                      False   LL-Null:                   -3.5917e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                  coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------------
const                                          21.5883      0.253     85.279      0.000      21.092      22.084
#                               

In [None]:
# `model` is assigned from `sm.Logit(y, X).fit()` in the fitting cell, so on a
# fresh kernel it is already a fitted results object, and results objects have
# no `.fit()` method. Reuse it directly (this also avoids a second full fit).
# The fallback covers the case where `model` is still an unfitted model.
result = model if hasattr(model, 'params') else model.fit(disp=False)

def one_sided_test(res, var, alternative='greater'):
    """Convert a coefficient's two-sided p-value into a one-sided p-value.

    Parameters
    ----------
    res : object
        Anything exposing `params`, `bse` and `pvalues`, each indexable by
        `var` (e.g. a fitted regression results object).
    var : hashable
        Key of the coefficient to test.
    alternative : {'greater', 'less'}, default 'greater'
        Direction of the alternative hypothesis: coefficient > 0 or < 0.

    Returns
    -------
    tuple
        `(b, se, z, p1)`: the estimate, its standard error, their ratio
        `b / se`, and the one-sided p-value.

    Raises
    ------
    ValueError
        If `alternative` is neither 'greater' nor 'less'. Previously any
        other value was silently tested as 'less'.
    """
    if alternative not in ('greater', 'less'):
        raise ValueError(
            f"alternative must be 'greater' or 'less', got {alternative!r}"
        )

    b, se = res.params[var], res.bse[var]
    z = b / se
    p2 = res.pvalues[var]

    # When the estimate lies on the side named by the alternative, the
    # one-sided p-value is half the two-sided one; otherwise it is the
    # complement of that half. (Relies on the two-sided p-value coming from a
    # symmetric reference distribution.)
    on_alternative_side = b > 0 if alternative == 'greater' else b < 0
    p1 = p2 / 2 if on_alternative_side else 1 - p2 / 2
    return b, se, z, p1

# One row per directional hypothesis: (coefficient name, direction of the
# alternative, label printed in the table).
tests = [
    ('num_transactions','greater','H1: β_num_transactions > 0'),
    ('error_rate',       'less',   'H2: β_error_rate < 0'),
    ('night_txn',        'less',   'H3: β_night_txn < 0'),
    ('day_Friday',       'less',   'H4: β_day_Friday < 0'),
    ('day_Sunday',       'less',   'H4: β_day_Sunday < 0'),
    ('burst_1h',         'less',   'H5: β_burst_1h < 0'),
]

# The z column is 10 wide (it was 8): a value such as -131.459 fills all 8
# characters and ran into the SE column. The rule length matches the new
# total width (30 + 8 + 8 + 10 + 10 + 10).
print(f"{'Hypotheses':<30}{'β̂':>8}{'SE':>8}{'z':>10}{'p₁':>10}{'OR':>10}")
print("-"*76)

for var, alt, label in tests:
    b, se, z_val, p1 = one_sided_test(result, var, alternative=alt)
    # Odds ratio implied by the logit coefficient.
    or_val = np.exp(b)
    print(f"{label:<30}{b:8.3f}{se:8.3f}{z_val:10.3f}{p1:10.4f}{or_val:10.3f}")

# Joint test of H4: both weekday coefficients are zero.
wald = result.wald_test('day_Friday = 0, day_Sunday = 0')
print("\nJoint Wald-test for H4 (Friday & Sunday):")
# Depending on the statsmodels version / `scalar` setting, `statistic` is
# either a 1x1 array or a plain scalar; indexing with [0][0] only works for
# the former. np.squeeze accepts both shapes.
print(f"  χ² = {float(np.squeeze(wald.statistic)):.2f}, p-value = {float(np.squeeze(wald.pvalue)):.4f}")

Hypotheses                          β̂      SE       z        p₁        OR
--------------------------------------------------------------------------
H1: β_num_transactions > 0       0.024   0.001  20.616    0.0000     1.025
H2: β_error_rate < 0           -60.460   0.460-131.459    0.0000     0.000
H3: β_night_txn < 0              0.084   0.034   2.477    0.9934     1.087
H4: β_day_Friday < 0             0.029   0.033   0.870    0.8078     1.029
H4: β_day_Sunday < 0             0.129   0.041   3.116    0.9991     1.138
H5: β_burst_1h < 0              -0.085   0.033  -2.598    0.0047     0.918

Joint Wald-test for H4 (Friday & Sunday):
  χ² = 9.73, p-value = 0.0077


