In [2]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

# Logistic Regression

In [144]:
pd.set_option('display.max_columns', None)
df = pd.read_csv(r'C:\Users\lksja\Downloads\archive\Base.csv')

In [145]:
df.tail()

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,velocity_6h,velocity_24h,velocity_4w,bank_branch_count_8w,date_of_birth_distinct_emails_4w,employment_status,credit_risk_score,email_is_free,housing_status,phone_home_valid,phone_mobile_valid,bank_months_count,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
999995,0,0.8,0.12469,-1,143,30,0.051348,-0.826239,AB,530,6732.602414,3010.048099,3095.754245,42,8,CA,305,1,BB,1,1,31,0,1500.0,0,INTERNET,16.96777,other,0,1,0,7
999996,0,0.9,0.824544,-1,193,30,0.009591,0.008307,AC,408,1574.293294,2716.495767,4286.08905,0,5,CA,235,0,BA,1,1,-1,1,1000.0,0,INTERNET,1.504109,macintosh,0,1,0,7
999997,0,0.8,0.140891,-1,202,10,0.059287,50.609995,AA,749,1258.864938,3601.322892,3103.891664,2,3,CA,195,1,BE,0,1,31,0,200.0,0,INTERNET,16.068595,other,0,1,0,7
999998,0,0.9,0.00248,52,3,30,0.023357,-1.313387,AB,707,7048.137128,6521.395012,3068.265084,7,8,CA,148,0,BD,0,1,1,0,200.0,0,INTERNET,1.378683,linux,1,1,0,7
999999,0,0.6,0.993391,-1,174,30,0.020422,14.942456,AA,655,3737.076479,3135.788094,3051.003293,14,8,CA,100,1,BB,0,1,15,1,200.0,0,INTERNET,1.947926,other,1,1,0,7


In [146]:
# One-Hot Encoding
df_encoded = pd.get_dummies(df, columns = ['payment_type', 'employment_status', 'device_os', 'housing_status'])

# Binary Encoding
df_encoded['source'] = df_encoded['source'].map({'INTERNET' : 1, 'TELEAPP' : 0})

In [147]:
X = df_encoded.iloc[:, 1:].to_numpy()
y = df_encoded['fraud_bool'].to_numpy()

In [148]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2)

for train_idx, test_idx in split.split(X, y):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

In [149]:
b_in = 10
w_in = np.array(np.random.rand(51))

In [239]:
def compute_cost(x, y, w, b):

    m = x.shape[0]
    cost = 0
    epsilon = 1e-15
    
    z = x @ w + b # Iloczyn skalarny dla każdej z obserwacji
    z = np.clip(z, -25, 25)
    f_wb = 1 / (1 + np.exp(-z.astype('float64')))
    loss = -y * np.log(f_wb + epsilon) - (1 - y) * np.log(1 - f_wb + epsilon)
    cost = np.sum(loss) / m
    
    return cost

In [245]:
compute_cost(X_train, y_train, w_in, b_in)

np.float64(24.724205912248554)

In [160]:
def compute_gradient(x, y, w, b):

    m = x.shape[0]

    z = x @ w + b
    z = np.clip(z, -25, 25)
    f_wb = 1/(1 + np.exp(-z.astype('float64')))
    error = f_wb - y

    dj_dw = 1/m * np.dot(x.T, error)
    dj_db = 1/m * np.sum(error)

    return dj_dw, dj_db

In [246]:
compute_gradient(X_train, y_train, w_final, b_final)

(array([-0.007561124992192829, -0.0043149937426349445,
        -0.06896624976813498, -1.266264998798084, -0.4496249995299969,
        -0.01152325081253529, -0.04639280097255699, -17.8797437281612,
        -57.23145113617352, -50.89594913710211, -52.48346673934126,
        -1.4849312474420546, -0.08216124986794478, -1.9550862481811766,
        -0.007297499992640358, -0.0027899999942076568,
        -0.009376249987701112, -0.1148862498492436, -0.0009024999969059587,
        -9.171524992857965, -0.0005524999996492895, -0.010907499986243503,
        -0.09061209741064949, -0.0038274999919482248,
        -0.011908749985829827, 0.0, -0.03931624995429711,
        -0.0014124999964167677, -0.004118749994831135,
        -0.004203749996479187, -0.0012924999983495384,
        -1.249999995955147e-06, -0.008913749989893572,
        -0.000963749998080802, -0.0008999999994760013,
        -9.999999963224359e-05, -5.374999968534604e-05,
        -8.999999938814249e-05, -7.499999993576887e-06,
        -0.00

In [261]:
def gradient_descent(x, y, w, b, gradient_function, cost_function, alpha, num_iter):

    cost_history = []

    for j in range(num_iter):
        dj_dw, dj_db = gradient_function(x, y, w, b)
        w = w - alpha * dj_dw
        b = b - alpha * dj_db

        if j % 10 == 0:
            cost = cost_function(x, y, w, b)
            cost_history.append(cost)
            print(f"Koszt iteracji nr.:{j} wyniósł: {cost}")
            
    return w, b, cost_history

In [262]:
num_iter = 101
alpha = 1

In [263]:
%%time
w_final, b_final, cost_history = gradient_descent(X_train, y_train, w_in2, b_in2, compute_gradient, compute_cost, alpha, num_iter)

Koszt iteracji nr.:0 wyniósł: 0.2757179559184424
Koszt iteracji nr.:10 wyniósł: 0.2757179559184424
Koszt iteracji nr.:20 wyniósł: 0.2757179559184424
Koszt iteracji nr.:30 wyniósł: 0.2757179559184424
Koszt iteracji nr.:40 wyniósł: 0.2757179559184424
Koszt iteracji nr.:50 wyniósł: 0.2757179559184424
Koszt iteracji nr.:60 wyniósł: 0.2757179559184424
Koszt iteracji nr.:70 wyniósł: 0.2757179559184424
Koszt iteracji nr.:80 wyniósł: 0.2757179559184424
Koszt iteracji nr.:90 wyniósł: 0.2757179559184424
Koszt iteracji nr.:100 wyniósł: 0.2757179559184424
CPU times: total: 6min 18s
Wall time: 6min 23s


In [264]:
def predictions(x, y, w, b):
    
    z = np.dot(x, w) + b
    probabilities = 1 / (1 + np.exp(-np.clip(z, -25, 25).astype('float64')))
    predictions = (probabilities >= 0.5).astype(int)
    accuracy = np.mean(predictions == y_test) * 100

    return accuracy

In [265]:
print(f"Dokładność modelu wynosi: {predictions(X_test, y_test, w_final, b_final):.2f}%") 

Dokładność modelu wynosi: 98.90%
