## DATA FETCHING

**Note:** the data loaded below has already been filtered by the `is_vpn` check.

In [1]:
import pandas as pd
import numpy as np

# Read the processed click data and discard the leftover 'Unnamed: 0' column
# (presumably an index written out by an earlier to_csv call).
df = pd.read_csv('../data/processed/data_with_ts_vpn_removed.csv').drop(columns=['Unnamed: 0'])


In [2]:
# Show the column labels of the freshly loaded frame.
df.columns

Index(['CLNT_RMT_IP', 'BRWSR_NAME', 'AMS_TRANS_RSN_CD', 'CLICK_TS'], dtype='object')

In [None]:
# Binary target: 1 where the transaction reason code equals 0, otherwise 0.
# The raw reason-code column is removed once the label exists.
df = (
    df.assign(status=np.where(df['AMS_TRANS_RSN_CD'] == 0, 1, 0))
      .drop(columns=['AMS_TRANS_RSN_CD'])
)


In [None]:
from user_agents import parse
import pandas as pd
import swifter  # for parallel processing with Pandas (install with `pip install swifter`)

# Dictionary to cache parsed user-agent strings and reduce redundant parsing
user_agent_cache = {}

def extract_user_agent_features_optimized(ua_string):
    """Return browser, OS and device-type features for one user-agent string.

    Parameters
    ----------
    ua_string : str or missing value
        Raw user-agent text. Anything that is not a ``str`` (None, NaN, ...)
        is treated as an empty user-agent string instead of being handed to
        ``parse``, which expects text.

    Returns
    -------
    dict
        Keys 'browser', 'os' and 'device_type'. The same dict object is
        returned for repeated inputs because results are memoised in the
        module-level ``user_agent_cache``; callers should not mutate it.
    """
    # Missing values in the column show up as non-string objects; normalise
    # them so they parse (and cache) as one empty user agent.
    if not isinstance(ua_string, str):
        ua_string = ''

    cached = user_agent_cache.get(ua_string)
    if cached is not None:
        return cached

    user_agent = parse(ua_string)

    # The order of these checks is significant: the first flag that is set
    # wins, so a user agent flagged as both mobile and bot is 'mobile'.
    if user_agent.is_mobile:
        device_type = 'mobile'
    elif user_agent.is_pc:
        device_type = 'desktop'
    elif user_agent.is_tablet:
        device_type = 'tablet'
    elif user_agent.is_bot:
        device_type = 'bot'
    else:
        device_type = 'other'

    result = {
        'browser': user_agent.browser.family,
        'os': user_agent.os.family,
        'device_type': device_type,
    }
    user_agent_cache[ua_string] = result
    return result

# Parse every user-agent string through swifter's apply, then expand the
# resulting dicts into one column per extracted feature.
df_user_agent_features = (
    df['BRWSR_NAME']
    .swifter.apply(extract_user_agent_features_optimized)
    .apply(pd.Series)
)


In [None]:
# Put the parsed user-agent columns next to the original data and drop the
# raw user-agent text, which is no longer needed.
df = pd.concat([df, df_user_agent_features], axis=1).drop(columns=['BRWSR_NAME'])


In [None]:
def categorize_browser(browser_column):
    """Collapse infrequent browser families into a single 'Other' bucket.

    Parameters
    ----------
    browser_column : pandas.Series
        Browser family names, one per row.

    Returns
    -------
    pandas.Series
        A new Series (same index and name) in which every value that is not
        in the keep-list below, including missing values, is replaced by
        'Other'. The input Series is left untouched.
    """
    # Browsers to keep, chosen by the 10,000 row threshold.
    popular_browsers = [
        'Chrome Mobile', 'Chrome', 'Mobile Safari', 'Mobile Safari UI/WKWebView',
        'Edge', 'Firefox', 'Samsung Internet', 'Safari', 'Google', 'Opera',
        'FacebookBot', 'Facebook', 'ImagesiftBot',
    ]

    # Vectorised membership test instead of a per-row Python lambda.
    return browser_column.where(browser_column.isin(popular_browsers), 'Other')

# Function to categorize OS: keep the families listed below, bucket the rest
def categorize_os(os_column):
    """Collapse infrequent operating-system families into 'Other'.

    Parameters
    ----------
    os_column : pandas.Series
        Operating-system family names, one per row.

    Returns
    -------
    pandas.Series
        A new Series (same index and name) where any value outside the
        keep-list, including missing values, becomes 'Other'.
    """
    # OS families to keep (row-count threshold). 'Other' is listed so values
    # that are already 'Other' pass through unchanged.
    popular_os = ['Android', 'Other', 'iOS', 'Windows', 'Mac OS X', 'Linux', 'Ubuntu', 'Chrome OS']

    # Vectorised membership test instead of a per-row Python lambda.
    return os_column.where(os_column.isin(popular_os), 'Other')

# Bucket rare browser / OS families, then print the transformed frame.
df = df.assign(
    browser=categorize_browser(df['browser']),
    os=categorize_os(df['os']),
)

print(df)

In [None]:
import re

# Function to check if the IP address is valid
def is_valid_ipv4(ip):
    """Return True only when ``ip`` is a string holding a dotted-quad IPv4 address.

    Parameters
    ----------
    ip : object
        Candidate value. Anything that is not a ``str`` (None, NaN, numbers,
        ...) is reported as invalid rather than raising.

    Returns
    -------
    bool
        True if the whole string consists of four dot-separated decimal
        octets in the range 0-255, otherwise False.
    """
    if not isinstance(ip, str):  # covers None / NaN as well as stray numeric values
        return False
    pattern = r'^(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$'
    # fullmatch (not match): '$' alone would still accept a trailing newline.
    # NOTE(review): octets with a leading zero such as '08' pass this pattern;
    # confirm that the downstream integer conversion treats them as intended.
    return re.fullmatch(pattern, ip) is not None

# Keep only the rows whose client IP is a well-formed IPv4 address.
# The explicit copy makes the result an independent frame, so the column
# assignments that follow do not trigger pandas' SettingWithCopyWarning.
valid_ips_mask = df['CLNT_RMT_IP'].apply(is_valid_ipv4)
df = df[valid_ips_mask].copy()

import socket
import struct
from sklearn.preprocessing import OneHotEncoder


# 1. Convert IP to integer
def ipv4_to_int(ip):
    """Return the IPv4 address string ``ip`` as an unsigned 32-bit integer.

    The address is packed to four bytes with ``socket.inet_aton`` and those
    bytes are read back as one big-endian (network order) unsigned int.
    """
    packed = socket.inet_aton(ip)
    (as_int,) = struct.unpack("!I", packed)
    return as_int

# 2. Numeric IP feature and parsed click timestamp
df = df.assign(
    ip_as_int=df['CLNT_RMT_IP'].apply(ipv4_to_int),
    CLICK_TS=pd.to_datetime(df['CLICK_TS']),
)

# 3. Split the timestamp into calendar / clock components, then
# 4. drop the raw IP and timestamp columns they were derived from
df = df.assign(
    year=df['CLICK_TS'].dt.year,
    month=df['CLICK_TS'].dt.month,
    day=df['CLICK_TS'].dt.day,
    hour=df['CLICK_TS'].dt.hour,
    minute=df['CLICK_TS'].dt.minute,
    second=df['CLICK_TS'].dt.second,
).drop(columns=['CLNT_RMT_IP', 'CLICK_TS'])

# 5. One-hot encode the categorical features
categorical_features = ['browser', 'os', 'device_type']
df_encoded = pd.get_dummies(df, columns=categorical_features)

# Display the processed DataFrame
print(df_encoded)

In [None]:
# Show the column labels produced by the one-hot encoding step.
df_encoded.columns

In [None]:
# One-hot columns that the training set contained, in training order.
cat_features = [
    # browser
    'browser_Chrome', 'browser_Chrome Mobile', 'browser_Edge',
    'browser_Facebook', 'browser_FacebookBot', 'browser_Firefox',
    'browser_Google', 'browser_ImagesiftBot', 'browser_Mobile Safari',
    'browser_Mobile Safari UI/WKWebView', 'browser_Opera', 'browser_Other',
    'browser_Safari', 'browser_Samsung Internet',
    # operating system
    'os_Android', 'os_Chrome OS', 'os_Linux', 'os_Mac OS X', 'os_Other',
    'os_Ubuntu', 'os_Windows', 'os_iOS',
    # device type
    'device_type_bot', 'device_type_desktop', 'device_type_mobile',
    'device_type_other', 'device_type_tablet',
]

# Any training-set column that this data did not produce is added as all zeros.
df_encoded = df_encoded.assign(
    **{name: 0 for name in cat_features if name not in df_encoded.columns}
)

# Training-set columns first (in training order), every other column after them.
df_encoded = df_encoded[
    cat_features + [name for name in df_encoded.columns if name not in cat_features]
]



In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
import joblib

# Step 1: Split df_encoded into features (X) and the 'status' target (y).
# The feature list is defined in this cell so the cell also runs on a fresh
# kernel; it lists the model's input columns in the order they are selected.
feature_list_model = [
    'ip_as_int', 'year', 'month', 'day', 'hour', 'minute', 'second',
    'browser_Chrome', 'browser_Chrome Mobile', 'browser_Edge',
    'browser_Facebook', 'browser_FacebookBot', 'browser_Firefox',
    'browser_Google', 'browser_ImagesiftBot', 'browser_Mobile Safari',
    'browser_Mobile Safari UI/WKWebView', 'browser_Opera', 'browser_Other',
    'browser_Safari', 'browser_Samsung Internet', 'os_Android',
    'os_Chrome OS', 'os_Linux', 'os_Mac OS X', 'os_Other', 'os_Ubuntu',
    'os_Windows', 'os_iOS', 'device_type_bot', 'device_type_desktop',
    'device_type_mobile', 'device_type_other', 'device_type_tablet',
]
X_new = df_encoded[feature_list_model]  # Features, restricted to the model's inputs
y_new = df_encoded['status']  # Target (the actual labels)

# Load the saved classifier.
# NOTE: joblib.load unpickles the file; only load model files from a trusted source.
model = joblib.load('../models/random_forest_classifier.joblib')

# Step 2: Make predictions on new data
y_pred = model.predict(X_new)

# Step 3: Calculate evaluation metrics.
# average='weighted' averages the per-class scores weighted by class support,
# so these are overall figures, not the scores of a single class.
accuracy = accuracy_score(y_new, y_pred)
precision = precision_score(y_new, y_pred, average='weighted')
recall = recall_score(y_new, y_pred, average='weighted')
f1 = f1_score(y_new, y_pred, average='weighted')

# Print the scores
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')


In [None]:

# Input columns selected for the model, in order. The evaluation cell also
# reads this name.
feature_list_model = [
    # numeric features
    'ip_as_int', 'year', 'month', 'day', 'hour', 'minute', 'second',
    # one-hot browser family
    'browser_Chrome', 'browser_Chrome Mobile', 'browser_Edge',
    'browser_Facebook', 'browser_FacebookBot', 'browser_Firefox',
    'browser_Google', 'browser_ImagesiftBot', 'browser_Mobile Safari',
    'browser_Mobile Safari UI/WKWebView', 'browser_Opera', 'browser_Other',
    'browser_Safari', 'browser_Samsung Internet',
    # one-hot operating system
    'os_Android', 'os_Chrome OS', 'os_Linux', 'os_Mac OS X', 'os_Other',
    'os_Ubuntu', 'os_Windows', 'os_iOS',
    # one-hot device type
    'device_type_bot', 'device_type_desktop', 'device_type_mobile',
    'device_type_other', 'device_type_tablet',
]


In [3]:
import pandas as pd
import numpy as np
import re
from user_agents import parse
import swifter
import joblib
import socket
import struct
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Load the pre-trained model once at module level; process_and_predict reads
# this `model` name when it calls model.predict.
# NOTE(review): joblib.load unpickles the file, so only load model files from
# a trusted source.
model = joblib.load('../models/random_forest_classifier.joblib')

# Caching dictionary for user-agent parsing
user_agent_cache = {}

# Function to parse user-agent features
def extract_user_agent_features_optimized(ua_string):
    """Parse one user-agent string into browser / OS / device-type features.

    Parameters
    ----------
    ua_string : str or missing value
        Raw user-agent text. Non-string input (None, NaN, ...) is parsed as
        an empty user-agent string rather than being passed to ``parse``.

    Returns
    -------
    dict
        ``{'browser': ..., 'os': ..., 'device_type': ...}``. Results are
        memoised in ``user_agent_cache`` and the cached dict itself is
        returned, so callers should treat it as read-only.
    """
    # A missing value is not text; map every such value to the same empty key.
    if not isinstance(ua_string, str):
        ua_string = ''

    hit = user_agent_cache.get(ua_string)
    if hit is not None:
        return hit

    user_agent = parse(ua_string)

    # First matching flag wins, in this order: mobile, pc, tablet, bot.
    if user_agent.is_mobile:
        device_type = 'mobile'
    elif user_agent.is_pc:
        device_type = 'desktop'
    elif user_agent.is_tablet:
        device_type = 'tablet'
    elif user_agent.is_bot:
        device_type = 'bot'
    else:
        device_type = 'other'

    result = {
        'browser': user_agent.browser.family,
        'os': user_agent.os.family,
        'device_type': device_type,
    }
    user_agent_cache[ua_string] = result
    return result

# Function to categorize browsers
def categorize_browser(browser_column):
    """Replace every browser family outside the keep-list with 'Other'.

    Takes a pandas Series of browser family names and returns a new Series
    with the same index and name. Missing values also become 'Other'. The
    input Series is not modified.
    """
    popular_browsers = ['Chrome Mobile', 'Chrome', 'Mobile Safari', 'Mobile Safari UI/WKWebView',
                        'Edge', 'Firefox', 'Samsung Internet', 'Safari', 'Google', 'Opera',
                        'FacebookBot', 'Facebook', 'ImagesiftBot']
    # Vectorised membership test instead of a per-row Python lambda.
    keep = browser_column.isin(popular_browsers)
    return browser_column.where(keep, 'Other')

# Function to categorize operating systems
def categorize_os(os_column):
    """Replace every OS family outside the keep-list with 'Other'.

    Takes a pandas Series of operating-system family names and returns a new
    Series with the same index and name. Missing values also become 'Other'.
    The input Series is not modified.
    """
    popular_os = ['Android', 'Other', 'iOS', 'Windows', 'Mac OS X', 'Linux', 'Ubuntu', 'Chrome OS']
    # Vectorised membership test instead of a per-row Python lambda.
    keep = os_column.isin(popular_os)
    return os_column.where(keep, 'Other')

# Function to check valid IPv4
def is_valid_ipv4(ip):
    """Tell whether ``ip`` is a string containing exactly one dotted-quad IPv4 address.

    Non-string input (None, NaN, numbers, ...) yields False instead of an
    exception. A string is valid only if, in its entirety, it is four
    dot-separated decimal octets each in the range 0-255.
    """
    if not isinstance(ip, str):
        return False
    pattern = r'^(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$'
    # re.match would let '$' accept a trailing newline; fullmatch does not.
    # NOTE(review): octets with a leading zero such as '08' pass this pattern;
    # confirm that the downstream integer conversion treats them as intended.
    return re.fullmatch(pattern, ip) is not None

# Function to convert IP to integer
def ipv4_to_int(ip):
    """Convert an IPv4 address string to its unsigned 32-bit integer value.

    ``socket.inet_aton`` packs the address into four bytes, which are then
    decoded as a single network-order (big-endian) unsigned integer.
    """
    four_bytes = socket.inet_aton(ip)
    (number,) = struct.unpack("!I", four_bytes)
    return number

# Main pipeline function
def _prepare_click_features(df):
    """Return a cleaned copy of ``df`` holding the label and the raw (un-encoded) features."""
    # Work on a copy: the caller's frame stays intact, so the pipeline can be
    # re-run on the same input without a KeyError on already-dropped columns.
    df = df.copy()

    # 1. Create 'status' column and drop 'AMS_TRANS_RSN_CD'
    df['status'] = np.where(df['AMS_TRANS_RSN_CD'] == 0, 1, 0)
    df = df.drop(columns='AMS_TRANS_RSN_CD')

    # 2. Parse and add user-agent features
    df_user_agent_features = (
        df['BRWSR_NAME']
        .swifter.apply(extract_user_agent_features_optimized)
        .apply(pd.Series)
    )
    df = pd.concat([df, df_user_agent_features], axis=1).drop(columns='BRWSR_NAME')

    # 3. Categorize browser and os
    df['browser'] = categorize_browser(df['browser'])
    df['os'] = categorize_os(df['os'])

    # 4. Filter rows with valid IPs. The copy detaches the result from the
    #    unfiltered frame so the assignments below raise no SettingWithCopyWarning.
    valid_ips_mask = df['CLNT_RMT_IP'].apply(is_valid_ipv4).astype(bool)
    df = df[valid_ips_mask].copy()

    # 5. Convert IP to integer
    df['ip_as_int'] = df['CLNT_RMT_IP'].apply(ipv4_to_int)

    # 6. Extract datetime features and drop original columns
    click_ts = pd.to_datetime(df['CLICK_TS'])
    for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
        df[part] = getattr(click_ts.dt, part)
    return df.drop(columns=['CLNT_RMT_IP', 'CLICK_TS'])


def _print_evaluation(y_true, y_pred):
    """Print weighted accuracy / precision / recall / F1 and the per-class report."""
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    # labels=[0, 1] pins the two target names to their classes, so the report
    # still works when a batch happens to contain only one of them.
    # NOTE(review): status 1 (reason code == 0) is reported as 'Fraud' here;
    # confirm that this is the intended label polarity.
    class_report = classification_report(
        y_true, y_pred, labels=[0, 1], target_names=['Not Fraud', 'Fraud']
    )

    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')
    print("\nClassification Report:")
    print(class_report)


def process_and_predict(df):
    """Engineer features from raw click rows, score them with ``model`` and print metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'CLNT_RMT_IP', 'BRWSR_NAME',
        'AMS_TRANS_RSN_CD' and 'CLICK_TS'. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per input row with a valid IPv4 address. The input columns
        consumed by the pipeline are replaced by 'status', 'browser', 'os',
        'device_type', 'ip_as_int', 'year', 'month', 'day', 'hour', 'minute',
        'second' and 'predicted_status'.

    Notes
    -----
    Predictions come from the module-level ``model`` object. Evaluation
    metrics are printed, not returned.
    """
    prepared = _prepare_click_features(df)

    # 7. One-hot encode categorical features
    categorical_features = ['browser', 'os', 'device_type']
    df_encoded = pd.get_dummies(prepared, columns=categorical_features)

    # 8./9. Select the model's input columns in order; a one-hot column that
    #       this batch did not produce is filled with 0.
    feature_list_model = [
        'ip_as_int', 'year', 'month', 'day', 'hour', 'minute', 'second',
        'browser_Chrome', 'browser_Chrome Mobile', 'browser_Edge', 'browser_Facebook',
        'browser_FacebookBot', 'browser_Firefox', 'browser_Google', 'browser_ImagesiftBot',
        'browser_Mobile Safari', 'browser_Mobile Safari UI/WKWebView', 'browser_Opera', 'browser_Other',
        'browser_Safari', 'browser_Samsung Internet', 'os_Android', 'os_Chrome OS', 'os_Linux',
        'os_Mac OS X', 'os_Other', 'os_Ubuntu', 'os_Windows', 'os_iOS', 'device_type_bot',
        'device_type_desktop', 'device_type_mobile', 'device_type_other', 'device_type_tablet'
    ]
    X_new = df_encoded.reindex(columns=feature_list_model, fill_value=0)
    y_new = prepared['status']

    # 10. Predict status
    y_pred = model.predict(X_new)
    prepared['predicted_status'] = y_pred

    # 11. Calculate evaluation metrics and classification report
    _print_evaluation(y_new, y_pred)

    # Return DataFrame with predicted status
    return prepared

# Run the full pipeline on the frame loaded in the first cell. To score a
# different file, load it into `df` first, e.g.
# df = pd.read_csv("path_to_your_data.csv")
df_with_predictions = process_and_predict(df)


Pandas Apply:   0%|          | 0/21377 [00:00<?, ?it/s]

Accuracy: 0.9416
Precision: 0.9388
Recall: 0.9416
F1 Score: 0.9380

Classification Report:
              precision    recall  f1-score   support

   Not Fraud       0.86      0.65      0.74      2739
       Fraud       0.95      0.98      0.97     18619

    accuracy                           0.94     21358
   macro avg       0.91      0.82      0.85     21358
weighted avg       0.94      0.94      0.94     21358



---

**"In simple terms, this report helps us understand how well our model is able to identify fraudulent activity and non-fraudulent activity (called 'Not Fraud' here). Let's break down what each of these numbers means:"**

1. **Accuracy (94.16%)**:
   - **What it means**: Out of all the cases we tested, about 94% of the time, the model correctly identified whether a transaction was fraudulent or not. Accuracy is an overall measure, showing the percentage of correct predictions.
   - **Why it matters**: A high accuracy means that most of the time, the model gets it right, which is important because we want to avoid mistakenly flagging good users as fraudulent or letting fraud slip through.

2. **Precision (93.88%)**:
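   - **What it means**: This figure is the support-weighted average of the per-class precision values in the classification report (86% for 'Not Fraud', 95% for 'Fraud'), so it summarises both classes rather than fraud alone. For the 'Fraud' class specifically, when the model predicts a transaction as fraud, it's correct about 95% of the time.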
   - **Why it matters**: Precision is important for minimizing "false alarms" or mistakenly labeling a legitimate transaction as fraud. High precision means that when we call something fraud, it's very likely to actually be fraud.

3. **Recall (94.16%)**:
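   - **What it means**: This figure is the support-weighted average of the per-class recall values (65% for 'Not Fraud', 98% for 'Fraud'), which is why it equals the accuracy. For the 'Fraud' class specifically, the model is catching about 98% of all actual fraud cases.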
   - **Why it matters**: High recall is crucial because missing a fraudulent transaction can be costly. This tells us the model is good at catching most of the fraud cases that happen.

4. **F1 Score (93.80%)**:
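   - **What it means**: The F1 Score is a combined measure that considers both precision and recall. Here, the support-weighted average across both classes is close to 94% (0.97 for 'Fraud' and 0.74 for 'Not Fraud'), meaning the model is good at catching fraud but noticeably weaker on the 'Not Fraud' class.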
   - **Why it matters**: F1 score is a useful overall measure when there's a trade-off between catching fraud and avoiding false alarms, giving us an idea of the balance between precision and recall.

5. **Classification Report Breakdown**:
   - **For 'Not Fraud' (legitimate transactions)**:
     - Precision: 86% — When the model says a transaction is 'Not Fraud', it's correct 86% of the time.
     - Recall: 65% — Out of all actual legitimate transactions, the model correctly identifies 65%.
   - **For 'Fraud'**:
     - Precision: 95% — When it says a transaction is fraud, it's right 95% of the time.
     - Recall: 98% — Out of all actual fraudulent transactions, it correctly identifies 98%.

---

**"In summary, this report shows that the model does a very good job at identifying fraud cases accurately (98% recall for fraud) while also being careful with false alarms (95% precision for fraud). However, it still makes mistakes with non-fraudulent cases: recall for 'Not Fraud' is only 65%, so roughly one in three genuine transactions is labelled as fraud. So, overall, this model can be trusted to flag fraud reliably, but with clear room to improve on avoiding incorrect fraud alerts for genuine users."**

---

In [9]:
# Show the last rows to compare 'status' with 'predicted_status'.
df_with_predictions.tail()

Unnamed: 0,status,browser,os,device_type,ip_as_int,year,month,day,hour,minute,second,predicted_status
21372,1,Mobile Safari,iOS,mobile,1443205945,2024,10,22,10,18,52,1
21373,0,Chrome Mobile,Android,mobile,1676284053,2024,10,22,10,19,31,1
21374,1,Other,Android,mobile,1518545074,2024,10,22,10,21,19,1
21375,1,Chrome Mobile,Android,mobile,2952957987,2024,10,22,10,21,32,1
21376,1,Edge,Windows,desktop,1471780084,2024,10,22,10,23,29,1


In [10]:
# Row / column count of the scored frame (rows with invalid IPs were dropped).
df_with_predictions.shape

(21358, 12)