In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Description:
This dataset captures transaction patterns and behaviors that could indicate potential fraud in card transactions. The data is composed of several features designed to reflect the transactional context such as geographical location, transaction medium, and spending behavior relative to the user's history.

## Attribute Description:
1. **distance_from_home:**  This is a numerical feature representing the geographical distance in kilometers between the transaction location and the cardholder's home address.
2. **distance_from_last_transaction:** This numerical attribute measures the distance in kilometers from the location of the last transaction to the current transaction location.
3. **ratio_to_median_purchase_price:** A numeric ratio that compares the transaction's price to the median purchase price of the user's transaction history.
4. **repeat_retailer:** A binary attribute where '1' signifies that the transaction was conducted at a retailer previously used by the cardholder, and '0' indicates a new retailer.
5. **used_chip:** This binary feature indicates whether the transaction was made using a chip (1) or not (0).
6. **used_pin_number:** Another binary feature, where '1' signifies the use of a PIN number for the transaction, and '0' shows no PIN number was used.
7. **online_order:** This attribute identifies whether the purchase was made online ('1') or offline ('0').
8. **fraud:** A binary target variable indicating whether the transaction was fraudulent ('1') or not ('0').

In [None]:
# Load the credit card transaction data from OpenML (dataset id 45955):
# https://www.openml.org/search?type=data&status=active&id=45955
#
# The file is stored as Parquet, a column-oriented, open-source storage format
# known for efficient compression and encoding of large volumes of data.
#
# Rows containing null/NA values are dropped as part of the same expression,
# which yields a new frame instead of mutating one in place with `inplace=True`.
fraud_tx_data_df = pd.read_parquet(
    "https://data.openml.org/datasets/0004/45955/dataset_45955.pq"
).dropna()

## Save / Backup the data to a csv file in case the data / api is removed from the openml site.
# fraud_tx_data_df.to_csv("credit-card-transaction-data.csv")
fraud_tx_data_df.head()


Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1,1,0,0,0
1,10.829943,0.175592,1.294219,1,0,0,0,0
2,5.091079,0.805153,0.427715,1,0,0,1,0
3,2.247564,5.600044,0.362663,1,1,0,1,0
4,44.190936,0.566486,2.222767,1,1,0,1,0


### This is a binary classification dataset: the "fraud" feature indicates whether a transaction is fraudulent.

In [None]:
# Summarize the data set: row count, column names, non-null counts, dtypes and memory usage
fraud_tx_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
 #   Column                          Non-Null Count    Dtype  
---  ------                          --------------    -----  
 0   distance_from_home              1000000 non-null  float64
 1   distance_from_last_transaction  1000000 non-null  float64
 2   ratio_to_median_purchase_price  1000000 non-null  float64
 3   repeat_retailer                 1000000 non-null  uint8  
 4   used_chip                       1000000 non-null  uint8  
 5   used_pin_number                 1000000 non-null  uint8  
 6   online_order                    1000000 non-null  uint8  
 7   fraud                           1000000 non-null  uint8  
dtypes: float64(3), uint8(5)
memory usage: 27.7 MB


In [None]:
# Column "fraud" is the target to predict:
#   0 - no fraud
#   1 - fraud
# Count how many transactions fall into each class.
fraud_tx_data_df["fraud"].value_counts()

Unnamed: 0_level_0,count
fraud,Unnamed: 1_level_1
0,912597
1,87403


### The data is imbalanced: of 1,000,000 transactions, only 87,403 (about 8.7%) are fraud.
#### To-Do: check what should be done for an imbalanced dataset

### Trying with Logistic Regression.

### Create the training data set for X and y

In [None]:
# The "fraud" column is the prediction target (y)
y = fraud_tx_data_df["fraud"]
# Preview the first ten target values
y.head(10)

Unnamed: 0,fraud
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [None]:
# Build the feature matrix: every column except the "fraud" target.
# DataFrame.drop returns a new frame, so fraud_tx_data_df is left untouched
# and no explicit copy or in-place mutation is needed.
X = fraud_tx_data_df.drop(columns="fraud")
# Preview the first ten feature rows
X.head(10)

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order
0,57.877857,0.31114,1.94594,1,1,0,0
1,10.829943,0.175592,1.294219,1,0,0,0
2,5.091079,0.805153,0.427715,1,0,0,1
3,2.247564,5.600044,0.362663,1,1,0,1
4,44.190936,0.566486,2.222767,1,1,0,1
5,5.586408,13.261073,0.064768,1,0,0,0
6,3.724019,0.956838,0.278465,1,0,0,1
7,4.848247,0.320735,1.27305,1,0,1,0
8,0.876632,2.503609,1.516999,0,0,0,0
9,8.839047,2.970512,2.361683,1,0,0,1


In [None]:
# Split the data into training and testing sets using random_state=3 so the split is repeatable.
# No test_size is passed, so train_test_split's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)

### Scale the feature data (X) using the StandardScaler

In [None]:
# Create the StandardScaler object (it is fitted on the training data in a later cell)
scaler = StandardScaler()
scaler

In [None]:
# Fit the scaler on the training data and scale it in a single pass.
# fit_transform both fits the scaler and returns the scaled array, so there is
# no need to discard its result and call transform(X_train) a second time.
X_train_scaled = scaler.fit_transform(X_train)

# Preview the first four scaled training rows
X_train_scaled[:4]

array([[-0.22139338,  0.02528477, -0.57679666,  0.36643627, -0.73517181,
        -0.33490642,  0.73255692],
       [-0.08641302, -0.09726048,  0.12149586,  0.36643627,  1.36022626,
         2.98590872,  0.73255692],
       [-0.10001875, -0.16563608, -0.23506724,  0.36643627, -0.73517181,
        -0.33490642, -1.36508164],
       [-0.35241603, -0.20197118, -0.21196995,  0.36643627, -0.73517181,
        -0.33490642,  0.73255692]])

In [None]:
# Scale the X test data with the scaler fitted on the training data (transform only, no re-fit)
X_test_scaled = scaler.transform(X_test)
X_test_scaled[:4]

array([[-0.05245033, -0.04722844, -0.4396794 ,  0.36643627,  1.36022626,
        -0.33490642,  0.73255692],
       [ 0.94522164, -0.15615665, -0.02029087,  0.36643627,  1.36022626,
        -0.33490642, -1.36508164],
       [ 0.21656128,  1.77174885, -0.61811477,  0.36643627,  1.36022626,
        -0.33490642, -1.36508164],
       [ 0.245146  , -0.18439965,  0.23950017,  0.36643627,  1.36022626,
         2.98590872,  0.73255692]])

### Model and Fit the Data to a Logistic Regression:

In [None]:
# Create the Logistic Regression model with random_state=3 and a maximum of 1000 iterations
lr = LogisticRegression(random_state=3, max_iter=1000)
lr

In [None]:
# Fit the logistic regression model on the scaled training data and keep the result as lr_model
lr_model = lr.fit(X_train_scaled, y_train)

In [None]:
# Score the model on the scaled training and testing data, rounded to two decimal places
print(f"Training Data Score: {round(lr_model.score(X_train_scaled, y_train),2)}")
print(f"Testing Data Score: {round(lr_model.score(X_test_scaled, y_test),2)}")

Training Data Score: 0.96
Testing Data Score: 0.96


In [None]:
# Make and save testing predictions with the trained Logistic Regression model using the scaled test data
testing_predictions = lr.predict(X_test_scaled)

# Review the predictions
testing_predictions

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [None]:
# Calculate the accuracy score by evaluating `y_test` vs. `testing_predictions`.
# NOTE(review): the classes are imbalanced (912,597 non-fraud vs 87,403 fraud in the
# full dataset), so always predicting 0 would already score roughly 0.91; accuracy
# alone may overstate how well fraud is detected.
accuracy_score(y_test, testing_predictions)

0.958576

### Model and Fit the Data to a Random Forest Classifier

In [None]:
# Create a Random Forest Classifier with random_state=3 and fit it on the scaled training data
rfc = RandomForestClassifier(random_state=3)
rfc.fit(X_train_scaled, y_train)

In [None]:
# Make and save testing predictions with the trained Random Forest Classifier model using the scaled test data.
# NOTE: this reuses the name `testing_predictions`, replacing the Logistic Regression predictions.
testing_predictions = rfc.predict(X_test_scaled)

# Review the predictions
testing_predictions

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [None]:
# Calculate the accuracy score by evaluating `y_test` vs. `testing_predictions`
# (which at this point holds the Random Forest predictions).
accuracy_score(y_test, testing_predictions)

0.999992