LOGISTIC REGRESSION; SKLEARN; CLEANED DATA

In [1]:
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Load your dataset
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/dataset/Fraud.csv'
df = pd.read_csv(DATA_PATH)

# Columns that never reach the model (identifiers, destination balances,
# the target itself and the rule-based flag).
columns_dropped = ["step", "nameOrig", "nameDest", "oldbalanceDest", "newbalanceDest", "isFraud", "isFlaggedFraud"]

# Encode categorical variables using LabelEncoder.
# Only object columns that survive the drop below are encoded: encoding a
# column that is discarded a few lines later costs time and changes nothing
# in the feature matrix.
# NOTE(review): LabelEncoder imposes an arbitrary integer order on the
# categories; for a linear model one-hot encoding is usually preferable.
encoder = {}
for col in df.select_dtypes('object').columns:
    if col in columns_dropped:
        continue
    encoder[col] = LabelEncoder()
    df[col] = encoder[col].fit_transform(df[col])

# Balance the classes by undersampling: keep every fraud row and draw an
# equally sized, seeded random sample of non-fraud rows.
# NOTE(review): this happens before the train/test split, so the test set is
# balanced as well and does not reflect the class ratio of the raw data.
maskFraud = df['isFraud'] == 1

fraud = df[maskFraud]
non_fraud = df[~maskFraud]

df_balanced = pd.concat([fraud, non_fraud.sample(len(fraud), random_state=42)])

# Separate features (X) and target (y); y must be taken before the drop.
y = df_balanced['isFraud']
df_balanced = df_balanced.drop(columns=columns_dropped)
X = df_balanced

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Initialize and train the logistic regression model
model = LogisticRegression()
model.fit(x_train, y_train)

# Predict on the held-out set
predictions = model.predict(x_test)

# Evaluate performance
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Print confusion matrix and classification report
print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))

print("\nClassification Report:")
print(classification_report(y_test, predictions))

Accuracy: 90.16%
Confusion Matrix:
[[2024  466]
 [  19 2419]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.81      0.89      2490
           1       0.84      0.99      0.91      2438

    accuracy                           0.90      4928
   macro avg       0.91      0.90      0.90      4928
weighted avg       0.92      0.90      0.90      4928



LOGISTIC REGRESSION; SELF MODEL; CLEANED DATA

In [4]:
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    learning_rate : float, default=0.1
        Step size applied to every gradient update.
    n_iters : int, default=2000
        Number of gradient-descent passes over the whole training set.

    Attributes
    ----------
    weights : numpy.ndarray or None
        One coefficient per feature; ``None`` until ``fit`` has been called.
    bias : float or None
        Intercept term; ``None`` until ``fit`` has been called.
    """

    def __init__(self, learning_rate=0.1, n_iters=2000):
        self.lr = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Learn ``weights`` and ``bias`` from the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric feature matrix (ndarray, DataFrame, nested list, ...).
        y : array-like of length n_samples
            Binary targets encoded as 0/1. A column vector is flattened.

        Returns
        -------
        LogisticRegression
            The fitted estimator itself, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, contains no samples, or ``y`` has a
            different number of entries than ``X`` has rows.
        """
        # Convert once up front: pandas inputs would otherwise be converted
        # again on every iteration, and a (n, 1) target would broadcast
        # against the (n,) predictions into an (n, n) matrix.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()

        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} entries"
            )

        self.weights = np.zeros(n_features)
        self.bias = 0.0

        # Gradient descent on the mean log-loss
        for _ in range(self.n_iters):
            linear_model = np.dot(X, self.weights) + self.bias
            error = self._sigmoid(linear_model) - y

            dw = (1 / n_samples) * np.dot(X.T, error)
            db = (1 / n_samples) * np.sum(error)

            self.weights -= self.lr * dw
            self.bias -= self.lr * db

        return self

    def predict_proba(self, X):
        """Return the estimated probability of class 1 for each row of ``X``.

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        """
        if self.weights is None:
            raise RuntimeError("This LogisticRegression instance is not fitted yet; call fit() first")
        X = np.asarray(X, dtype=float)
        return self._sigmoid(np.dot(X, self.weights) + self.bias)

    def predict(self, X, threshold=0.5):
        """Return hard 0/1 labels for each row of ``X``.

        A row is labelled 1 when its predicted probability is strictly
        greater than ``threshold`` (default 0.5).
        """
        return (self.predict_proba(X) > threshold).astype(int)

    def _sigmoid(self, x):
        """Logistic function, with the input clipped to [-500, 500].

        The clip keeps ``np.exp`` inside the float64 range so extreme
        activations do not trigger overflow warnings.
        """
        x = np.clip(x, -500, 500)
        return 1 / (1 + np.exp(-x))



# Load the dataset
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/dataset/Fraud.csv'
df = pd.read_csv(DATA_PATH)

# Columns that never reach the model (identifiers, destination balances,
# the target itself and the rule-based flag).
columns_dropped = ["step", "nameOrig", "nameDest", "oldbalanceDest", "newbalanceDest", "isFraud", "isFlaggedFraud"]

# Encode categorical variables using LabelEncoder.
# Only object columns that survive the drop below are encoded: encoding a
# column that is discarded a few lines later costs time and changes nothing
# in the feature matrix.
encoder = {}
for col in df.select_dtypes('object').columns:
    if col in columns_dropped:
        continue
    encoder[col] = LabelEncoder()
    df[col] = encoder[col].fit_transform(df[col])

# Balance the classes by undersampling: keep every fraud row and draw an
# equally sized, seeded random sample of non-fraud rows.
maskFraud = df['isFraud'] == 1

fraud = df[maskFraud]
non_fraud = df[~maskFraud]

df_balanced = pd.concat([fraud, non_fraud.sample(len(fraud), random_state=42)])

# Separate features (X) and target (y); y must be taken before the drop.
y = df_balanced['isFraud']
df_balanced = df_balanced.drop(columns=columns_dropped)
X = df_balanced

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Initialize and train the logistic regression model.
# NOTE(review): the features are fed in unscaled here, so the gradient steps
# are dominated by the largest-magnitude columns.
model = LogisticRegression(learning_rate=0.1, n_iters=2000)
model.fit(x_train, y_train)

# Predict on the held-out set
predictions = model.predict(x_test)

# Evaluate performance
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Print confusion matrix and classification report
print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))

print("\nClassification Report:")
print(classification_report(y_test, predictions))

Accuracy: 90.50%
Confusion Matrix:
[[2041  449]
 [  19 2419]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.82      0.90      2490
           1       0.84      0.99      0.91      2438

    accuracy                           0.91      4928
   macro avg       0.92      0.91      0.90      4928
weighted avg       0.92      0.91      0.90      4928



LOGISTIC REGRESSION; SELF MODEL; SCALED VALUES; CLEANED DATA

In [5]:
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    learning_rate : float, default=0.1
        Step size applied to every gradient update.
    n_iters : int, default=2000
        Number of gradient-descent passes over the whole training set.

    Attributes
    ----------
    weights : numpy.ndarray or None
        One coefficient per feature; ``None`` until ``fit`` has been called.
    bias : float or None
        Intercept term; ``None`` until ``fit`` has been called.
    """

    def __init__(self, learning_rate=0.1, n_iters=2000):
        self.lr = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Learn ``weights`` and ``bias`` from the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric feature matrix (ndarray, DataFrame, nested list, ...).
        y : array-like of length n_samples
            Binary targets encoded as 0/1. A column vector is flattened.

        Returns
        -------
        LogisticRegression
            The fitted estimator itself, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, contains no samples, or ``y`` has a
            different number of entries than ``X`` has rows.
        """
        # Convert once up front: pandas inputs would otherwise be converted
        # again on every iteration, and a (n, 1) target would broadcast
        # against the (n,) predictions into an (n, n) matrix.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()

        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} entries"
            )

        self.weights = np.zeros(n_features)
        self.bias = 0.0

        # Gradient descent on the mean log-loss
        for _ in range(self.n_iters):
            linear_model = np.dot(X, self.weights) + self.bias
            error = self._sigmoid(linear_model) - y

            dw = (1 / n_samples) * np.dot(X.T, error)
            db = (1 / n_samples) * np.sum(error)

            self.weights -= self.lr * dw
            self.bias -= self.lr * db

        return self

    def predict_proba(self, X):
        """Return the estimated probability of class 1 for each row of ``X``.

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        """
        if self.weights is None:
            raise RuntimeError("This LogisticRegression instance is not fitted yet; call fit() first")
        X = np.asarray(X, dtype=float)
        return self._sigmoid(np.dot(X, self.weights) + self.bias)

    def predict(self, X, threshold=0.5):
        """Return hard 0/1 labels for each row of ``X``.

        A row is labelled 1 when its predicted probability is strictly
        greater than ``threshold`` (default 0.5).
        """
        return (self.predict_proba(X) > threshold).astype(int)

    def _sigmoid(self, x):
        """Logistic function, with the input clipped to [-500, 500].

        The clip keeps ``np.exp`` inside the float64 range so extreme
        activations do not trigger overflow warnings.
        """
        x = np.clip(x, -500, 500)
        return 1 / (1 + np.exp(-x))


def preprocess_data(df, scaler):
    """Build a class-balanced, label-encoded, scaled feature matrix from ``df``.

    Steps:
      1. Keep every row with ``isFraud == 1`` and draw an equally sized
         random sample (``random_state=42``) of the remaining rows.
      2. Drop the identifier, destination-balance, target and flag columns.
      3. Label-encode any object column that is still present. The encoder
         is fitted on the full input column, so the integer codes do not
         depend on which rows were sampled.
      4. Fit ``scaler`` on the resulting features and transform them.

    The input frame is left untouched, so the function can be called
    repeatedly on the same ``df`` (for example once per scaler).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing ``isFraud`` and every column listed in
        ``columns_dropped`` below.
    scaler : object
        Any object exposing ``fit_transform(X)``; it is fitted in place.

    Returns
    -------
    X_scaled
        Whatever ``scaler.fit_transform`` returns for the balanced features.
    y : pandas.Series
        ``isFraud`` values of the balanced rows, keeping the original index.

    Notes
    -----
    NOTE(review): the scaler is fitted on all balanced rows. If the caller
    splits the returned data into train and test sets afterwards, the test
    rows have already influenced the scaling statistics (data leakage).
    Fixing that requires scaling after the split.
    """
    columns_dropped = ["step", "nameOrig", "nameDest", "oldbalanceDest", "newbalanceDest", "isFraud", "isFlaggedFraud"]

    maskFraud = df['isFraud'] == 1
    fraud = df[maskFraud]
    non_fraud = df[~maskFraud]
    df_balanced = pd.concat([fraud, non_fraud.sample(len(fraud), random_state=42)])

    y = df_balanced['isFraud']
    # drop() returns a new frame, so the assignments below never reach `df`.
    X = df_balanced.drop(columns=columns_dropped)

    # Encode only the columns that are actually used as features; the
    # dropped object columns would be encoded for nothing.
    for col in X.select_dtypes('object').columns:
        X[col] = LabelEncoder().fit(df[col]).transform(X[col])

    # Scale the features
    X_scaled = scaler.fit_transform(X)

    return X_scaled, y

# Read the raw transactions
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/Fraud.csv')

# One scaler per experiment
min_max_scaler, standard_scaler = MinMaxScaler(), StandardScaler()

# Balanced + scaled features and targets: MinMaxScaler first, StandardScaler second
X_mm, y_mm = preprocess_data(df, min_max_scaler)
X_ss, y_ss = preprocess_data(df, standard_scaler)

# 70/30 hold-out split with the same seed for both feature sets
x_train_mm, x_test_mm, y_train_mm, y_test_mm = train_test_split(
    X_mm, y_mm, test_size=0.3, random_state=0
)
x_train_ss, x_test_ss, y_train_ss, y_test_ss = train_test_split(
    X_ss, y_ss, test_size=0.3, random_state=0
)

# Train one model per scaled feature set (identical hyper-parameters)
model_mm = LogisticRegression(learning_rate=0.1, n_iters=2000)
model_ss = LogisticRegression(learning_rate=0.1, n_iters=2000)
model_mm.fit(x_train_mm, y_train_mm)
model_ss.fit(x_train_ss, y_train_ss)

# --- Results: MinMaxScaler ---
predictions_mm = model_mm.predict(x_test_mm)
accuracy_mm = accuracy_score(y_test_mm, predictions_mm)
print(f"Accuracy with MinMaxScaler: {accuracy_mm * 100:.2f}%")
print("Confusion Matrix with MinMaxScaler:", confusion_matrix(y_test_mm, predictions_mm), sep="\n")
print("\nClassification Report with MinMaxScaler:", classification_report(y_test_mm, predictions_mm), sep="\n")

# --- Results: StandardScaler ---
predictions_ss = model_ss.predict(x_test_ss)
accuracy_ss = accuracy_score(y_test_ss, predictions_ss)
print(f"\nAccuracy with StandardScaler: {accuracy_ss * 100:.2f}%")
print("Confusion Matrix with StandardScaler:", confusion_matrix(y_test_ss, predictions_ss), sep="\n")
print("\nClassification Report with StandardScaler:", classification_report(y_test_ss, predictions_ss), sep="\n")


Accuracy with MinMaxScaler: 57.22%
Confusion Matrix with MinMaxScaler:
[[1482 1008]
 [1100 1338]]

Classification Report with MinMaxScaler:
              precision    recall  f1-score   support

           0       0.57      0.60      0.58      2490
           1       0.57      0.55      0.56      2438

    accuracy                           0.57      4928
   macro avg       0.57      0.57      0.57      4928
weighted avg       0.57      0.57      0.57      4928


Accuracy with StandardScaler: 81.15%
Confusion Matrix with StandardScaler:
[[2313  177]
 [ 752 1686]]

Classification Report with StandardScaler:
              precision    recall  f1-score   support

           0       0.75      0.93      0.83      2490
           1       0.90      0.69      0.78      2438

    accuracy                           0.81      4928
   macro avg       0.83      0.81      0.81      4928
weighted avg       0.83      0.81      0.81      4928

