In [20]:
# Step 1: START
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


# Step 2: Reading the dataset
# Replace 'file_name.csv' with your actual dataset file
file_name = 'data-purchase-card-pcard-fiscal-year-2014.csv'
df = pd.read_csv(file_name)

# Step 3: Data Cleaning and Preprocessing
# Assuming 'Class' column contains 0 for normal and 1 for fraud
# Check for null values and drop rows with missing values
df.dropna(inplace=True)

# Separating the fraud and normal transactions
normal = df[df['Class'] == 0]
fraud = df[df['Class'] == 1]

# Under-sampling the normal transactions to balance the dataset
normal_sampled = normal.sample(len(fraud), random_state=42)
balanced_data = pd.concat([normal_sampled, fraud])

# Splitting features and target
X = balanced_data.drop(columns=['Class'])  # Features
y = balanced_data['Class']  # Target

# Identifying numerical and categorical features
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

# Creating a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features)])

# Scaling and normalizing the features using the pipeline
X_scaled = preprocessor.fit_transform(X)


# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Step 4: Training the data using the SVM algorithm
svm_classifier = SVC(kernel='linear', random_state=50)
svm_classifier.fit(X_train, y_train)

# Predicting the test data
y_pred = svm_classifier.predict(X_test)

# Step 5: Calculating performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Storing the results
results = {
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall
}

# Displaying the results
print("Results:")
for metric, value in results.items():
    print(f"{metric}: {value:.2f}")

# Step 6: Displaying Fraudulent and Non-Fraudulent Transactions in the Results

# Create a DataFrame to display predictions along with actual values
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

# Display the first few rows of the results
print("\nSample Predictions:")
print(results_df.head(10))

# Count fraudulent and non-fraudulent transactions in the test set
fraudulent_count = results_df[results_df['Actual'] == 1].shape[0]
non_fraudulent_count = results_df[results_df['Actual'] == 0].shape[0]

print("\nTransaction Counts in Test Set:")
print(f"Fraudulent Transactions: {fraudulent_count}")
print(f"Non-Fraudulent Transactions: {non_fraudulent_count}")

# Analyze predictions for fraudulent transactions
fraud_predictions = results_df[results_df['Actual'] == 1]
correct_fraud_predictions = fraud_predictions[fraud_predictions['Actual'] == fraud_predictions['Predicted']].shape[0]
incorrect_fraud_predictions = fraud_predictions.shape[0] - correct_fraud_predictions

print("\nFraudulent Transaction Predictions:")
print(f"Correctly Predicted Fraudulent Transactions: {correct_fraud_predictions}")
print(f"Incorrectly Predicted Fraudulent Transactions: {incorrect_fraud_predictions}")

Results:
Accuracy: 0.55
Precision: 0.54
Recall: 0.57

Sample Predictions:
        Actual  Predicted
42756        1          1
1686         0          1
111690       1          1
128360       0          1
20746        0          1
27627        1          0
93788        1          0
193189       0          0
102442       1          1
4920         1          1

Transaction Counts in Test Set:
Fraudulent Transactions: 146
Non-Fraudulent Transactions: 150

Fraudulent Transaction Predictions:
Correctly Predicted Fraudulent Transactions: 83
Incorrectly Predicted Fraudulent Transactions: 63


In [5]:
# Step 1: START
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score
# Import ColumnTransformer from sklearn.compose
from sklearn.compose import ColumnTransformer
# Import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

# Step 2: Loading the dataset
# Replace 'file_name.csv' with your actual dataset file
file_name = 'data-purchase-card-pcard-fiscal-year-2014.csv'
df = pd.read_csv(file_name)

# Step 3: Cleaning and Normalization of Data
# Assuming 'Class' column contains 0 for normal and 1 for fraud
# Drop rows with missing values
df.dropna(inplace=True)

# Separating the fraud and normal transactions
normal = df[df['Class'] == 0]
fraud = df[df['Class'] == 1]

# Resampling: Under-sampling normal transactions to balance the dataset
normal_sampled = normal.sample(len(fraud), random_state=42)
balanced_data = pd.concat([normal_sampled, fraud])

# Splitting features and target
X = balanced_data.drop(columns=['Class'])  # Features
y = balanced_data['Class']  # Target

# Identifying numerical and categorical features
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

# Creating a preprocessing pipeline for both numerical and categorical features
# Use OneHotEncoder to transform categorical features into numerical representations
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features)
    ])

# Fit and transform the data using the preprocessor
X_processed = preprocessor.fit_transform(X)

# Splitting the dataset into train and test sets using the processed data
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.3, random_state=42)

# Step 4: Train the Model and Fit the Trained Model
# Initializing the KNN classifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)  # You can adjust 'n_neighbors'
knn_classifier.fit(X_train, y_train)

# Predicting the test data
y_pred = knn_classifier.predict(X_test)

# Step 5: Calculating performance metrics and counts
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Counting the number of fraudulent and valid transactions
fraudulent_transactions = np.sum(y_pred == 1)
non_fraudulent_transactions = np.sum(y_pred == 0)

# Storing the results
results = {
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
    "Fraudulent Transactions (Predicted)": fraudulent_transactions,
    "Non-Fraudulent Transactions (Predicted)": non_fraudulent_transactions
}

# Displaying the results
print("Results:")
for metric, value in results.items():
    print(f"{metric}: {value:.2f}" if isinstance(value, float) else f"{metric}: {value}")

# Step 6: Displaying Fraudulent and Non-Fraudulent Transactions in the Results

# Create a DataFrame to display predictions along with actual values
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

# Display the first few rows of the results
print("\nSample Predictions:")
print(results_df.head(10))

# Count fraudulent and non-fraudulent transactions in the test set
fraudulent_count = results_df[results_df['Actual'] == 1].shape[0]
non_fraudulent_count = results_df[results_df['Actual'] == 0].shape[0]

print("\nTransaction Counts in Test Set:")
print(f"Fraudulent Transactions: {fraudulent_count}")
print(f"Non-Fraudulent Transactions: {non_fraudulent_count}")

# Analyze predictions for fraudulent transactions
fraud_predictions = results_df[results_df['Actual'] == 1]
correct_fraud_predictions = fraud_predictions[fraud_predictions['Actual'] == fraud_predictions['Predicted']].shape[0]
incorrect_fraud_predictions = fraud_predictions.shape[0] - correct_fraud_predictions

print("\nFraudulent Transaction Predictions:")
print(f"Correctly Predicted Fraudulent Transactions: {correct_fraud_predictions}")
print(f"Incorrectly Predicted Fraudulent Transactions: {incorrect_fraud_predictions}")


Results:
Accuracy: 0.55
Precision: 0.54
Recall: 0.62
Fraudulent Transactions (Predicted): 166
Non-Fraudulent Transactions (Predicted): 130

Sample Predictions:
        Actual  Predicted
42756        1          0
1686         0          1
111690       1          0
128360       0          1
20746        0          1
27627        1          1
93788        1          0
193189       0          0
102442       1          1
4920         1          1

Transaction Counts in Test Set:
Fraudulent Transactions: 146
Non-Fraudulent Transactions: 150

Fraudulent Transaction Predictions:
Correctly Predicted Fraudulent Transactions: 90
Incorrectly Predicted Fraudulent Transactions: 56
