In [3]:
# Import necessary libraries
import numpy as np
import pandas as pd
from collections import Counter
from google.colab import drive
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
!pip install imblearn
from imblearn.over_sampling import SMOTE

# Mount Google Drive so that paths under /content/drive are readable from this runtime
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# 3)
# Load dataset from Google Drive
def load_dataset(file_path):
    """Read the CSV at ``file_path`` and return it as a pandas DataFrame.

    ``file_path`` is handed straight to ``pd.read_csv`` with its default
    parsing options, so anything that function accepts (a path string or a
    file-like object) works here.
    """
    dataset = pd.read_csv(file_path)
    return dataset

# 4)
# Path to your dataset in Google Drive
file_path = '/content/drive/MyDrive/Colab Notebooks/Fraud.csv'

# Load data
df = load_dataset(file_path)
# A bare `df.head()` is only rendered when it is the last expression of a cell;
# anywhere else its value is silently discarded. Print it so the preview of the
# first rows is always shown.
print(df.head())

# 5)
# Encode categorical variables using LabelEncoder
# Every object-dtype column is replaced in place by integer codes. The fitted
# encoder for each column is kept in `encoder`, keyed by column name, so the
# codes can be mapped back to the original values later.
encoder = {}
for col in df.select_dtypes('object').columns:
    encoder[col] = LabelEncoder()
    df[col] = encoder[col].fit_transform(df[col])
# A bare `encoder` expression is discarded unless it is the last line of a
# cell, so print the mapping of column name -> fitted encoder explicitly.
print(encoder)

# 6)
# Target column of the full dataset
y = df['isFraud']
print(y)

# 7)
# Boolean selector that is True on the rows labelled as fraud
mask = df['isFraud'].eq(1)
fraud = df.loc[mask]
print(fraud)

# 8)
# Complement of the selector: every row that is not labelled as fraud
nonFraud = df.loc[~mask]
print(nonFraud)

# 9)
# Undersample the non-fraud rows down to the number of fraud rows (fixed seed),
# then stack both groups so each class contributes the same number of rows.
fraud_count = len(fraud)
nonFraud_subset = nonFraud.sample(n=fraud_count, random_state=42)
balanced_df = pd.concat([fraud, nonFraud_subset])
print(len(balanced_df))

# 10)
# Features (X): the balanced frame without the columns listed below
excluded_columns = [
    'step',
    'nameOrig',
    'nameDest',
    'oldbalanceDest',
    'newbalanceDest',
    'isFraud',
    'isFlaggedFraud',
]
X = balanced_df.drop(columns=excluded_columns)
print(X)

# 11)
# Target (y): the fraud label of the same balanced rows
y = balanced_df['isFraud']
print(y)

# 12)
# Euclidean distance
def euclidean_distance(point1, point2):
    """Return the Euclidean (L2) distance between ``point1`` and ``point2``.

    The two arguments must support element-wise subtraction with each other
    (for example NumPy arrays of the same shape). The result is the square
    root of the sum of the squared element-wise differences.
    """
    delta = point1 - point2
    squared = delta ** 2
    total = np.sum(squared)
    return np.sqrt(total)

# KNN algorithm
def knn(train_data, test_point, k=3):
    """Classify ``test_point`` by majority vote among its ``k`` nearest training rows.

    Parameters
    ----------
    train_data : array-like, shape (n_samples, n_features + 1)
        Training matrix. The last column of every row is the class label;
        all preceding columns are the numeric features.
    test_point : array-like, shape (n_features,)
        Feature vector to classify.
    k : int, default 3
        Number of nearest neighbours that vote. If ``k`` is larger than the
        number of training rows, every row votes.

    Returns
    -------
    The most frequent label among the ``k`` rows closest to ``test_point``
    (Euclidean distance). When several labels are equally frequent, the one
    that occurs first in nearest-to-farthest order wins.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, or ``train_data`` is not a non-empty
        2-D array.
    """
    # A non-positive k would make the slice below select the wrong rows
    # (negative k) or none at all (k == 0), so reject it up front.
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    train = np.asarray(train_data)
    if train.ndim != 2 or train.shape[0] == 0:
        raise ValueError(
            "train_data must be a non-empty 2-D array of [features..., label] rows"
        )

    # Split the label column off; features are coerced to float so the
    # distance arithmetic also works when the array has object dtype.
    features = np.asarray(train[:, :-1], dtype=float)
    labels = train[:, -1]

    # Euclidean distance to every training row in one vectorised pass
    # instead of one Python-level call per row.
    distances = np.sqrt(np.sum((test_point - features) ** 2, axis=1))

    # A stable sort keeps equally distant rows in their original order.
    nearest = np.argsort(distances, kind="stable")[:k]

    votes = Counter(labels[nearest])
    return votes.most_common(1)[0][0]

# 13)
def predict_and_evaluate(x_train, y_train, x_test, y_test, k=3):
    """Classify every row of ``x_test`` with ``knn`` and score the result.

    Parameters
    ----------
    x_train : array-like, shape (n_train, n_features)
        Training features.
    y_train : array-like, shape (n_train,)
        Training labels. A pandas Series, NumPy array or list all work.
    x_test : iterable of feature vectors
        Points to classify; each element is passed to ``knn`` as one query.
    y_test : array-like
        True labels for ``x_test``, used only to compute the accuracy.
    k : int, default 3
        Number of neighbours forwarded to ``knn``.

    Returns
    -------
    tuple
        ``(accuracy, predictions)`` where ``accuracy`` is the value returned
        by ``accuracy_score(y_test, predictions)`` and ``predictions`` is a
        list with one predicted label per element of ``x_test``.

    Raises
    ------
    ValueError
        If ``x_train`` and ``y_train`` do not have the same number of rows.
    """
    # np.asarray accepts Series, ndarray and list alike, whereas the previous
    # `.to_frame()` call only existed on a pandas Series.
    label_column = np.asarray(y_train).reshape(-1, 1)
    if len(label_column) != len(x_train):
        raise ValueError(
            f"x_train has {len(x_train)} rows but y_train has {len(label_column)} labels"
        )

    # knn expects each training row as [features..., label].
    train_data = np.hstack((x_train, label_column))

    predictions = [knn(train_data, test_point, k) for test_point in x_test]

    accuracy = accuracy_score(y_test, predictions)
    return accuracy, predictions

# 14)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# KNN ranks neighbours by raw Euclidean distance, so a column with a large
# numeric range (amount and the balances reach the millions) would drown out a
# small-range one (the integer-encoded type). Rescale every feature to [0, 1].
# The scaler is fitted on the training split only so that no statistics of the
# test split leak into the model.
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Run KNN algorithm defined above
k = 3
# Training matrix in the layout knn expects: [features..., label] per row.
train_matrix = np.hstack((x_train_scaled, y_train.values.reshape(-1, 1)))
# Sanity check on a single test point before scoring the whole test split.
definedknnresult = knn(train_matrix, x_test_scaled[0], k)
print(definedknnresult)

# Run predict_and_evaluate function
accuracy, predictions = predict_and_evaluate(x_train_scaled, y_train, x_test_scaled, y_test, k)
print(f"Accuracy: {accuracy}")

# Convert predictions to a numpy array for further evaluation
y_pred = np.array(predictions)

# 15)
# Rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

# 16)
# Per-class precision, recall and F1.
class_report = classification_report(y_test, y_pred)
print(class_report)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
0          0
1          0
2          1
3          1
4          0
          ..
6362615    1
6362616    1
6362617    1
6362618    1
6362619    1
Name: isFraud, Length: 6362620, dtype: int64
         step  type      amount  nameOrig  oldbalanceOrg  newbalanceOrig  \
2           1     4      181.00   1002156         181.00             0.0   
3           1     1      181.00   5828262         181.00             0.0   
251         1     4     2806.00   1379875        2806.00             0.0   
252         1     1     2806.00   3619815        2806.00             0.0   
680         1     4    20128.00   1232211       20128.00             0.0   
...       ...   ...         ...       ...            ...             ...   
6362615   743     1   339682.13   5651847      339682.13             0.0   
6362616   743     4  6311409.28   1737278     6311409.28             0.0   