In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (roc_auc_score, roc_curve, precision_recall_curve,
                              average_precision_score, accuracy_score,
                              precision_score, recall_score, f1_score)
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from collections import Counter
import matplotlib.patches as mpatches
import time
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD

In [5]:
pip install gdown



In [11]:
import gdown

In [18]:
# Download the dataset archive from Google Drive by file id (the recorded run saved it as archive.zip).
!gdown 1b3uGV1endF3uIni2p8-drs6F9omBUPOK

Downloading...
From (original): https://drive.google.com/uc?id=1b3uGV1endF3uIni2p8-drs6F9omBUPOK
From (redirected): https://drive.google.com/uc?id=1b3uGV1endF3uIni2p8-drs6F9omBUPOK&confirm=t&uuid=0d319e5a-b2e2-4af1-b03c-a08945cca0cc
To: /content/archive.zip
100% 69.2M/69.2M [00:00<00:00, 200MB/s]


In [22]:
!unzip -q archive.zip

In [23]:

# Load the transactions table extracted from the archive.
df = pd.read_csv('creditcard.csv')

# Gather the summary figures once, then report them.
n_rows, n_cols = df.shape
missing_total = df.isnull().sum().sum()
class_counts = df['Class'].value_counts()

print("Osnovne informacije o skupu podataka:")
print(f"Broj uzoraka: {n_rows}")
print(f"Broj obeležja: {n_cols}")
print(f"Nedostajuće vrednosti: {missing_total}")
print("\nRaspodela klasa:")
print(class_counts)

Osnovne informacije o skupu podataka:
Broj uzoraka: 284807
Broj obeležja: 31
Nedostajuće vrednosti: 0

Raspodela klasa:
Class
0    284315
1       492
Name: count, dtype: int64


In [None]:
# Pie chart of the class balance in the loaded data.
# `colors` is kept at module level because a later pie chart reuses it.
colors = ['blue', 'red']
class_share = df['Class'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(6, 6))
class_share.plot(
    kind='pie',
    ax=ax,
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
    labels=None,
)
ax.set_title('Raspodela klasa (0 = legitimna, 1 = prevara)')
ax.set_ylabel('')
ax.legend(['Legitimna transakcija', 'Prevara'], title='Class',
          loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()

In [None]:
# Heatmap of the pairwise correlations between all columns of the frame.
corr_full = df.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_full, annot=False, cmap='coolwarm', cbar=True, ax=ax)
ax.set_title('Korelaciona matrica numeričkih obeležja')
plt.show()

In [None]:
# Distribution of transaction amount and transaction time, side by side.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement and draws the same kind of figure.
fig, ax = plt.subplots(1, 2, figsize=(18, 4))

panels = [
    ('Amount', 'r', 'Distribucija vrednosti transakcija'),
    ('Time', 'b', 'Distribucija vremena transakcije'),
]
for panel_ax, (column, panel_color, panel_title) in zip(ax, panels):
    sns.histplot(df[column].values, ax=panel_ax, color=panel_color,
                 kde=True, stat='density')
    panel_ax.set_title(panel_title, fontsize=14)
plt.show()

In [None]:
# Robust-scale the two raw columns into new 'scaled_*' columns.
# The same scaler object is refit for each column, so after this cell it
# holds the statistics of 'Time' only.
rob_scaler = RobustScaler()

amount_column = df['Amount'].to_numpy().reshape(-1, 1)
time_column = df['Time'].to_numpy().reshape(-1, 1)

df['scaled_amount'] = rob_scaler.fit_transform(amount_column)
df['scaled_time'] = rob_scaler.fit_transform(time_column)

In [None]:
# Drop the raw 'Time'/'Amount' columns and move the scaled versions to the front.
# Done as one column selection instead of in-place drops/inserts, so the cell
# can be re-run without a KeyError once the raw columns are gone.
scaled_amount = df['scaled_amount']
scaled_time = df['scaled_time']

leading_cols = ['scaled_amount', 'scaled_time']
excluded_cols = set(leading_cols) | {'Time', 'Amount'}
remaining_cols = [col for col in df.columns if col not in excluded_cols]

df = df[leading_cols + remaining_cols]

In [None]:
# Separate the rows by label: Class == 1 is fraud, Class == 0 is non-fraud.
class_labels = df['Class']
fraud = df.loc[class_labels == 1]
non_fraud = df.loc[class_labels == 0]

In [None]:
# Shuffle the full frame with a fixed seed.
df = df.sample(frac=1, random_state=42)

# Random under-sampling: draw as many non-fraud rows as there are fraud rows,
# stack both groups and shuffle the result.
n_fraud = len(fraud)
undersampled_non_fraud = non_fraud.sample(n=n_fraud, random_state=42)
df_undersampled = pd.concat(
    [undersampled_non_fraud, fraud], axis=0
).sample(frac=1, random_state=42)

print(f"\nBalansirani skup: {df_undersampled.shape}")
print(df_undersampled['Class'].value_counts())

In [None]:
# Correlation heatmap of the balanced (under-sampled) subset.
corr_under = df_undersampled.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_under, annot=False, cmap='coolwarm', cbar=True, ax=ax)
ax.set_title('Korelaciona matrica (Undersampled)')
plt.show()

In [None]:
def visoko_korelisana_obelezja(df, target_col, threshold=0.5):
    """Return the names of features strongly correlated with ``target_col``.

    Args:
        df: DataFrame containing the features and the target column.
        target_col: Name of the target column; it is excluded from the result.
        threshold: Minimum absolute correlation (exclusive) a feature needs
            in order to be returned.

    Returns:
        List of column names, ordered by signed correlation from the most
        positive to the most negative.
    """
    target_corr = df.corr(numeric_only=True)[target_col]
    feature_corr = target_corr.drop(target_col)
    strong = feature_corr.loc[feature_corr.abs() > threshold]
    return list(strong.sort_values(ascending=False).index)

# Features of the balanced subset whose absolute correlation with 'Class' exceeds the default threshold (0.5).
korelisana = visoko_korelisana_obelezja(df_undersampled, 'Class')
print(f"\nVisoko korelisana obeležja: {korelisana}")

In [None]:
# Box plots of four selected features against the class label (balanced subset).
# `features` stays at module level because a later cell reuses it.
features = ['V17', 'V14', 'V12', 'V10']
f, axes = plt.subplots(ncols=len(features), figsize=(20, 4))

for panel_ax, feature in zip(axes, features):
    sns.boxplot(x="Class", y=feature, hue="Class", data=df_undersampled,
                ax=panel_ax, dodge=False, legend=False)
    panel_ax.set_title(f'{feature} vs Class')
plt.show()

In [None]:
def remove_outliers(df, columns, k=2):
    """Drop rows lying outside ``k`` IQRs of the quartiles, column by column.

    The filter is applied sequentially: quartiles for each column are
    computed on the rows that survived the previous columns.

    Args:
        df: Input DataFrame; it is not modified.
        columns: Iterable of column names to filter on.
        k: Multiplier of the inter-quartile range that sets the fence width.

    Returns:
        A filtered copy of ``df`` keeping only rows within
        ``[Q1 - k*IQR, Q3 + k*IQR]`` (bounds inclusive) for every column.
    """
    kept = df.copy()
    for name in columns:
        values = kept[name]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        spread = q3 - q1
        # between() is inclusive on both ends, matching >= / <= comparisons.
        inside = values.between(q1 - k * spread, q3 + k * spread)
        kept = kept[inside]
    return kept

# Filter the balanced subset on the highly correlated features, using the default k=2 fence.
df_outlier = remove_outliers(df_undersampled, korelisana)
print(f"\nSkup nakon uklanjanja outlier-a: {df_outlier.shape}")

In [None]:
# Same box plots as before, drawn on the outlier-filtered subset.
f, axes = plt.subplots(ncols=4, figsize=(20, 4))

for panel_ax, feature in zip(axes, features):
    sns.boxplot(x="Class", y=feature, hue="Class", data=df_outlier,
                ax=panel_ax, dodge=False, legend=False)
    panel_ax.set_title(f'{feature} (bez outlier-a)')
plt.show()

In [None]:
# Features / target of the outlier-filtered balanced subset.
X_under = df_outlier.drop(columns=['Class'])
y_under = df_outlier['Class']
under_matrix = X_under.values

print("\nRedukcija dimenzionalnosti:")

# Project onto two components with three methods, timing each one.
t0 = time.time()
tsne_model = TSNE(n_components=2, random_state=42)
X_reduced_tsne = tsne_model.fit_transform(under_matrix)
print(f"t-SNE: {time.time() - t0:.2f}s")

t0 = time.time()
pca_model = PCA(n_components=2, random_state=42)
X_reduced_pca = pca_model.fit_transform(under_matrix)
print(f"PCA: {time.time() - t0:.2f}s")

t0 = time.time()
svd_model = TruncatedSVD(n_components=2, algorithm='randomized', random_state=42)
X_reduced_svd = svd_model.fit_transform(under_matrix)
print(f"SVD: {time.time() - t0:.2f}s")

In [None]:
# Scatter plots of the three 2-D embeddings, coloured by class.
# Each embedding is drawn once, with explicit per-point colours taken from the
# same hex values as the legend patches. Previously every axis was drawn twice
# (the first pass with inverted colours, fully hidden by the second) and the
# legend colours did not match the colormap.
f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(24, 6))
f.suptitle('Klasteri nakon redukcije dimenzionalnosti', fontsize=14)

legit_color, fraud_color = '#0A0AFF', '#AF0000'
blue_patch = mpatches.Patch(color=legit_color, label='Legitimna')
red_patch = mpatches.Patch(color=fraud_color, label='Prevara')

# One colour per sample: fraud (Class == 1) in red, everything else in blue.
point_colors = np.where(np.asarray(y_under) == 1, fraud_color, legit_color)

embeddings = [
    (ax1, X_reduced_tsne, 't-SNE'),
    (ax2, X_reduced_pca, 'PCA'),
    (ax3, X_reduced_svd, 'Truncated SVD'),
]
for panel_ax, embedding, panel_title in embeddings:
    panel_ax.scatter(embedding[:, 0], embedding[:, 1], c=point_colors, linewidths=2)
    panel_ax.set_title(panel_title, fontsize=14)
    panel_ax.grid(True)
    panel_ax.legend(handles=[blue_patch, red_patch])
plt.show()

In [None]:
# Hold out 20% of the balanced subset for testing.
# stratify keeps the class ratio equal in both parts, which matters on a set
# this small and matches the stratified splits used for the SMOTE data.
X_train_under, X_test_under, y_train_under, y_test_under = train_test_split(
    X_under, y_under, test_size=0.2, stratify=y_under, random_state=42)


In [None]:
# L2-penalised logistic regression (C=0.01, lbfgs, up to 1000 iterations) fitted on the under-sampled training split.
log_reg = LogisticRegression(C=0.01, penalty='l2', solver='lbfgs', max_iter=1000)
log_reg.fit(X_train_under, y_train_under)

In [None]:
# 5-nearest-neighbours classifier fitted on the under-sampled training split.
knears = KNeighborsClassifier(n_neighbors=5, algorithm='auto')
knears.fit(X_train_under, y_train_under)

In [None]:
# RBF-kernel support vector classifier (C=1) with probability estimates enabled, fitted on the under-sampled training split.
svc = SVC(C=1, kernel='rbf', probability=True, random_state=42)
svc.fit(X_train_under, y_train_under)

In [None]:
# Shallow decision tree (gini, depth <= 4, at least 5 samples per leaf) fitted on the under-sampled training split.
tree_clf = DecisionTreeClassifier(criterion='gini', max_depth=4,
                                   min_samples_leaf=5, random_state=42)
tree_clf.fit(X_train_under, y_train_under)

In [None]:
# Array versions of the features and target of the full frame.
# NOTE(review): X and y are not referenced by any other cell visible in this notebook.
X = df.drop("Class", axis=1).values
y = df["Class"].values

In [None]:
# Shuffled, seeded stratified 5-fold splitter and empty per-metric score lists.
# NOTE(review): neither object is used by the other visible cells (the cross-validation calls pass cv=5 directly).
KFold = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
scores = {"accuracy": [], "precision": [], "recall": [], "f1": [], "auc": []}

In [None]:
# Out-of-fold scores for each classifier on the under-sampled training split.
# These arrays feed roc_curve / roc_auc_score, which need continuous scores.
# KNN and the decision tree have no decision_function, so the positive-class
# probability (column 1 of predict_proba) is used; hard 0/1 labels would
# collapse the ROC curve to a single operating point.
log_reg_pred = cross_val_predict(log_reg, X_train_under, y_train_under,
                                 cv=5, method="decision_function")
knears_pred = cross_val_predict(knears, X_train_under, y_train_under,
                                cv=5, method="predict_proba")[:, 1]
svc_pred = cross_val_predict(svc, X_train_under, y_train_under,
                             cv=5, method="decision_function")
tree_pred = cross_val_predict(tree_clf, X_train_under, y_train_under,
                              cv=5, method="predict_proba")[:, 1]

In [None]:
# False/true positive rates for each classifier from its cross-validated predictions; the thresholds are discarded.
log_fpr, log_tpr, _ = roc_curve(y_train_under, log_reg_pred)
knear_fpr, knear_tpr, _ = roc_curve(y_train_under, knears_pred)
svc_fpr, svc_tpr, _ = roc_curve(y_train_under, svc_pred)
tree_fpr, tree_tpr, _ = roc_curve(y_train_under, tree_pred)

In [None]:
# ROC AUC of each classifier, computed from its cross-validated predictions.
under_cv_outputs = [
    ('Logistic Regression', log_reg_pred),
    ('K-Nearest Neighbors', knears_pred),
    ('Support Vector Classifier', svc_pred),
    ('Decision Tree', tree_pred),
]
for clf_label, cv_output in under_cv_outputs:
    print(f'{clf_label}: {roc_auc_score(y_train_under, cv_output):.4f}')

In [None]:
# ROC curves of all four classifiers on one figure (under-sampled data),
# with each curve's AUC in the legend and the chance diagonal for reference.
under_roc_curves = [
    ('Logistic Regression', log_fpr, log_tpr, log_reg_pred),
    ('K-Nearest Neighbors', knear_fpr, knear_tpr, knears_pred),
    ('Support Vector Classifier', svc_fpr, svc_tpr, svc_pred),
    ('Decision Tree', tree_fpr, tree_tpr, tree_pred),
]

plt.figure(figsize=(16, 8))
plt.title('ROC Curve - Svi klasifikatori', fontsize=18)
for clf_label, fpr_values, tpr_values, cv_output in under_roc_curves:
    auc_value = roc_auc_score(y_train_under, cv_output)
    plt.plot(fpr_values, tpr_values, label=f'{clf_label} Score: {auc_value:.4f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.axis([-0.01, 1, 0, 1])
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.annotate('Minimalni ROC skor 50%\n(Nasumično pogađanje)',
             xy=(0.5, 0.5), xytext=(0.6, 0.3),
             arrowprops=dict(facecolor='#6E726D', shrink=0.05))
plt.legend(fontsize=12)
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# ROC curve of the logistic regression alone (under-sampled data).
log_reg_auc = roc_auc_score(y_train_under, log_reg_pred)

fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title('Logistic Regression ROC Curve', fontsize=16)
ax.plot(log_fpr, log_tpr, 'b-', linewidth=2, label=f'AUC = {log_reg_auc:.4f}')
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlabel('False Positive Rate', fontsize=16)
ax.set_ylabel('True Positive Rate', fontsize=16)
ax.axis([-0.01, 1, 0, 1])
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Features and target of the full frame.
X_original = df.drop(['Class'], axis=1)
y_original = df['Class']

# Oversample with SMOTE using a fixed seed.
# NOTE(review): the resampler is fitted on the whole dataset, before any
# train/test split, so any split drawn from X_res / y_res has synthetic rows on
# both sides. Consider resampling only the training portion.
X_res, y_res = SMOTE(random_state=42).fit_resample(X_original, y_original)

print(f"\nBalansirani skup (SMOTE): {X_res.shape}")
print(f"Raspodela klasa:\n{pd.Series(y_res).value_counts()}")

In [None]:
# Pie chart of the class balance after SMOTE; reuses the `colors` list defined for the first pie chart.
smote_class_share = pd.Series(y_res).value_counts().sort_index()

fig, ax = plt.subplots(figsize=(6, 6))
smote_class_share.plot(
    kind='pie',
    ax=ax,
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
    labels=None,
)
ax.set_title('Raspodela klasa nakon SMOTE')
ax.set_ylabel('')
ax.legend(['Legitimna transakcija', 'Prevara'], title='Class',
          loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()

In [None]:
# Stratified 80/20 split of the SMOTE-resampled data with a fixed seed.
# NOTE(review): X_train / X_test / y_train / y_test are not referenced by any other visible cell.
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, train_size=0.8, stratify=y_res, random_state=42)

print(f"\nTrening skup: {X_train.shape}")
print(f"Test skup: {X_test.shape}")

In [None]:
# Put the resampled features and labels back into one frame and plot its correlation matrix.
smote_features = pd.DataFrame(X_res)
smote_labels = pd.DataFrame(y_res, columns=['Class'])
df_smote = pd.concat([smote_features, smote_labels], axis=1)

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df_smote.corr(), annot=False, cmap='coolwarm', cbar=True, ax=ax)
ax.set_title('Korelaciona matrica (SMOTE)')
plt.show()

In [None]:
# Split the ORIGINAL data first, then oversample the training part only.
# Splitting data that was already SMOTE-resampled puts synthetic rows
# (interpolated from training neighbours) into the test set, which leaks
# information and inflates every test metric. Here the test set holds only real
# transactions at their natural class ratio, and SMOTE sees the training rows alone.
X_train_real, X_test_smote, y_train_real, y_test_smote = train_test_split(
    X_original, y_original, test_size=0.2, stratify=y_original, random_state=42)

X_train_smote, y_train_smote = SMOTE(random_state=42).fit_resample(
    X_train_real, y_train_real)

print(f"\nTrening skup: {X_train_smote.shape}")
print(f"Test skup: {X_test_smote.shape}")

In [None]:
# L2-penalised logistic regression (C=0.01, lbfgs, up to 1000 iterations) fitted on the SMOTE training split.
log_reg_smote = LogisticRegression(C=0.01, penalty='l2', solver='lbfgs', max_iter=1000)
log_reg_smote.fit(X_train_smote, y_train_smote)

In [None]:
# 5-nearest-neighbours classifier fitted on the SMOTE training split.
knears_smote = KNeighborsClassifier(n_neighbors=5, algorithm='auto')
knears_smote.fit(X_train_smote, y_train_smote)

In [None]:
# RBF-kernel support vector classifier (C=1) with probability estimates enabled, fitted on the SMOTE training split.
svc_smote = SVC(C=1, kernel='rbf', probability=True, random_state=42)
svc_smote.fit(X_train_smote, y_train_smote)

In [None]:
# Shallow decision tree (gini, depth <= 4, at least 5 samples per leaf) fitted on the SMOTE training split.
tree_clf_smote = DecisionTreeClassifier(criterion='gini', max_depth=4,
                                         min_samples_leaf=5, random_state=42)
tree_clf_smote.fit(X_train_smote, y_train_smote)

In [None]:
# Out-of-fold scores for each classifier on the SMOTE training split.
# These arrays feed roc_curve / roc_auc_score, which need continuous scores.
# KNN and the decision tree have no decision_function, so the positive-class
# probability (column 1 of predict_proba) is used instead of hard 0/1 labels.
log_reg_pred_smote = cross_val_predict(log_reg_smote, X_train_smote, y_train_smote,
                                       cv=5, method="decision_function")
knears_pred_smote = cross_val_predict(knears_smote, X_train_smote, y_train_smote,
                                      cv=5, method="predict_proba")[:, 1]
svc_pred_smote = cross_val_predict(svc_smote, X_train_smote, y_train_smote,
                                   cv=5, method="decision_function")
tree_pred_smote = cross_val_predict(tree_clf_smote, X_train_smote, y_train_smote,
                                    cv=5, method="predict_proba")[:, 1]

In [None]:
# ROC AUC of each classifier, computed from its cross-validated predictions on the SMOTE training split.
smote_cv_outputs = [
    ('Logistic Regression', log_reg_pred_smote),
    ('K-Nearest Neighbors', knears_pred_smote),
    ('Support Vector Classifier', svc_pred_smote),
    ('Decision Tree', tree_pred_smote),
]
for clf_label, cv_output in smote_cv_outputs:
    print(f'{clf_label}: {roc_auc_score(y_train_smote, cv_output):.4f}')

In [None]:
# False/true positive rates for each classifier from its cross-validated predictions on the SMOTE training split; the thresholds are discarded.
log_fpr_smote, log_tpr_smote, _ = roc_curve(y_train_smote, log_reg_pred_smote)
knear_fpr_smote, knear_tpr_smote, _ = roc_curve(y_train_smote, knears_pred_smote)
svc_fpr_smote, svc_tpr_smote, _ = roc_curve(y_train_smote, svc_pred_smote)
tree_fpr_smote, tree_tpr_smote, _ = roc_curve(y_train_smote, tree_pred_smote)

In [None]:
# ROC curves of all four classifiers on one figure (SMOTE data),
# with each curve's AUC in the legend and the chance diagonal for reference.
smote_roc_curves = [
    ('Logistic Regression', log_fpr_smote, log_tpr_smote, log_reg_pred_smote),
    ('K-Nearest Neighbors', knear_fpr_smote, knear_tpr_smote, knears_pred_smote),
    ('Support Vector Classifier', svc_fpr_smote, svc_tpr_smote, svc_pred_smote),
    ('Decision Tree', tree_fpr_smote, tree_tpr_smote, tree_pred_smote),
]

plt.figure(figsize=(16, 8))
plt.title('ROC Curve - Svi klasifikatori (SMOTE)', fontsize=18)
for clf_label, fpr_values, tpr_values, cv_output in smote_roc_curves:
    auc_value = roc_auc_score(y_train_smote, cv_output)
    plt.plot(fpr_values, tpr_values, label=f'{clf_label} Score: {auc_value:.4f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.axis([-0.01, 1, 0, 1])
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.annotate('Minimalni ROC skor 50%\n(Nasumično pogađanje)',
             xy=(0.5, 0.5), xytext=(0.6, 0.3),
             arrowprops=dict(facecolor='#6E726D', shrink=0.05))
plt.legend(fontsize=12)
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# ROC curve of the logistic regression alone (SMOTE data).
log_reg_auc_smote = roc_auc_score(y_train_smote, log_reg_pred_smote)

fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title('Logistic Regression ROC Curve (SMOTE)', fontsize=16)
ax.plot(log_fpr_smote, log_tpr_smote, 'b-', linewidth=2,
        label=f'AUC = {log_reg_auc_smote:.4f}')
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlabel('False Positive Rate', fontsize=16)
ax.set_ylabel('True Positive Rate', fontsize=16)
ax.axis([-0.01, 1, 0, 1])
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Class predictions of the four fitted SMOTE models on the held-out test split.
y_pred_log_smote = log_reg_smote.predict(X_test_smote)
y_pred_knears_smote = knears_smote.predict(X_test_smote)
y_pred_svc_smote = svc_smote.predict(X_test_smote)
y_pred_tree_smote = tree_clf_smote.predict(X_test_smote)

In [None]:
# Test-set metrics for every model; names and predictions are paired by position.
models = ['Logistic Regression', 'K-Nearest Neighbors', 'SVC', 'Decision Tree']
predictions = [y_pred_log_smote, y_pred_knears_smote, y_pred_svc_smote, y_pred_tree_smote]

# Line prefix (including alignment padding) and the metric it reports.
metric_lines = [
    ("  Accuracy:  ", accuracy_score),
    ("  Precision: ", precision_score),
    ("  Recall:    ", recall_score),
    ("  F1 Score:  ", f1_score),
]

for model_name, y_pred in zip(models, predictions):
    print(f"\n{model_name}:")
    for line_prefix, metric_fn in metric_lines:
        print(f"{line_prefix}{metric_fn(y_test_smote, y_pred):.4f}")

# Average precision score for the best model (here: the logistic regression).
smote_y_score = log_reg_smote.decision_function(X_test_smote)
smote_avg_precision = average_precision_score(y_test_smote, smote_y_score)
print(f"\nAverage Precision-Recall Score (Logistic Regression): {smote_avg_precision:.2f}")