In [None]:
# Make Google Drive available inside the Colab runtime; files then appear
# under the /content/drive mount point.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Location of the transaction-log CSV inside the mounted Drive
file_path = '/content/drive/MyDrive/clasification/PS_20174392719_1491204439457_log.csv'

# Read the complete CSV into an in-memory DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [None]:
# Class balance of the target: number of rows per isFraud value
df.isFraud.value_counts()

0    6354407
1       8213
Name: isFraud, dtype: int64

In [None]:
# Replace the transaction-type strings in df['type'] with integer codes.
# The fitted encoder is kept in `label` so the codes can be mapped back to the
# original strings with label.inverse_transform / label.classes_.
# NOTE(review): integer codes give the categories a numeric order they do not
# naturally have; one-hot encoding may suit a linear model better - confirm
# before relying on the 'type' coefficient.
label = LabelEncoder()
df['type'] = label.fit_transform(df['type'])
print("After Encoding",df['type'].unique())

After Encoding [3 4 1 2 0]


In [None]:
# Feature columns that are standardised (zero mean, unit variance)
columns_to_scale = [
    'step',
    'type',
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
]

# NOTE(review): the scaler statistics are computed from every row of df, i.e.
# before any train/test/CV split is made, so rows that are later held out
# contribute to the mean and variance used here.
scaler = StandardScaler()
scaler.fit(df[columns_to_scale])
df[columns_to_scale] = scaler.transform(df[columns_to_scale])

# Preview the scaled frame
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,-1.703042,0.952399,-0.28156,C1231006815,-0.22981,-0.237622,M1979787155,-0.323814,-0.333411,0,0
1,-1.703042,0.952399,-0.294767,C1666544295,-0.281359,-0.285812,M2044282225,-0.323814,-0.333411,0,0
2,-1.703042,1.693076,-0.297555,C1305486145,-0.288654,-0.292442,C553264065,-0.323814,-0.333411,1,0
3,-1.703042,-0.528954,-0.297555,C840083671,-0.288654,-0.292442,C38997010,-0.317582,-0.333411,1,0
4,-1.703042,0.952399,-0.278532,C2048537720,-0.274329,-0.282221,M1230701703,-0.323814,-0.333411,0,0


In [None]:
# Step 3: Separate the model inputs (X) from the target label (y)
feature_columns = [
    'step',
    'type',
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
]
X = df[feature_columns]
y = df['isFraud']

In [None]:
# Step 4: Split the data into train, test, and CV sets.
# 20% of all rows go to the test set; 25% of the remaining 80% (i.e. 20% of
# all rows) go to the CV set, leaving 60% for training.
# Fraud rows are very rare (8,213 of roughly 6.36 million), so both splits are
# stratified on the label to give every subset the same fraud rate instead of
# leaving the number of positives per subset to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train, X_cv, y_train, y_cv = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42, stratify=y_train
)

In [None]:
# Step 5: Apply oversampling and undersampling together.
# Both samplers draw random numbers, so they are seeded to make the resampled
# training set (and every metric computed from it) reproducible across runs.
# NOTE(review): with the default sampling_strategy SMOTE already raises the
# minority class to the majority count, which leaves the undersampler nothing
# to remove; pass explicit sampling_strategy ratios to both samplers if a real
# over/under combination is intended.
oversampler = SMOTE(random_state=42)
undersampler = RandomUnderSampler(random_state=42)

In [None]:
# Rebalance the training split only: run the oversampler first, then feed its
# output to the undersampler. The test and CV splits are left untouched.
X_train_resampled, y_train_resampled = X_train, y_train
for sampler in (oversampler, undersampler):
    X_train_resampled, y_train_resampled = sampler.fit_resample(
        X_train_resampled, y_train_resampled
    )

In [None]:
# Report the size and class balance of the resampled training data.
print('After OverSampling, the shape of train_X: {}'.format(X_train_resampled.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_resampled.shape))

# Count with the vectorised .sum() on the boolean mask; the builtin sum()
# would iterate over millions of elements one at a time in Python.
print("After OverSampling, counts of label '1': {}".format((y_train_resampled == 1).sum()))
print("After OverSampling, counts of label '0': {}".format((y_train_resampled == 0).sum()))

After OverSampling, the shape of train_X: (7625190, 7)
After OverSampling, the shape of train_y: (7625190,) 

After OverSampling, counts of label '1': 3812595
After OverSampling, counts of label '0': 3812595


In [None]:

# Steps 6-13: scale the features, fit a logistic regression on the resampled
# training data, and report metrics on the train / test / CV splits.
#
# The resampled training data built by the earlier resampling cell is reused
# rather than running SMOTE + undersampling a second time. The scaled matrices
# get their own names (X_*_scaled), so X_train_resampled, X_test and X_cv are
# never overwritten and re-running this cell does not apply the scaler to the
# hold-out sets a second time.


def _classification_scores(y_true, y_pred):
    """Return (accuracy, precision, recall, f1) for a set of binary predictions."""
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
    )


def _print_scores(title, scores):
    """Print a title line followed by the four metrics, one per line."""
    print(title)
    for metric_name, value in zip(('Accuracy:', 'Precision:', 'Recall:', 'F1-Score:'), scores):
        print(metric_name, value)


# Step 6: Standardise the features. The scaler is fitted on the resampled
# training data only and then applied unchanged to the test and CV sets.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)
X_cv_scaled = scaler.transform(X_cv)

# Step 7: Train the model
model = LogisticRegression()
model.fit(X_train_scaled, y_train_resampled)

# Step 8: Predict and evaluate on the training set.
# These scores are measured on the class-balanced resampled data, so they are
# not directly comparable with the test/CV scores below, which are measured on
# splits that keep the original class balance.
y_train_pred = model.predict(X_train_scaled)
train_accuracy, train_precision, train_recall, train_f1 = _classification_scores(
    y_train_resampled, y_train_pred
)

# Step 9: Predict and evaluate on the test set
y_test_pred = model.predict(X_test_scaled)
test_accuracy, test_precision, test_recall, test_f1 = _classification_scores(
    y_test, y_test_pred
)

# Step 10: Predict and evaluate on the CV set
y_cv_pred = model.predict(X_cv_scaled)
cv_accuracy, cv_precision, cv_recall, cv_f1 = _classification_scores(
    y_cv, y_cv_pred
)

# Step 11: Compare the metrics
_print_scores('Training Set Metrics:', (train_accuracy, train_precision, train_recall, train_f1))
print()

_print_scores('Test Set Metrics:', (test_accuracy, test_precision, test_recall, test_f1))
print()

_print_scores('CV Set Metrics:', (cv_accuracy, cv_precision, cv_recall, cv_f1))


Training Set Metrics:
Accuracy: 0.9330302589181384
Precision: 0.9658043528555387
Recall: 0.8978501519306404
F1-Score: 0.930588349632014

Test Set Metrics:
Accuracy: 0.9683738774278521
Precision: 0.03478428444101848
Recall: 0.891358024691358
F1-Score: 0.06695569517538776

CV Set Metrics:
Accuracy: 0.9679951026463941
Precision: 0.03440394276326754
Recall: 0.8941831683168316
F1-Score: 0.06625856890661898


"\n# Step 7: Train the model\nmodel = LogisticRegression()\nmodel.fit(X_train_resampled, y_train_resampled)\n\n# Step 8: Make predictions on the test set\ny_pred = model.predict(X_test)\n\n# Step 9: Evaluate the model\naccuracy = accuracy_score(y_test, y_pred)\nprecision = precision_score(y_test, y_pred)\nrecall = recall_score(y_test, y_pred)\nf1 = f1_score(y_test, y_pred)\n\nprint('Accuracy:', accuracy)\nprint('Precision:', precision)\nprint('Recall:', recall)\nprint('F1-Score:', f1)\n\n# Step 10: Evaluate the model using cross-validation\ncv_scores = cross_val_score(model, X_cv, y_cv, cv=5, scoring='accuracy')\nmean_cv_accuracy = cv_scores.mean()\n\nprint('CV Accuracy:', mean_cv_accuracy)\n"