In [64]:
#importing the nessary datas
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import IncrementalPCA
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm

In [65]:
#opening train data
train_data = pd.read_csv("/content/fraudTest.csv")


In [66]:
#opening test data
test_data=pd.read_csv("/content/fraudTest.csv")

In [67]:
combined_data = pd.concat([train_data, test_data], axis=0)


In [68]:
def extract_datetime_features(df):
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
    df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek
    df['hour_of_day'] = df['trans_date_trans_time'].dt.hour
    df.drop('trans_date_trans_time', axis=1, inplace=True)
    return df



In [69]:
combined_data = extract_datetime_features(combined_data)

In [70]:

# Drop irrelevant columns (you can customize this based on your data)
columns_to_drop = ["first", "last", "job", "dob", "trans_num", "street"]
combined_data.drop(columns_to_drop, axis=1, inplace=True)


In [71]:
X_combined = combined_data.drop("is_fraud", axis=1)
y_combined = combined_data["is_fraud"]


In [72]:
X_combined = combined_data.drop("is_fraud", axis=1)
y_combined = combined_data["is_fraud"]

In [73]:
# Encode the "merchant" and "category" columns using LabelEncoder
label_encoder = LabelEncoder()
X_combined["merchant"] = label_encoder.fit_transform(X_combined["merchant"])
X_combined["category"] = label_encoder.fit_transform(X_combined["category"])



In [74]:
categorical_columns = ["gender", "city", "state"]
onehot_encoder = OneHotEncoder(sparse=False, drop="first", handle_unknown='ignore')
X_combined_categorical = onehot_encoder.fit_transform(X_combined[categorical_columns])



In [75]:
scaler = StandardScaler()
X_combined_numeric = scaler.fit_transform(X_combined.drop(categorical_columns, axis=1))

In [76]:
X_combined_encoded = np.hstack((X_combined_numeric, X_combined_categorical))


In [77]:
X_train = X_combined_encoded[:len(train_data)]
X_test = X_combined_encoded[len(train_data):]
y_train = y_combined[:len(train_data)]
y_test = y_combined[len(train_data):]


In [78]:
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)



In [79]:
n_components = 100  # Adjust the number of components as needed
ipca = IncrementalPCA(n_components=n_components)

**Incremental PCA**

In [80]:
# Apply Incremental PCA to training data with progress bar
for batch in tqdm(np.array_split(X_resampled, 10), desc="Applying Incremental PCA"):
    ipca.partial_fit(batch)

Applying Incremental PCA: 100%|██████████| 10/10 [00:08<00:00,  1.22it/s]


In [81]:
# Transform the training and testing data
X_resampled_pca = ipca.transform(X_resampled)
X_test_pca = ipca.transform(X_test)

**Logistic Reggression**

In [82]:
# Logistic Regression
logreg = LogisticRegression(random_state=42)
logreg.fit(X_resampled_pca, y_resampled)
y_pred_logreg = logreg.predict(X_test_pca)

In [83]:
# Decision Tree
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_resampled_pca, y_resampled)
y_pred_dt = dt_classifier.predict(X_test_pca)


**Xgboost**

In [84]:
param_grid_xgb = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
}


In [85]:
try:
    grid_search_xgb.fit(X_resampled_pca, y_resampled)
except KeyboardInterrupt:
    pass

In [86]:
xgb_classifier = XGBClassifier(random_state=42)
grid_search_xgb = GridSearchCV(xgb_classifier, param_grid_xgb, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_xgb.fit(X_resampled_pca, y_resampled)


In [87]:
best_xgb = grid_search_xgb.best_estimator_
best_xgb.fit(X_resampled_pca, y_resampled)

In [88]:
# Predict using the best XGBoost model
y_pred_xgb = best_xgb.predict(X_test_pca)


In [89]:
# Evaluate Logistic Regression
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
confusion_logreg = confusion_matrix(y_test, y_pred_logreg)
report_logreg = classification_report(y_test, y_pred_logreg)



In [90]:
print("Logistic Regression Results:")
print(f"Accuracy: {accuracy_logreg}")
print(f"Confusion Matrix:\n{confusion_logreg}")
print(f"Classification Report:\n{report_logreg}")

Logistic Regression Results:
Accuracy: 0.9984003199360127
Confusion Matrix:
[[4975    8]
 [   0   18]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4983
           1       0.69      1.00      0.82        18

    accuracy                           1.00      5001
   macro avg       0.85      1.00      0.91      5001
weighted avg       1.00      1.00      1.00      5001



In [91]:
accuracy_dt = accuracy_score(y_test, y_pred_dt)
confusion_dt = confusion_matrix(y_test, y_pred_dt)
report_dt = classification_report(y_test, y_pred_dt)


In [92]:
print("\nDecision Tree Results:")
print(f"Accuracy: {accuracy_dt}")
print(f"Confusion Matrix:\n{confusion_dt}")
print(f"Classification Report:\n{report_dt}")


Decision Tree Results:
Accuracy: 1.0
Confusion Matrix:
[[4983    0]
 [   0   18]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4983
           1       1.00      1.00      1.00        18

    accuracy                           1.00      5001
   macro avg       1.00      1.00      1.00      5001
weighted avg       1.00      1.00      1.00      5001



In [93]:
# Evaluate XGBoost
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
confusion_xgb = confusion_matrix(y_test, y_pred_xgb)
report_xgb = classification_report(y_test, y_pred_xgb)

In [94]:
print("\nXGBoost Results:")
print(f"Accuracy: {accuracy_xgb}")
print(f"Confusion Matrix:\n{confusion_xgb}")
print(f"Classification Report:\n{report_xgb}")


XGBoost Results:
Accuracy: 0.9996000799840032
Confusion Matrix:
[[4981    2]
 [   0   18]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4983
           1       0.90      1.00      0.95        18

    accuracy                           1.00      5001
   macro avg       0.95      1.00      0.97      5001
weighted avg       1.00      1.00      1.00      5001

