# 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE  

In [None]:
# Locations of the two CSV files read by this notebook.
train_csv_path = '/kaggle/input/fraud-detection/fraudTrain.csv'
test_csv_path = '/kaggle/input/fraud-detection/fraudTest.csv'

# Load each file into its own DataFrame; later cells read 'is_fraud' from both.
train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

In [None]:
# Preprocessing: drop unused columns, split features/target, derive calendar
# features from the transaction timestamp, label-encode object columns and
# standardise the numeric ones.

# Columns removed from both datasets before modelling.
columns_to_drop = ['Unnamed: 0', 'trans_num', 'unix_time', 'first', 'last', 'long', 'lat']
for raw_frame in (train_data, test_data):
    # errors='ignore' skips any listed column that is missing from the frame.
    raw_frame.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Target is the 'is_fraud' column; every remaining column is a feature.
y_train = train_data['is_fraud']
X_train = train_data.drop(columns='is_fraud')
y_test = test_data['is_fraud']
X_test = test_data.drop(columns='is_fraud')

# Replace the raw timestamp (when present) with numeric calendar features.
timestamp_col = 'trans_date_trans_time'
if timestamp_col in X_train.columns:
    # Parse the timestamp in both frames before deriving anything from it.
    for frame in (X_train, X_test):
        frame[timestamp_col] = pd.to_datetime(frame[timestamp_col])

    # New feature name -> attribute of the pandas `.dt` accessor that supplies it.
    datetime_parts = {
        'hour': 'hour',
        'day': 'day',
        'month': 'month',
        'day_of_week': 'dayofweek',
    }
    for frame in (X_train, X_test):
        timestamps = frame[timestamp_col]
        for feature_name, accessor_attr in datetime_parts.items():
            frame[feature_name] = getattr(timestamps.dt, accessor_attr)

    # The parsed timestamp itself is not kept as a feature.
    for frame in (X_train, X_test):
        frame.drop(columns=timestamp_col, inplace=True, errors='ignore')

# Column groups are derived from dtypes once the timestamp handling is done,
# so the derived calendar features are counted among the numeric columns.
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()

# Integer-encode each object column. The single encoder is refit per column on
# the train and test values together so that transforming the test set never
# meets a label the encoder has not seen.
label_encoder = LabelEncoder()
for column_name in categorical_cols:
    label_encoder.fit(pd.concat([X_train[column_name], X_test[column_name]]))
    X_train[column_name] = label_encoder.transform(X_train[column_name])
    X_test[column_name] = label_encoder.transform(X_test[column_name])

# Standardise the columns that were numeric before encoding. The scaler is
# fitted on the training data only and then applied unchanged to the test data.
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Keep only the columns that both frames share.
X_train, X_test = X_train.align(X_test, join='inner', axis=1)

# Show the resulting feature columns for a visual check.
print("Final set of columns in X_train and X_test:", X_train.columns, sep="\n")

In [None]:
# Train a Random Forest on the preprocessed training features and evaluate it
# on the test set (confusion matrix plot, accuracy, per-class report).
model = RandomForestClassifier(
    n_estimators=50,       # Fewer trees than scikit-learn's default of 100
    n_jobs=-1,             # Use all available CPU cores
    random_state=42,       # Fixed seed so repeated runs build the same forest
)

# Fit on the training data, then predict a label for every test row.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Confusion matrix: rows are the actual labels, columns the predicted labels,
# which is why the heatmap's y-axis is 'Actual' and its x-axis 'Predicted'.
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Accuracy is computed once and the stored value is reused for printing
# (shown as a percentage).
# NOTE(review): if the fraud class is rare, accuracy alone can look high even
# when few frauds are caught; read it together with the report below.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy Score:', accuracy * 100)

# Per-class precision, recall and F1.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


In [None]:
from xgboost import XGBClassifier

# Train an XGBoost classifier on the same features and evaluate it on the test
# set (confusion matrix plot, per-class report, accuracy).
XGB = XGBClassifier(n_estimators=300, max_depth=30, learning_rate=0.005)
XGB.fit(X_train, y_train)
XGB_preds = XGB.predict(X_test)

# Confusion matrix: rows are the actual labels, columns the predicted labels,
# which is why the heatmap's y-axis is 'Actual' and its x-axis 'Predicted'.
conf_matrix = confusion_matrix(y_test, XGB_preds)
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlGnBu')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Accuracy is computed once and the stored value is reused for printing
# (shown as a percentage) after the per-class report.
accuracy = accuracy_score(y_test, XGB_preds)

print('Classification Report:\n', classification_report(y_test, XGB_preds))
print('Accuracy Score:', accuracy * 100)