In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline

# Load the data
train_data = pd.read_csv('/content/fraudTrain.csv')
test_data = pd.read_csv('/content/fraudTest.csv')

# Preprocessing
# Columns removed before modelling. The list is defined once so the training
# and test frames always receive exactly the same treatment.
# NOTE(review): if the CSVs carry a saved row-index column (e.g. 'Unnamed: 0'),
# it is still picked up as a numeric feature below -- confirm and drop it if so.
DROPPED_COLUMNS = [
    'trans_date_trans_time', 'merchant', 'cc_num', 'first', 'last', 'street',
    'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long',
]
train_data = train_data.drop(columns=DROPPED_COLUMNS)
test_data = test_data.drop(columns=DROPPED_COLUMNS)

# Handle missing values: rows without a target can be neither fitted nor
# scored, so they are removed from both frames.
train_data = train_data.dropna(subset=['is_fraud'])
test_data = test_data.dropna(subset=['is_fraud'])

# Split features and target
X = train_data.drop(columns=['is_fraud'])
y = train_data['is_fraud']

# Column groups: object columns are one-hot encoded, int64/float64 columns
# are standardised.
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()


def build_preprocessor():
    """Return a new, unfitted ColumnTransformer for the feature columns.

    Numeric columns are standardised and categorical columns are one-hot
    encoded; categories unseen during fit are ignored at transform time.
    A fresh instance is built on every call because Pipeline.fit fits its
    steps in place, so pipelines sharing one transformer object would also
    share (and overwrite) its fitted state.
    """
    return ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ]
    )


# Split the data into training and validation sets
# NOTE(review): the split is not stratified on the target; consider passing
# stratify=y if the classes are heavily imbalanced (this would change the
# reported numbers).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. Logistic Regression Model Pipeline
log_reg_pipeline = Pipeline(steps=[('preprocessor', build_preprocessor()),
                                   ('classifier', LogisticRegression(max_iter=1000))])
log_reg_pipeline.fit(X_train, y_train)
y_pred_log_reg = log_reg_pipeline.predict(X_val)

# 2. Decision Tree Model Pipeline
decision_tree_pipeline = Pipeline(steps=[('preprocessor', build_preprocessor()),
                                         ('classifier', DecisionTreeClassifier(random_state=42))])
decision_tree_pipeline.fit(X_train, y_train)
y_pred_tree = decision_tree_pipeline.predict(X_val)

# 3. Random Forest Model Pipeline
random_forest_pipeline = Pipeline(steps=[('preprocessor', build_preprocessor()),
                                         ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))])
random_forest_pipeline.fit(X_train, y_train)
y_pred_forest = random_forest_pipeline.predict(X_val)

# Evaluation function
def evaluate_model(y_true, y_pred):
    """Score predictions against the true labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by a model, aligned with ``y_true``.

    Returns:
        A 4-tuple ``(accuracy, precision, recall, f1)`` computed with the
        corresponding scikit-learn metric functions at their default
        settings.
    """
    # Order matters: callers print the tuple positionally.
    metric_functions = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(score(y_true, y_pred) for score in metric_functions)

# Report validation metrics for every fitted model; each printed tuple is
# (accuracy, precision, recall, f1) as returned by evaluate_model.
validation_predictions = (
    ("Logistic Regression:", y_pred_log_reg),
    ("Decision Tree:", y_pred_tree),
    ("Random Forest:", y_pred_forest),
)
for label, predictions in validation_predictions:
    print(label, evaluate_model(y_val, predictions))

# Held-out test set: separate the target column from the features. The
# fitted pipeline applies its own preprocessing at predict time.
y_test = test_data['is_fraud']
X_test = test_data.drop(columns=['is_fraud'])

# Score the test set with the Random Forest pipeline, the model chosen as
# best in this notebook.
y_pred_test_forest = random_forest_pipeline.predict(X_test)
test_metrics = evaluate_model(y_test, y_pred_test_forest)
print("Test Set Random Forest Performance:", test_metrics)


Logistic Regression: (0.9928194011651538, 0.7659574468085106, 0.2748091603053435, 0.4044943820224719)
Decision Tree: (0.9956984148489365, 0.7566539923954373, 0.7595419847328244, 0.758095238095238)
Random Forest: (0.9971887278146593, 0.9735449735449735, 0.7022900763358778, 0.8159645232815964)
Test Set Random Forest Performance: (0.9966295915741589, 0.66, 0.26153846153846155, 0.3746243739565943)
