In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report



In [26]:
# Load the datasets
train_data = pd.read_csv('/content/drive/MyDrive/Codeway datasets/Task 2/fraudTrain.csv')
test_data = pd.read_csv('/content/drive/MyDrive/Codeway datasets/Task 2/fraudTest.csv')



In [27]:
# Combine train and test data for preprocessing
combined_data = pd.concat([train_data, test_data], axis=0)

# Select relevant columns
relevant_columns = ['category', 'amt', 'gender', 'city_pop', 'job', 'is_fraud']
combined_data = combined_data[relevant_columns]

In [28]:
# Data preprocessing

# Handle missing values
combined_data.fillna(0, inplace=True)



In [29]:
# Encoding categorical variables
label_encoder = LabelEncoder()
combined_data['category'] = label_encoder.fit_transform(combined_data['category'])
combined_data['gender'] = label_encoder.fit_transform(combined_data['gender'])
combined_data['job'] = label_encoder.fit_transform(combined_data['job'])


In [31]:
# Normalize numerical features
scaler = StandardScaler()
numerical_features = ['amt', 'city_pop']
combined_data[numerical_features] = scaler.fit_transform(combined_data[numerical_features])




In [32]:
# Split the combined data back into train and test
train_data = combined_data[:len(train_data)]
test_data = combined_data[len(train_data):]



In [33]:
# Define features and target variable
X_train = train_data.drop(['is_fraud'], axis=1)
y_train = train_data['is_fraud']
X_test = test_data.drop(['is_fraud'], axis=1)
y_test = test_data['is_fraud']



In [34]:
# Model training and evaluation

# Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)
print("Logistic Regression Accuracy:", lr_accuracy)
print("Logistic Regression Classification Report:")
print(classification_report(y_test, lr_pred))



Logistic Regression Accuracy: 0.9955049224518147
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.00      0.00      0.00      2145

    accuracy                           1.00    555719
   macro avg       0.50      0.50      0.50    555719
weighted avg       0.99      1.00      0.99    555719



In [35]:
# Decision Tree
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_pred)
print("\nDecision Tree Accuracy:", dt_accuracy)
print("Decision Tree Classification Report:")
print(classification_report(y_test, dt_pred))




Decision Tree Accuracy: 0.9963380773376472
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.52      0.58      0.55      2145

    accuracy                           1.00    555719
   macro avg       0.76      0.79      0.77    555719
weighted avg       1.00      1.00      1.00    555719



In [36]:
# Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)
print("\nRandom Forest Accuracy:", rf_accuracy)
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_pred))



Random Forest Accuracy: 0.9974213586362892
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.71      0.56      0.63      2145

    accuracy                           1.00    555719
   macro avg       0.85      0.78      0.81    555719
weighted avg       1.00      1.00      1.00    555719

