In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [17]:
train_data = pd.read_csv(r"C:\Users\sssuj\Downloads\fraudTrain.csv\fraudTrain.csv")
test_data = pd.read_csv(r"C:\Users\sssuj\Downloads\fraudTest.csv\fraudTest.csv")

In [18]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [19]:
columns_to_drop = ['Unnamed: 0', 'cc_num', 'first', 'last', 'street', 'city', 'state', 'trans_num', 'trans_date_trans_time']
train_data = train_data.drop(columns=columns_to_drop)
test_data = test_data.drop(columns=columns_to_drop)

In [20]:
train_data['trans_year'] = pd.to_datetime(train_data['unix_time'], unit='s').dt.year
train_data['trans_month'] = pd.to_datetime(train_data['unix_time'], unit='s').dt.month
train_data['trans_day'] = pd.to_datetime(train_data['unix_time'], unit='s').dt.day
train_data['trans_hour'] = pd.to_datetime(train_data['unix_time'], unit='s').dt.hour

In [21]:
test_data['trans_year'] = pd.to_datetime(test_data['unix_time'], unit='s').dt.year
test_data['trans_month'] = pd.to_datetime(test_data['unix_time'], unit='s').dt.month
test_data['trans_day'] = pd.to_datetime(test_data['unix_time'], unit='s').dt.day
test_data['trans_hour'] = pd.to_datetime(test_data['unix_time'], unit='s').dt.hour

In [22]:
train_data = train_data.drop(columns=['unix_time'])
test_data = test_data.drop(columns=['unix_time'])

In [23]:
X_train = train_data.drop(columns='is_fraud')
y_train = train_data['is_fraud']
X_test = test_data.drop(columns='is_fraud')
y_test = test_data['is_fraud']

In [24]:
categorical_cols = ['merchant', 'category', 'gender', 'job']
numerical_cols = ['amt', 'lat', 'long', 'city_pop', 'merch_lat', 'merch_long', 'trans_year', 'trans_month', 'trans_day', 'trans_hour']

In [25]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

In [26]:
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [27]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [28]:
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_preprocessed, y_train)
y_pred_logreg = logreg.predict(X_test_preprocessed)

In [29]:
tree = DecisionTreeClassifier()
tree.fit(X_train_preprocessed, y_train)
y_pred_tree = tree.predict(X_test_preprocessed)

In [30]:
print("Logistic Regression Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_logreg)}")
print(classification_report(y_test, y_pred_logreg))

Logistic Regression Results:
Accuracy: 0.9957154605115175
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.03      0.00      0.01      2145

    accuracy                           1.00    555719
   macro avg       0.51      0.50      0.50    555719
weighted avg       0.99      1.00      0.99    555719



In [31]:
print("\nDecision Tree Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_tree)}")
print(classification_report(y_test, y_pred_tree))


Decision Tree Results:
Accuracy: 0.9978136432261628
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.72      0.71      0.71      2145

    accuracy                           1.00    555719
   macro avg       0.86      0.85      0.86    555719
weighted avg       1.00      1.00      1.00    555719

