In [9]:
import pandas as pd
import gdown
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Download training data from Google Drive
train_url = 'https://drive.google.com/uc?id=1B3kwg8JWxZFkip2aEILTE49yqFGW0LNj'
gdown.download(train_url, 'fraudTrain.csv', quiet=False)

# Load the training data
train_data = pd.read_csv('fraudTrain.csv')


Downloading...
From (original): https://drive.google.com/uc?id=1B3kwg8JWxZFkip2aEILTE49yqFGW0LNj
From (redirected): https://drive.google.com/uc?id=1B3kwg8JWxZFkip2aEILTE49yqFGW0LNj&confirm=t&uuid=9ae3831c-4987-417f-b02a-ea886421b5a8
To: C:\Users\shree\fraudTrain.csv
100%|███████████████████████████████████████████████████████████████████████████████| 351M/351M [01:02<00:00, 5.62MB/s]


In [12]:
# Drop irrelevant columns if they exist
columns_to_drop = ['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'first', 'last', 'gender', 'street', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long']
train_data = train_data.drop(columns=[col for col in columns_to_drop if col in train_data.columns])

# Split data into features and target
X_train = train_data.drop('is_fraud', axis=1)
y_train = train_data['is_fraud']
# Define categorical columns for one-hot encoding
categorical_cols = ['city', 'state']
# Perform one-hot encoding
ct = ColumnTransformer([('one_hot_encoder', OneHotEncoder(), categorical_cols)], remainder='passthrough')
X_train_encoded = ct.fit_transform(X_train)

# Initialize and train the logistic regression model
model = LogisticRegression()
model.fit(X_train_encoded, y_train)

In [13]:
# Download testing data from Google Drive
test_url = 'https://drive.google.com/uc?id=1rQ6z8dEqS9x1THwwdOsXxvhMpax43lEQ'
gdown.download(test_url, 'fraudTest.csv', quiet=False)

# Load the testing data
test_data = pd.read_csv('fraudTest.csv')

# Drop irrelevant columns from test data if they exist
test_data = test_data.drop(columns=[col for col in columns_to_drop if col in test_data.columns])


Downloading...
From (original): https://drive.google.com/uc?id=1rQ6z8dEqS9x1THwwdOsXxvhMpax43lEQ
From (redirected): https://drive.google.com/uc?id=1rQ6z8dEqS9x1THwwdOsXxvhMpax43lEQ&confirm=t&uuid=a1dc1585-167a-478f-a5de-e6bbb4ccf792
To: C:\Users\shree\fraudTest.csv
100%|███████████████████████████████████████████████████████████████████████████████| 150M/150M [00:38<00:00, 3.91MB/s]


In [17]:
# Combine training and testing data
combined_data = pd.concat([train_data, test_data], ignore_index=True)

# Drop irrelevant columns if they exist
columns_to_drop = ['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'first', 'last', 'gender', 'street', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long']
combined_data = combined_data.drop(columns=[col for col in columns_to_drop if col in combined_data.columns])

# Split data into features and target
X = combined_data.drop('is_fraud', axis=1)
y = combined_data['is_fraud']

# Define categorical columns for one-hot encoding
categorical_cols = ['city', 'state']

# Perform one-hot encoding
ct = ColumnTransformer([('one_hot_encoder', OneHotEncoder(), categorical_cols)], remainder='passthrough')
X_encoded = ct.fit_transform(X)


In [18]:
# Split data back into training and testing sets
X_train_encoded = X_encoded[:len(train_data)]
X_test_encoded = X_encoded[len(train_data):]

# Initialize and train the logistic regression model
model = LogisticRegression()
model.fit(X_train_encoded, y[:len(train_data)])

# Make predictions on the test data
y_pred = model.predict(X_test_encoded)

# Evaluate the model
accuracy = accuracy_score(y[len(train_data):], y_pred)
report = classification_report(y[len(train_data):], y_pred)

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')


Accuracy: 0.9944900210358113
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.02      0.01      0.01      2145

    accuracy                           0.99    555719
   macro avg       0.51      0.50      0.51    555719
weighted avg       0.99      0.99      0.99    555719

