In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [5]:
# Load your data from updated path
train_df = pd.read_csv(r'D:\Priyanka\CREDIT CARD FRAUD DETECTION\archive\fraudTrain.csv')
test_df = pd.read_csv(r'D:\Priyanka\CREDIT CARD FRAUD DETECTION\archive\fraudTest.csv')

# Combine both datasets
df = pd.concat([train_df, test_df], ignore_index=True)
df.head()


Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [6]:
# Drop useless columns including 'Unnamed: 0'
df.drop(columns=['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'first', 'last', 'street', 'job', 'dob', 'trans_num'], inplace=True)

# Create separate LabelEncoders
merchant_le = LabelEncoder()
category_le = LabelEncoder()
gender_le = LabelEncoder()
city_le = LabelEncoder()
state_le = LabelEncoder()

# Fit and transform on full data
df['merchant'] = merchant_le.fit_transform(df['merchant'])
df['category'] = category_le.fit_transform(df['category'])
df['gender'] = gender_le.fit_transform(df['gender'])
df['city'] = city_le.fit_transform(df['city'])
df['state'] = state_le.fit_transform(df['state'])


In [7]:
X = df.drop('is_fraud', axis=1)
y = df['is_fraud']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)


In [8]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9947905279381557

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00    368549
           1       0.00      0.00      0.00      1930

    accuracy                           0.99    370479
   macro avg       0.50      0.50      0.50    370479
weighted avg       0.99      0.99      0.99    370479


Confusion Matrix:
 [[368549      0]
 [  1930      0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [9]:
print(X.columns)
print(len(X.columns))


Index(['merchant', 'category', 'amt', 'gender', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long'],
      dtype='object')
13


In [10]:
print("\n--- USER INPUT for Fraud Prediction ---")

# User Inputs
merchant_input = input("Enter merchant name: ")
category_input = input("Enter category: ")
amt = float(input("Enter transaction amount: "))
gender_input = input("Enter gender (M/F): ")
city_input = input("Enter city: ")
state_input = input("Enter state: ")
zip_code = int(input("Enter zip code: "))
lat = float(input("Enter latitude: "))
long = float(input("Enter longitude: "))
city_pop = int(input("Enter city population: "))
unix_time = int(input("Enter unix time: "))
merch_lat = float(input("Enter merchant latitude: "))
merch_long = float(input("Enter merchant longitude: "))

#  Safe transform function for unknown values
def safe_transform(le, val, default_code):
    if val in le.classes_:
        return le.transform([val])[0]
    else:
        return default_code

# Encode categorical inputs safely
merchant = safe_transform(merchant_le, merchant_input, 9999)
category = safe_transform(category_le, category_input, 999)
gender = safe_transform(gender_le, gender_input, 99)
city = safe_transform(city_le, city_input, 9999)
state = safe_transform(state_le, state_input, 999)

#  Construct DataFrame for prediction
input_data = pd.DataFrame([[merchant, category, amt, gender, city, state,
                            zip_code, lat, long, city_pop, unix_time,
                            merch_lat, merch_long]],
                          columns=[
                              'merchant', 'category', 'amt', 'gender', 'city',
                              'state', 'zip', 'lat', 'long', 'city_pop',
                              'unix_time', 'merch_lat', 'merch_long'
                          ])

# Predict fraud
prediction = model.predict(input_data)[0]
print("\n Transaction is", " FRAUD" if prediction == 1 else " LEGITIMATE")


--- USER INPUT for Fraud Prediction ---

 Transaction is  LEGITIMATE
