In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.preprocessing import OneHotEncoder


In [2]:
# Step 1: Load the dataset
train_data = pd.read_csv("fraudTrain.csv")
test_data = pd.read_csv("fraudTest.csv")

In [6]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [9]:
# Handle missing values (if any) - Replace missing values with the mean for numerical columns
numerical_columns = ['amt', 'lat', 'long', 'city_pop']
train_data[numerical_columns] = train_data[numerical_columns].fillna(train_data[numerical_columns].mean())
test_data[numerical_columns] = test_data[numerical_columns].fillna(train_data[numerical_columns].mean())  # Fill missing values with the training data mean


In [8]:
# Encode categorical variables (one-hot encoding)
categorical_columns = ['merchant', 'category', 'gender']
train_data = pd.get_dummies(train_data, columns=categorical_columns, drop_first=True)
test_data = pd.get_dummies(test_data, columns=categorical_columns, drop_first=True)


In [10]:
# Normalize numerical features (using StandardScaler)
scaler = StandardScaler()
numerical_columns = ['amt', 'lat', 'long', 'city_pop']
train_data[numerical_columns] = scaler.fit_transform(train_data[numerical_columns])
test_data[numerical_columns] = scaler.transform(test_data[numerical_columns])


In [12]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,amt,first,last,street,city,state,zip,...,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_M
0,0,2019-01-01 00:00:18,2703186189652095,-0.407826,Jennifer,Banks,561 Perry Cove,Moravian Falls,NC,28654,...,0,0,0,1,0,0,0,0,0,0
1,1,2019-01-01 00:00:44,630423337322,0.230039,Stephanie,Gill,43039 Riley Greens Suite 393,Orient,WA,99160,...,0,0,0,0,0,0,0,0,0,0
2,2,2019-01-01 00:00:51,38859492057661,0.934149,Edward,Sanchez,594 White Dale Suite 530,Malad City,ID,83252,...,0,0,0,0,0,0,0,0,0,1
3,3,2019-01-01 00:01:16,3534093764340240,-0.158132,Jeremy,White,9443 Cynthia Court Apt. 038,Boulder,MT,59632,...,0,0,0,0,0,0,0,0,0,1
4,4,2019-01-01 00:03:06,375534208663984,-0.177094,Tyler,Garcia,408 Bradley Rest,Doe Hill,VA,24433,...,0,0,0,0,1,0,0,0,0,1


In [11]:
train_data.isnull().sum()

Unnamed: 0                0
trans_date_trans_time     0
cc_num                    0
amt                       0
first                     0
                         ..
category_personal_care    0
category_shopping_net     0
category_shopping_pos     0
category_travel           0
gender_M                  0
Length: 726, dtype: int64

In [13]:
train_data.describe()

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,...,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_M
count,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,...,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0
mean,648337.0,4.17192e+17,-1.406616e-17,48800.67,-2.347747e-16,-4.436413e-15,4.588432e-16,1349244000.0,38.53734,-90.22646,...,0.06623017,0.09494669,0.08717296,0.04880714,0.0614302,0.06999287,0.07522548,0.08997783,0.03123913,0.4525513
std,374318.0,1.308806e+18,1.0,26893.22,1.0,1.0,1.0,12841280.0,5.109788,13.77109,...,0.2486841,0.2931414,0.2820885,0.2154647,0.2401178,0.2551351,0.2637549,0.2861501,0.1739634,0.4977437
min,0.0,60416210000.0,-0.4325897,1257.0,-3.646814,-5.483362,-0.2940871,1325376000.0,19.02779,-166.6712,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,324168.5,180042900000000.0,-0.3786337,26237.0,-0.771724,-0.4776241,-0.2917027,1338751000.0,34.73357,-96.89728,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,648337.0,3521417000000000.0,-0.1424127,48174.0,0.1608963,0.1998271,-0.2860297,1349250000.0,39.36568,-87.43839,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,972505.5,4642255000000000.0,0.07977349,72042.0,0.6703917,0.7317598,-0.2268423,1359385000.0,41.95716,-80.2368,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1296674.0,4.992346e+18,180.1352,99783.0,5.547035,1.619007,9.332066,1371817000.0,67.51027,-66.9509,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
.fillna(X.mean(), inplace=True)

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [7]:
file['is_fraud'].value_counts()

0    1289169
1       7506
Name: is_fraud, dtype: int64

In [9]:
normal=file[file.is_fraud==0]

In [10]:
fraud=file[file.is_fraud==1]

In [11]:
print(normal.shape)

(1289169, 23)


In [13]:
normal.amt.describe()

count    1.289169e+06
mean     6.766711e+01
std      1.540080e+02
min      1.000000e+00
25%      9.610000e+00
50%      4.728000e+01
75%      8.254000e+01
max      2.894890e+04
Name: amt, dtype: float64

In [14]:
file.groupby('is_fraud').mean()

Unnamed: 0_level_0,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long
is_fraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,648473.169029,4.172901e+17,67.66711,48805.107481,38.536888,-90.228142,88775.228137,1349249000.0,38.536659,-90.228274
1,624949.724354,4.003577e+17,531.320092,48038.714229,38.663609,-89.916041,97276.763256,1348389000.0,38.653901,-89.915808


In [15]:
normal_sample=normal.sample(n=492)

In [16]:
new_file=pd.concat([normal_sample,fraud],axis=0)

In [17]:
new_file.head(10)

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
223942,223942,2019-04-22 23:36:44,2242542703101233,fraud_Swift PLC,kids_pets,22.24,Samuel,Jenkins,M,43235 Mckenzie Views Apt. 837,...,38.4921,-85.4524,564,Pensions consultant,1996-04-10,194962ce574dd01a03c9de1e2c8347be,1335137804,37.515297,-84.892945,0
191740,191740,2019-04-08 20:08:57,213125815021702,"fraud_Walter, Hettinger and Kessler",personal_care,2.46,Adam,Kirk,M,40847 Stark Junctions,...,42.074,-74.453,397,Psychiatrist,1931-09-12,71aa687cff9bc8607f0aa13bb633d4a7,1333915737,42.065915,-73.662801,0
422519,422519,2019-07-08 18:34:12,4800395067176717,fraud_Jacobi Inc,health_fitness,130.35,Daniel,Owens,M,88794 Mandy Lodge Apt. 874,...,41.6964,-96.9858,1063,Research scientist (maths),1928-04-02,d1064b95773e2dd3499ad175a187f97c,1341772452,42.37462,-96.225112,0
299358,299358,2019-05-26 12:55:44,3599237318576484,"fraud_Hahn, Bahringer and McLaughlin",personal_care,107.91,Jessica,Smith,F,06808 Ryan Valleys,...,36.8421,-85.5396,341,Insurance risk surveyor,1932-11-19,c3d428c9eeae7f5f9356988843f2a230,1338036944,35.97739,-85.608071,0
343834,343834,2019-06-10 19:36:26,180042946491150,fraud_Altenwerth-Kilback,home,102.52,Charles,Robles,M,3337 Lisa Divide,...,27.7898,-82.7243,341043,"Engineer, land",1989-02-28,4a5285e95aa9f1d322036db4a6b5eef8,1339356986,27.069619,-83.04876,0
1221244,1221244,2020-05-25 15:06:25,4292743669224718067,fraud_Hudson-Grady,shopping_pos,403.37,Michael,Williams,M,35822 Clayton Street Apt. 679,...,38.2674,-76.4954,5927,Art therapist,1973-06-09,5aac94ee86cbe99480a5e8452e30bcef,1369494385,38.321369,-77.297218,0
808887,808887,2019-12-06 18:28:29,38588538868506,fraud_Veum-Koelpin,travel,9.44,Jacqueline,Curry,F,3047 Jeff Place,...,30.1886,-103.2214,498,Lexicographer,1990-11-23,3071ea39dc53ef20f7c4f385dd343e88,1354818509,30.776974,-104.159539,0
563326,563326,2019-08-27 19:36:35,4424338559877976,"fraud_Reinger, Weissnat and Strosin",food_dining,37.11,Denise,Barnett,F,23220 Eaton Harbors,...,40.813,-83.4196,118,Private music teacher,1957-11-12,0791b9245562d9d8b827918c89281abd,1346096195,39.847988,-83.069263,0
496068,496068,2019-08-04 10:17:52,36722699017270,"fraud_Swift, Bradtke and Marquardt",grocery_net,62.08,Jessica,Perez,F,8172 Robertson Parkways Suite 072,...,33.2887,-111.0985,2872,Petroleum engineer,1987-10-28,0fcdc38041b7571673655cb4cc1fb4d4,1344075472,33.069562,-111.647663,0
721713,721713,2019-11-04 05:03:49,3534718226968689,"fraud_Connelly, Reichert and Fritsch",gas_transport,71.31,Lisa,Lopez,F,32343 Saunders Course,...,37.3712,-89.1349,2263,Scientific laboratory technician,1984-09-13,340bb2372f548ddc7af46c392daab680,1352005429,37.157272,-88.347225,0


In [18]:
new_file['is_fraud'].value_counts()

1    7506
0     492
Name: is_fraud, dtype: int64

In [19]:
new_file.groupby('is_fraud').mean()

Unnamed: 0_level_0,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long
is_fraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,656399.223577,4.68054e+17,68.064634,49907.418699,38.533626,-91.018275,97409.674797,1349529000.0,38.502217,-90.970559
1,624949.724354,4.003577e+17,531.320092,48038.714229,38.663609,-89.916041,97276.763256,1348389000.0,38.653901,-89.915808


In [14]:
X=train_data.drop(columns='is_fraud',axis=1)

In [15]:
Y=train_data['is_fraud']

In [16]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

In [17]:
model = LogisticRegression(max_iter=1000)  # Increase the max_iter value


In [19]:
X_train = X_train.drop('trans_date_trans_time', axis=1)


In [25]:
print(X_train.columns)


Index(['Unnamed: 0', 'cc_num', 'amt', 'first', 'last', 'street', 'city',
       'state', 'zip', 'lat',
       ...
       'category_health_fitness', 'category_home', 'category_kids_pets',
       'category_misc_net', 'category_misc_pos', 'category_personal_care',
       'category_shopping_net', 'category_shopping_pos', 'category_travel',
       'gender_M'],
      dtype='object', length=724)


In [22]:
X_train.head()

Unnamed: 0.1,Unnamed: 0,cc_num,amt,first,last,street,city,state,zip,lat,...,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_M
584742,584742,4908846471916297,-0.052465,Lauren,Torres,03030 White Lakes,Grandview,TX,76050,-1.233247,...,0,0,0,0,0,0,0,0,0,0
1217264,1217264,4128730454058057622,-0.186638,Monique,Martin,68276 Matthew Springs,Ratcliff,TX,75858,-1.409495,...,1,0,0,0,0,0,0,0,0,0
813113,813113,3501509250702469,-0.427974,Frank,Anderson,0611 Stafford Valley Suite 504,Naples,FL,34112,-2.446748,...,0,0,0,0,0,1,0,0,0,1
1012494,1012494,4355790796238264643,-0.276461,Tanner,Davis,2632 Stevens Light Apt. 213,Payson,IL,62360,0.251719,...,0,0,0,0,0,0,0,0,0,1
727914,727914,3567527758368741,-0.3132,Amanda,Vance,14601 Downs Skyway Apt. 440,Sterling City,TX,76951,-1.320484,...,0,0,0,0,0,0,0,1,0,0


In [29]:
# Drop columns with string values
cols_to_drop = X_train.select_dtypes(include=['object']).columns
X_train = X_train.drop(cols_to_drop, axis=1)


In [30]:
X_train.head()

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,...,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_M
584742,584742,4908846471916297,-0.052465,76050,-1.233247,-0.509392,-0.274707,1346819396,31.955029,-96.403268,...,0,0,0,0,0,0,0,0,0,0
1217264,1217264,4128730454058057622,-0.186638,75858,-1.409495,-0.351446,-0.294021,1369400366,31.326306,-95.649501,...,1,0,0,0,0,0,0,0,0,0
813113,813113,3501509250702469,-0.427974,34112,-2.446748,0.617065,0.619883,1354896658,25.70909,-82.198043,...,0,0,0,0,0,1,0,0,0,1
1012494,1012494,4355790796238264643,-0.276461,62360,0.251719,-0.075322,-0.288679,1361476207,39.48378,-91.871868,...,0,0,0,0,0,0,0,0,0,1
727914,727914,3567527758368741,-0.3132,76951,-1.320484,-0.783146,-0.290378,1352276688,32.027674,-101.959026,...,0,0,0,0,0,0,0,1,0,0


In [31]:
model.fit(X_train,Y_train)

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [34]:
cols_to_drop = X_test.select_dtypes(include=['object']).columns
X_test = X_test.drop(cols_to_drop, axis=1)

In [35]:

# Step 6: Model Evaluation

# Make predictions on the testing data
y_pred = model.predict(X_test)


In [36]:

# Calculate accuracy
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")



Accuracy: 0.99


In [37]:
# Create a confusion matrix
confusion_mat = confusion_matrix(Y_test, y_pred)
print("Confusion Matrix:")
print(confusion_mat)


Confusion Matrix:
[[257834      0]
 [  1501      0]]


In [38]:
# Generate a classification report
classification_rep = classification_report(Y_test, y_pred)
print("Classification Report:")
print(classification_rep)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    257834
           1       0.00      0.00      0.00      1501

    accuracy                           0.99    259335
   macro avg       0.50      0.50      0.50    259335
weighted avg       0.99      0.99      0.99    259335



  _warn_prf(average, modifier, msg_start, len(result))
