In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from geopy import distance
import geopandas
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

import keras 
from keras.models import Sequential
from keras.layers import Dense, Dropout
import tensorflow as tf
from keras.utils import np_utils

# Assignment 4 - Simple Neural Networks

For this assigment you'll do a realistic task - predicting fraud from transaction data. 
### Some Things to Note

<ul>
<li> The dataset is imbalanced. See: https://www.tensorflow.org/tutorials/structured_data/imbalanced_data for some ideas
<li> The locations, time, dob all likely aren't super useful on their own, but can be made into something more useful without much code or trouble. Think about how it may be useful to represent them. The data doesn't have missing rows, so this is the main data prep portion. 
<li> With respect to the above, and the other data here, we have a lot of rows of data. That means that we can generally handle data that is reasonably wide...
</ul>

### Deliverables

Your final goal is to produce a function that can be called to classify a transaction:
<ul>
<li> Please submit two .ipynb files - one where you did your work, and another that can use your model to make predictions. 
<li> In that prediction file, please ensure:
    <ul>
    <li> You have a function where I can load a file, and the end result is a classificaiton matrix of your prediction accuracy. 
    <li> You load a trained model. There's no training here. 
    <li> Any data prep stuff that is needed for your data should be built in here. I'm going to run a test file that is the exact same setup as the training data.
    <li> I should be able to open the prediction file, load the test data, and click RUN ALL and things should work. 
    <li> In addition to that, please include a short (~1-2 paragraph) description of what you did. Include anything that was innovative/different as well as a note on:
        <ul>
        <li> Any imbalanced data steps. 
        <li> Treatment of the location and time variables. What did you do to them?
        <li> Model structure (layers/size)
        <li> Any optimization steps included - regularization, dropouts, feature selection, etc...
        </ul>
    </ul>
</ul>

### Grades

The grade breakdown is as follows:

<ul>
<li> Code preduces predictions - 40
<li> Accuracy - 30
<li> Explaination - 20
<li> Balance/variable transformations - 10
</ul>



In [4]:
# Load some data
df = pd.read_csv("https://jrssbcrsefilesnait.blob.core.windows.net/3950data1/fraudTrain.csv.zip")
df.drop(columns={"Unnamed: 0"}, inplace=True)
df.head()


Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [5]:
df.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
trans_date_trans_time,1296675.0,1274791.0,2019-04-22 16:02:01,4.0,,,,,,,
cc_num,1296675.0,,,,4.17192042079641e+17,1.3088064470007892e+18,60416207185.0,180042946491150.0,3521417320836166.0,4642255475285942.0,4.992346398065154e+18
merchant,1296675.0,693.0,fraud_Kilback LLC,4403.0,,,,,,,
category,1296675.0,14.0,gas_transport,131659.0,,,,,,,
amt,1296675.0,,,,70.351035,160.316039,1.0,9.65,47.52,83.14,28948.9
first,1296675.0,352.0,Christopher,26669.0,,,,,,,
last,1296675.0,481.0,Smith,28794.0,,,,,,,
gender,1296675.0,2.0,F,709863.0,,,,,,,
street,1296675.0,983.0,0069 Robin Brooks Apt. 695,3123.0,,,,,,,
city,1296675.0,894.0,Birmingham,5617.0,,,,,,,


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 22 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   trans_date_trans_time  1296675 non-null  object 
 1   cc_num                 1296675 non-null  int64  
 2   merchant               1296675 non-null  object 
 3   category               1296675 non-null  object 
 4   amt                    1296675 non-null  float64
 5   first                  1296675 non-null  object 
 6   last                   1296675 non-null  object 
 7   gender                 1296675 non-null  object 
 8   street                 1296675 non-null  object 
 9   city                   1296675 non-null  object 
 10  state                  1296675 non-null  object 
 11  zip                    1296675 non-null  int64  
 12  lat                    1296675 non-null  float64
 13  long                   1296675 non-null  float64
 14  city_pop          

In [24]:
# Convert the trans_date_trans_time column to a datetime object
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

# Extract the date (day, month, and year) from the datetime object
df['trans_date'] = df['trans_date_trans_time'].dt.date

# Drop the original trans_date_trans_time column
df.drop(columns=['trans_date_trans_time'], inplace=True)

In [25]:
df.shape

(1296675, 23)

In [26]:
df.isnull().sum()

Unnamed: 0    0
cc_num        0
merchant      0
category      0
amt           0
first         0
last          0
gender        0
street        0
city          0
state         0
zip           0
lat           0
long          0
city_pop      0
job           0
dob           0
trans_num     0
unix_time     0
merch_lat     0
merch_long    0
is_fraud      0
trans_date    0
dtype: int64

### Deal with Lat/Lon

We can utilize lat/lon of the home and merchant in a useful way?

Note: I left the section headers in from when I did it. You can remove them if you want. 

### Deal with Time

Can we make date/time and the date of birth into something useful?

### Check Target Balance

In [27]:
from imblearn.over_sampling import RandomOverSampler

# Define the oversampling strategy
oversample = RandomOverSampler(sampling_strategy=0.5)

# Split the dataset into input features and target variable
X = df.drop('is_fraud', axis=1)
y = df['is_fraud']

# Oversample the minority class
X_resampled, y_resampled = oversample.fit_resample(X, y)

# Count the number of instances in each class
num_fraudulent = len(y_resampled[y_resampled == 1])
num_non_fraudulent = len(y_resampled[y_resampled == 0])

# Print the results
print("Number of fraudulent transactions:", num_fraudulent)
print("Number of non-fraudulent transactions:", num_non_fraudulent)


Number of fraudulent transactions: 644584
Number of non-fraudulent transactions: 1289169


#### DATA IMBALANCE:
From above the number shows that  the dataset is highly imbalanced. There are 644,584 fraudulent transactions and 1,289,169 non-fraudulent transactions. The number of non-fraudulent transactions is more than twice the number of fraudulent transactions. The fraudulent transactions are a small percentage of the overall transactions.
Therefore, it is important to address this imbalance in the dataset before building a machine learning model. This can be done using techniques such as oversampling, undersampling, or a combination of both. The goal is to balance the dataset so that the model has an equal number of instances from each class, which can help improve its performance on the minority class.

### Prepare Data


Oversampling: Oversampling is used here to creating more instances of the minority class (in this case, the fraudulent transactions) to balance the dataset. This can be done using techniques such as Random Oversampling, SMOTE (Synthetic Minority Over-sampling Technique).
#### SMOTE 
SMOTE(Synthetic Minority Over-sampling Technique) is an oversampling technique used to deal with imbalanced datasets, such as fraud detection datasets, where the minority class (in this case, fraudulent transactions) is significantly smaller than the majority class. The goal of SMOTE is to create new synthetic samples of the minority class by interpolating between existing minority class samples.<br>
SMOTE is useful in fraud detection because it helps to balance the dataset and improve the performance of machine learning models trained on imbalanced datasets. By oversampling the minority class, SMOTE ensures that the machine learning model has more examples of the minority class to learn from, which can help the model to better distinguish between the minority and majority classes. Additionally, by creating synthetic samples, SMOTE helps to avoid overfitting that can occur with simple oversampling techniques.

In [28]:
from imblearn.over_sampling import SMOTE

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   Unnamed: 0  1296675 non-null  int64  
 1   cc_num      1296675 non-null  int64  
 2   merchant    1296675 non-null  object 
 3   category    1296675 non-null  object 
 4   amt         1296675 non-null  float64
 5   first       1296675 non-null  object 
 6   last        1296675 non-null  object 
 7   gender      1296675 non-null  object 
 8   street      1296675 non-null  object 
 9   city        1296675 non-null  object 
 10  state       1296675 non-null  object 
 11  zip         1296675 non-null  int64  
 12  lat         1296675 non-null  float64
 13  long        1296675 non-null  float64
 14  city_pop    1296675 non-null  int64  
 15  job         1296675 non-null  object 
 16  dob         1296675 non-null  object 
 17  trans_num   1296675 non-null  object 
 18  unix_time   1296675 no

In [31]:


from imblearn.over_sampling import SMOTE



# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Oversample the minority class using SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check the class distribution after oversampling
print("Number of fraudulent transactions:", sum(y_train_resampled == 1))
print("Number of non-fraudulent transactions:", sum(y_train_resampled == 0))


ValueError: could not convert string to float: 'fraud_Schumm PLC'

### Split Data

### Model