In [1]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from website.pipeline import clean_data,score_model

In [2]:
raw = pd.read_json('data/data.json')
raw.head()

Unnamed: 0,acct_type,approx_payout_date,body_length,channels,country,currency,delivery_method,description,email_domain,event_created,...,ticket_types,user_age,user_created,user_type,venue_address,venue_country,venue_latitude,venue_longitude,venue_name,venue_state
0,fraudster_event,1266062400,3852,5,US,USD,0.0,"<p><a href=""http://s432.photobucket.com/albums...",gmail.com,1262739706,...,"[{'event_id': 527017, 'cost': 25.0, 'availabil...",36,1259613950,1,717 Washington Avenue,US,25.777471,-80.133433,INK Nightclub - South Beach,FL
1,premium,1296720000,3499,0,US,USD,1.0,"<p>Join us for a quick, one-night, community-b...",ruf.org,1293832670,...,"[{'event_id': 786878, 'cost': 35.0, 'availabil...",149,1280942776,3,,US,32.776566,-79.930922,"The Charleston, SC area",SC
2,premium,1296172800,2601,8,US,USD,1.0,"<h3><span class=""subcategory""><strong>Teacher ...",pvsd.k12.ca.us,1291090956,...,"[{'event_id': 787337, 'cost': 93.51, 'availabi...",214,1272559388,3,10100 Pioneer Blvd Suite 100,US,33.944201,-118.080419,Los Angeles County Office of Education,CA
3,premium,1388966400,12347,6,IE,EUR,1.0,"<p style=""margin-bottom: 1.3em; padding-bottom...",irishtabletennis.com,1360681570,...,"[{'event_id': 885645, 'cost': 25.0, 'availabil...",889,1283870102,3,,,,,,
4,premium,1297900800,2417,11,US,USD,0.0,<p>Writers and filmmakers need to understand t...,artsandbusinesscouncil.org,1291994666,...,"[{'event_id': 1114349, 'cost': 150.0, 'availab...",35,1288984065,3,One Marina Park Drive,US,42.353848,-71.044276,Fish & Richardson,MA


In [3]:
raw['fraud'] = raw.acct_type.apply(lambda x: ('fraud' in x)*1)
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14337 entries, 0 to 14336
Data columns (total 45 columns):
acct_type             14337 non-null object
approx_payout_date    14337 non-null int64
body_length           14337 non-null int64
channels              14337 non-null int64
country               14256 non-null object
currency              14337 non-null object
delivery_method       14321 non-null float64
description           14337 non-null object
email_domain          14337 non-null object
event_created         14337 non-null int64
event_end             14337 non-null int64
event_published       14238 non-null float64
event_start           14337 non-null int64
fb_published          14337 non-null int64
gts                   14337 non-null float64
has_analytics         14337 non-null int64
has_header            8928 non-null float64
has_logo              14337 non-null int64
listed                14337 non-null object
name                  14337 non-null object
name_length      

### Let's Create a DataFrame with the features that make sense for our model

In [4]:
features = raw[['body_length','user_age','sale_duration','fraud']]
features.fillna(0,inplace=True)
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14337 entries, 0 to 14336
Data columns (total 4 columns):
body_length      14337 non-null int64
user_age         14337 non-null int64
sale_duration    14337 non-null float64
fraud            14337 non-null int64
dtypes: float64(1), int64(3)
memory usage: 448.1 KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [5]:
features.fraud.value_counts()

0    13044
1     1293
Name: fraud, dtype: int64

### Fraud accounts for 9% of our data. This means if we called everything "fraud" we would get an accuracy score of 91%. Therefore, our model should do better.

In [6]:
X = features.drop('fraud',axis=1)
y = features.fraud

In [7]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score,precision_score, recall_score,roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.3,random_state=29)

### Use pipeline to build the scaler into our model, this makes it easier for deployment later

In [9]:
gbc = Pipeline([('scaler', StandardScaler()),
                ('clf', GradientBoostingClassifier(learning_rate=.01,
                                                   random_state=29,verbose=1))
               ])

gbc.fit(X_train,y_train)
%timeit

      Iter       Train Loss   Remaining Time 
         1           0.5985            0.56s
         2           0.5894            0.55s
         3           0.5810            0.52s
         4           0.5731            0.50s
         5           0.5657            0.49s
         6           0.5587            0.49s
         7           0.5521            0.49s
         8           0.5458            0.48s
         9           0.5399            0.47s
        10           0.5342            0.46s
        20           0.4883            0.39s
        30           0.4551            0.33s
        40           0.4294            0.27s
        50           0.4089            0.23s
        60           0.3924            0.18s
        70           0.3786            0.14s
        80           0.3667            0.09s
        90           0.3562            0.05s
       100           0.3471            0.00s


In [10]:
y_pred = gbc.predict(X_test)
print(precision_score(y_test,y_pred))
print(recall_score(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test, y_pred))

0.8333333333333334
0.5091383812010444
0.9472338447233845
[[3880   39]
 [ 188  195]]


### Our model has a better accuracy than random, and decent precision, but in this scenario, the goal is to get a good recall score.

In [11]:
y_pred = score_model(gbc, X_test, y_test, threshold = .065)
print(confusion_matrix(y_test, y_pred))

Precision: 0.48
Recall: 0.83
Accuracy: 0.90
Threshold: 0.07
[[3571  348]
 [  66  317]]


#### With this threshold, we capture 82.8% of all fraudulent cases, this shall become our production model

In [13]:
import pickle
pickle.dump(gbc, open('src/gbc.pkl', 'wb'))