In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [131]:
X_train = pd.read_csv('data/X_train.csv')

In [132]:
merchant_stats = X_train[X_train['is_fraud'] == 0].groupby('merchant').agg(mean_amt=('amt', np.mean), std_amt=('amt', np.std)).to_dict()
fraud_category_rankings = {
    'shopping_net': 0,
    'grocery_pos': 1,
    'misc_net': 2,
    'shopping_pos': 3,
    'gas_transport': 4,
    'misc_pos': 5,
    'personal_care': 6,
    'home': 7,
    'kids_pets': 8,
    'entertainment': 9,
    'health_fitness': 10,
    'food_dining': 11,
    'grocery_net': 12,
    'travel': 13
}
def frauds_by_feature(feature):
    frauds_feature_stats = X_train[X_train['is_fraud'] == 1].groupby(by=feature).agg({'cc_num': 'count'})
    feature_stats = X_train.groupby(by=feature).agg({'cc_num': 'count'})
    fraud_rate_df = frauds_feature_stats.merge(feature_stats, how='outer', on=feature, suffixes=('_fraud', '_all'))
    fraud_rate_df['fraud_rate'] = fraud_rate_df['cc_num_fraud'].fillna(0)/fraud_rate_df['cc_num_all']
    fraud_rate_df.sort_values(by='fraud_rate', ascending=False, inplace=True)
    return fraud_rate_df

# frauds_by_city = frauds_by_feature('city').to_dict()
frauds_by_zip = frauds_by_feature('zip').to_dict()
frauds_by_cat = frauds_by_feature('category').to_dict()
frauds_by_merc = frauds_by_feature('merchant').to_dict()

In [133]:
def transform(df):
#     df['cat_fraud_rate'] = df['category'].map(frauds_by_cat['fraud_rate'])
    df['category'] = df['category'].map(fraud_category_rankings)
    df['hour'] = pd.to_datetime(df['unix_time'],unit='s').dt.hour
#     df['month'] = pd.to_datetime(df['unix_time'],unit='s').dt.month
#     df['is_night'] = df['hour'].map(lambda x: 1 if x in [21, 22, 23, 0, 1, 2, 3] else 0)
#     df['is_expensive'] = df['amt'].map(lambda x: 1 if x>300 else 0)
    dates = pd.to_datetime(df['unix_time'],unit='s').dt.date
    dobs = pd.to_datetime(df['dob']).dt.date
    df['age'] = (dates-dobs) / pd.Timedelta(days=365)
#     df['city_fraud_rate'] = df['city'].map(frauds_by_city['fraud_rate'])
    df['zip_fraud_rate'] = df['zip'].map(frauds_by_zip['fraud_rate'])
    df['merc_fraud_rate'] = df['merchant'].map(frauds_by_merc['fraud_rate'])
    
    def add_merchant_outlier_column(df):
        zipped_merchant_amt_column = pd.Series(zip(df['merchant'], df['amt']))
        def merchant_outlier_column(merchant_amt):
            merchant, amt = merchant_amt
            if merchant not in merchant_stats['mean_amt'] or np.isnan(merchant_stats['std_amt'][merchant]):
                return 0
            mn, stddev = merchant_stats['mean_amt'][merchant], merchant_stats['std_amt'][merchant]
            return (amt-mn)/stddev
        outlier_column = zipped_merchant_amt_column.map(merchant_outlier_column)
        df['merc_outlier'] = outlier_column

    add_merchant_outlier_column(df)
    
    return df

In [134]:
def prep_for_prediction(df):
    df = transform(df)
    return df[['category', 'hour', 'amt',
               'zip_fraud_rate', 'merc_fraud_rate', 'merc_outlier',
               'is_fraud']]
#     return df._get_numeric_data()

In [135]:
X_train = prep_for_prediction(X_train)

  dobs = pd.to_datetime(df['dob']).dt.date


In [136]:
X_train, X_test, Y_train, Y_test = train_test_split(
        X_train.drop(['is_fraud'], axis=1),
        X_train['is_fraud'],
        test_size=1/20,
        random_state=0
    )

In [137]:
model = RandomForestClassifier(class_weight='balanced', max_features=None).fit(X_train, Y_train)
# model = DecisionTreeClassifier(class_weight='balanced', max_features=None).fit(X_train, Y_train)

In [138]:
# check importances of each feature (for decision tree + random forest)
from pprint import pprint
pprint(sorted(list(zip(X_train.columns, model.feature_importances_)), key=(lambda x: (x[1], x[0])), reverse=True))

[('zip_fraud_rate', 0.6489403201753564),
 ('amt', 0.16620472763201152),
 ('merc_outlier', 0.08454432060608562),
 ('hour', 0.03657349616878997),
 ('merc_fraud_rate', 0.03312727318599082),
 ('category', 0.030609862231765622)]


In [139]:
predictions = model.predict(X_test)
print("Accuracy on testing set = ", accuracy_score(Y_test, predictions))
print("F1 score on testing set = ", f1_score(Y_test, predictions))

Accuracy on testing set =  0.9990540040307654
F1 score on testing set =  0.877005347593583


In [140]:
# target fraction of frauds
print((Y_train.sum() + Y_test.sum())/(len(X_test)+len(X_train)))

0.003860122487424268


In [141]:
predictions.sum() / len(predictions)

0.0036605930983424507

In [142]:
# load/transform testing data
X_predict = pd.read_csv('data/X_test.csv')
submission = X_predict[['Id']]
X_predict = prep_for_prediction(X_predict).drop(columns=['is_fraud'])

  dobs = pd.to_datetime(df['dob']).dt.date


In [143]:
predictions = model.predict(X_predict)
predictions.sum() / len(predictions)

0.003167062549485352

In [144]:
# export
submission['is_fraud'] = predictions
submission.to_csv('data/submission.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['is_fraud'] = predictions
