In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

### TRAINING DATASET

In [None]:
# Read the two raw training tables (transactions and identity records) from CSV.
# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs where this directory exists.
train_transaction_df = pd.read_csv(
    filepath_or_buffer='/Users/mayurdeo/Downloads/ieee-fraud-detection/train_transaction.csv',
)
train_identity_df = pd.read_csv(
    filepath_or_buffer='/Users/mayurdeo/Downloads/ieee-fraud-detection/train_identity.csv',
)

### PREPROCESSING

In [None]:
# Left-join the identity table onto the transaction table by TransactionID, so
# every transaction row is kept whether or not it has an identity record.
df_train = pd.merge(
    train_transaction_df,
    train_identity_df,
    how='left',
    on='TransactionID',
)

In [None]:
# Names of the columns the notebook treats as categorical: a fixed list of
# product/card/address/email/device fields, followed by M1..M9 and id_12..id_38.
cat_cols = [
    'ProductCD',
    'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
    'addr1', 'addr2',
    'P_emaildomain', 'R_emaildomain',
    'DeviceType', 'DeviceInfo',
]
cat_cols.extend(f'M{n}' for n in range(1, 10))
cat_cols.extend(f'id_{n}' for n in range(12, 39))
# Every column that is not in cat_cols is treated as numeric. The list keeps the
# frame's own column order: building it through set arithmetic would give an
# arbitrary order that can change from one kernel start to the next.
# Note that this still contains the join key `TransactionID` and the target
# `isFraud`; later cells remove `isFraud` where it must not appear.
num_cols = df_train.columns[~df_train.columns.isin(cat_cols)].tolist()

In [None]:
# Boolean Series indexed by numeric column name: True where that column holds
# at least one missing value.
has_nan = df_train[num_cols].isna().any()
# Keep only the names of the columns flagged above.
train_null_num_cols = has_nan.index[has_nan.to_numpy()]

In [None]:
# Median-impute the numeric columns that contain NaN.
# For each such column this:
#   * adds a boolean `<column>_isna` indicator recording which rows were missing,
#   * replaces the missing values with the column median,
#   * stores the median in `nas` so the test frame can be filled with the same value.
nas = {}
for n in train_null_num_cols:
    df_train[f'{n}_isna'] = df_train[n].isnull()
    median = df_train[n].median()
    # Assign the filled column back. `df_train[n].fillna(median, inplace=True)`
    # is chained assignment: it operates on a temporary object and, with pandas
    # Copy-on-Write, leaves df_train unchanged.
    df_train[n] = df_train[n].fillna(median)
    nas[n] = median

In [None]:
# Collect the numeric columns whose values are all whole numbers; these are the
# candidates for down-casting to a narrow integer dtype further down.
#
# The check is vectorised (`value % 1 == 0`) instead of calling
# `float.is_integer` on every element: the element-wise call is slow and raises
# for columns that already have an integer dtype, which previously dropped them
# through a broad `except`. Integer-dtype columns are now included.
integer_cols = []
for c in num_cols:
    column = df_train[c]
    if pd.api.types.is_bool_dtype(column) or not pd.api.types.is_numeric_dtype(column):
        # Report the skip explicitly rather than swallowing an exception.
        print("skipping non-numeric column:", c)
        continue
    # NaN is mapped to -1.0 so a missing value does not disqualify the column;
    # +/-inf yields NaN under `% 1` and therefore fails the test.
    if (column.fillna(-1.0) % 1 == 0).all():
        integer_cols.append(c)

In [None]:
# Summary statistics for the whole-number columns, transposed so each row is a
# column of df_train and each statistic (count, mean, min, max, ...) is a field.
integer_frame = df_train[integer_cols]
stats = integer_frame.describe().T

In [None]:
###Reducing Memory Complexity
# Pick, per whole-number column, the narrowest signed integer dtype whose range
# covers every value. Both ends are checked against the real dtype limits:
# int8 holds -128..127 (not 0..255), so bucketing on `max < 256` alone would
# make the later `astype('int8')` wrap values of 128 and above, and a column
# with a large negative minimum would be corrupted in the same way.
int8_limits = np.iinfo(np.int8)
int16_limits = np.iinfo(np.int16)
fits_int8 = (stats['min'] >= int8_limits.min) & (stats['max'] <= int8_limits.max)
fits_int16 = (stats['min'] >= int16_limits.min) & (stats['max'] <= int16_limits.max)
int8columns = stats[fits_int8].index
# Columns that fit int16 but not int8; anything wider keeps its current dtype.
int16columns = stats[fits_int16 & ~fits_int8].index

In [None]:
# Down-cast the selected training columns: first the int8 bucket, then the
# int16 bucket, one column at a time.
for target_dtype, selected in (('int8', int8columns), ('int16', int16columns)):
    for c in selected:
        df_train[c] = df_train[c].astype(target_dtype)

In [None]:
# Total bytes pandas reports for the training frame (index included, shallow
# accounting), shown as the cell output.
df_train.memory_usage(index=True, deep=False).sum()

### TESTING DATASET

In [None]:
# Read the raw test tables and left-join identity onto transactions, mirroring
# how the training frame is built.
# NOTE(review): absolute, machine-specific paths.
test_transaction_df = pd.read_csv('/Users/mayurdeo/Downloads/ieee-fraud-detection/test_transaction.csv')  ####inputing test data
test_identity_df = pd.read_csv('/Users/mayurdeo/Downloads/ieee-fraud-detection/test_identity.csv')
# NOTE(review): the published test identity file is reported to name its id
# columns with a hyphen (`id-01` ...), while every later cell addresses them as
# `id_XX` -- confirm against the file. Normalising here keeps the train and test
# column names aligned and is a no-op when the names already use underscores.
test_identity_df = test_identity_df.rename(columns=lambda name: name.replace('id-', 'id_'))
df_test = test_transaction_df.merge(test_identity_df, on='TransactionID', how='left')

In [None]:
# Apply the training-set imputation to the test frame: for every column that
# was imputed in training, add the matching `<column>_isna` indicator and fill
# missing values with the median stored in `nas`.
for k, v in nas.items():
    if k not in df_test.columns:
        # A column imputed in training may not exist in the test frame
        # (the target, for example); there is nothing to fill in that case.
        continue
    df_test[f'{k}_isna'] = df_test[k].isnull()
    # Assign back instead of a chained `fillna(..., inplace=True)`, which would
    # act on a temporary and leave df_test unchanged under Copy-on-Write.
    df_test[k] = df_test[k].fillna(v)

In [None]:
# Numeric columns of the test frame: the training list without the target.
# A comprehension keeps the order of num_cols, whereas set subtraction would
# return the names in an arbitrary, run-dependent order.
test_num_cols = [name for name in num_cols if name != 'isFraud']
# Names of the test numeric columns that still contain NaN after the fill above.
a = df_test[test_num_cols].isnull().any()
test_null_num_cols = a[a].index

In [None]:
# Columns that had no NaN in training, and so have no entry in `nas`, can still
# have gaps in the test frame; fill those with the training column's median.
for n in test_null_num_cols:
    # Assign back rather than using a chained `fillna(..., inplace=True)`, which
    # would modify a temporary and leave df_test unchanged under Copy-on-Write.
    df_test[n] = df_test[n].fillna(df_train[n].median())

In [None]:
integer_cols = []                ####Reducing memory complexity
# Step 1: find the test numeric columns holding only whole numbers. The check
# is vectorised (`value % 1 == 0`); calling `float.is_integer` per element is
# slow and raises for integer-dtype columns.
for c in test_num_cols:
    column = df_test[c]
    if pd.api.types.is_bool_dtype(column) or not pd.api.types.is_numeric_dtype(column):
        print("skipping non-numeric column:", c)
        continue
    if (column.fillna(-1.0) % 1 == 0).all():
        integer_cols.append(c)

# Step 2: bucket those columns by the narrowest signed dtype that covers their
# full [min, max] range. int8 is -128..127, so a `max < 256` test alone would
# let `astype('int8')` wrap values of 128 and above.
stats = df_test[integer_cols].describe().transpose()
int8_limits = np.iinfo(np.int8)
int16_limits = np.iinfo(np.int16)
fits_int8 = (stats['min'] >= int8_limits.min) & (stats['max'] <= int8_limits.max)
fits_int16 = (stats['min'] >= int16_limits.min) & (stats['max'] <= int16_limits.max)
int8columns = stats[fits_int8].index
int16columns = stats[fits_int16 & ~fits_int8].index

# Step 3: down-cast each bucket.
for c in int8columns:
    df_test[c] = df_test[c].astype('int8')

for c in int16columns:
    df_test[c] = df_test[c].astype('int16')

In [None]:
### Categorical Variables
## Some of these columns have very high cardinality and the dataset is large, so label encoding is the more practical choice; that is what we use below.

In [None]:
# Replace NaN in every categorical column with the literal label "missing",
# first across the training frame and then across the test frame.
for frame in (df_train, df_test):
    for c in cat_cols:
        frame[c] = frame[c].fillna("missing")

In [None]:
# Convert each categorical column of the training frame to pandas `category`
# dtype, reserve an extra 'unknown' category for values that only occur in the
# test frame, and record the resulting category list per column in `cats`.
cats = {}
for c in cat_cols:
    as_category = df_train[c].astype("category")
    # `add_categories` returns a new Series; its `inplace=` argument was removed
    # in pandas 2.0, and an in-place call on `df_train[c]` would only have
    # touched a temporary. It raises if the category already exists, hence the
    # guard.
    if 'unknown' not in as_category.cat.categories:
        as_category = as_category.cat.add_categories('unknown')
    df_train[c] = as_category
    cats[c] = as_category.cat.categories
    

In [None]:
# In the test frame, replace every value that is not one of the training
# categories for that column with 'unknown'.
for k, v in cats.items():
    # `Series.where` keeps values where the condition holds and substitutes
    # 'unknown' elsewhere, then the result is assigned back. The chained form
    # `df_test[k][mask] = ...` writes to a temporary and does not reliably
    # update df_test.
    df_test[k] = df_test[k].where(df_test[k].isin(v), 'unknown')

In [None]:
from pandas.api.types import CategoricalDtype

# Re-type each categorical test column with the category list captured from the
# training frame, so both frames share one value-to-code mapping when the codes
# are extracted.
for column, categories in cats.items():
    df_test[column] = df_test[column].astype(
        CategoricalDtype(categories=categories, ordered=True)
    )

In [None]:
# Label-encode: replace every categorical column in both frames with its
# integer category codes.
# NOTE: this cell cannot be re-run as is; after the first run the columns are
# plain integers and no longer expose the `.cat` accessor.
for frame in (df_train, df_test):
    for c in cat_cols:
        frame[c] = frame[c].cat.codes

In [None]:
# Positional 80/20 split with no shuffling: the first 80% of rows form the
# training set, the remaining rows the validation set.
idx = int(0.8 * len(df_train))
training_set = df_train.iloc[:idx]
validation_set = df_train.iloc[idx:]

In [None]:
# Separate the target column `isFraud` from the features for both splits.
X_train, y_train = training_set.drop(columns='isFraud'), training_set['isFraud']
X_valid, y_valid = validation_set.drop(columns='isFraud'), validation_set['isFraud']

In [None]:
# Work on the last 100,000 rows of the training split, the rows positioned
# immediately before the validation split.
training_sample = training_set.tail(100000)
X_train_sample = training_sample.drop(columns='isFraud')
y_train_sample = training_sample['isFraud']

### RANDOMFOREST

In [None]:
# Random forest used as a scorer: it is a regressor fitted on the 0/1 `isFraud`
# label, so `predict` returns a continuous value that is then ranked with
# ROC AUC.
# `random_state` is fixed so the bootstrap samples and feature subsets, and
# therefore the reported score and feature importances, are the same on every
# run. 0 matches the seed used for the logistic regression in this notebook.
model = RandomForestRegressor(
    n_estimators=400, max_features=0.3,
    min_samples_leaf=20, n_jobs=-1, verbose=1,
    random_state=0)

In [None]:
# Train the forest on the 100,000-row sample rather than the full training split.
model.fit(X=X_train_sample, y=y_train_sample)

In [None]:
# Score the validation rows with the fitted forest and report ROC AUC.
# The predictions are computed in this cell so that it runs on a fresh kernel
# without depending on a later cell having been executed first.
preds_valid = model.predict(X_valid)
roc_auc_score(y_valid, preds_valid)

### LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline fitted on the full training split, with the
# L-BFGS solver and a fixed seed.
modellr = LogisticRegression(solver='lbfgs', random_state=0)
modellr.fit(X=X_train, y=y_train)

In [None]:
# Evaluate the logistic regression fitted in this section. Scoring `model` (the
# random forest) here would only repeat the forest's result and leave `modellr`
# unevaluated.
# ROC AUC needs a ranking score, so use the predicted probability of the
# positive class (second column of predict_proba) instead of hard 0/1 labels.
# Stored under its own name so the forest's `preds_valid` is not overwritten.
preds_valid_lr = modellr.predict_proba(X_valid)[:, 1]
roc_auc_score(y_valid, preds_valid_lr)

In [None]:
# Bar chart of the N most important features according to the random forest
# (`model`).
N = 10
importances = model.feature_importances_
# Spread of each feature's importance across the individual trees.
# NOTE: computed but not used by the chart below.
per_tree_importances = np.stack([tree.feature_importances_ for tree in model.estimators_])
std = per_tree_importances.std(axis=0)

# One row per feature, ranked by importance, keeping the top N.
importances_df = pd.DataFrame({'variable': X_train.columns, 'importance': importances})
ranked = importances_df.sort_values(by=['importance'], ascending=False)
top_N = ranked.head(N)

sns.barplot(data=top_N, y="variable", x="importance", palette='GnBu_d')

### SVM

In [None]:
from sklearn import svm

# Support-vector classifier with the default kernel, fitted on the full
# training split.
# NOTE(review): SVC training time grows at least quadratically with the number
# of rows, so fitting on the whole split may not be practical -- confirm.
clf = svm.SVC(
    gamma='auto',
    decision_function_shape='ovo',
)
clf.fit(X=X_train, y=y_train)

In [None]:
# Top-N feature-importance chart.
# NOTE(review): this reads `model` (the random forest), not the SVM `clf`
# fitted in this section, so it redraws the forest's importances.
N = 10
importances = model.feature_importances_
# Per-feature standard deviation of importance over the forest's trees
# (computed but not plotted).
tree_importances = [estimator.feature_importances_ for estimator in model.estimators_]
std = np.std(tree_importances, axis=0)

# create a dataframe
importances_df = pd.DataFrame(
    {
        'variable': X_train.columns,
        'importance': importances,
    }
)

top_N = importances_df.sort_values(by=['importance'], ascending=False).iloc[:N]

sns.barplot(x="importance", y="variable", data=top_N, palette='GnBu_d')