## Step by Step
* Import dependencies
* Create global variables 
* Get train and test dataset from Kaggle [Kaggle Fraud Detection](https://www.kaggle.com/c/ieee-fraud-detection/data)
* Reduce memory of dataset
* Split dataset (fraud/normal)
* Show graph comparing data
* Separate dataset for training and validation
* Create Logistic Model 
* Create the prediction

### Import dependencies

In [1]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import reduce_mem;

In [2]:
# Relative paths of the four competition CSV files read by the loading cells.
TRAINING_IDENTITY_CSV_PATH = 'dataset/train_identity.csv'
TRAINING_TRANSACTION_CSV_PATH = 'dataset/train_transaction.csv'
TEST_IDENTITY_CSV_PATH = 'dataset/test_identity.csv'
TEST_TRANSACTION_CSV_PATH = 'dataset/test_transaction.csv'

### Get train and test dataset from Kaggle [Kaggle Fraud Detection](https://www.kaggle.com/c/ieee-fraud-detection/data)

In [3]:
# Read both training tables, then left-join identity onto transactions by
# TransactionID so every transaction row is kept even without an identity record.
df_train_transaction = pd.read_csv(TRAINING_TRANSACTION_CSV_PATH)
df_train_identity = pd.read_csv(TRAINING_IDENTITY_CSV_PATH)
df_train = df_train_transaction.merge(
    df_train_identity,
    how='left',
    on='TransactionID',
)
df_train.shape

(590540, 434)

In [1]:
# Read both test tables, then left-join identity onto transactions by TransactionID.
# Needs `pd` and the TEST_*_CSV_PATH constants from the import and path cells.
df_test_transaction = pd.read_csv(TEST_TRANSACTION_CSV_PATH)
df_test_identity = pd.read_csv(TEST_IDENTITY_CSV_PATH)
df_test = df_test_transaction.merge(
    df_test_identity,
    how='left',
    on='TransactionID',
)
df_test.shape

NameError: name 'pd' is not defined

In [None]:
# Pass the merged training frame through the project-local reduce_mem helper.
# NOTE(review): presumably this downcasts column dtypes to save memory — confirm in reduce_mem.
df_train_reduced_mem = reduce_mem.reduce_mem_usage(df_train)

In [None]:
# Pass the merged test frame through the project-local reduce_mem helper.
# NOTE(review): presumably this downcasts column dtypes to save memory — confirm in reduce_mem.
df_test_reduced_mem = reduce_mem.reduce_mem_usage(df_test)

In [None]:
# From here on, df_train / df_test refer to the frames returned by reduce_mem_usage.
df_train, df_test = df_train_reduced_mem, df_test_reduced_mem

In [None]:
# Percentage of missing values in each training column.
n_rows = df_train.shape[0]
null_percent = df_train.isna().sum() / n_rows * 100

# Names of the columns that are missing in more than 50% of the training rows.
mostly_null = null_percent > 50
cols_to_drop = np.array(null_percent[mostly_null].index)

cols_to_drop

In [None]:
# Drop the columns selected on the training frame from both frames.
# NOTE(review): drop raises KeyError if df_test lacks any of these labels —
# confirm the test columns are named identically to the training columns.
df_train = df_train.drop(columns=cols_to_drop)
df_test = df_test.drop(columns=cols_to_drop)
df_train.columns

In [None]:
# Impute each training column that still has missing values with its most
# frequent value (the first entry returned by mode()).
has_nulls = df_train.isna().any()
null_cols = df_train.columns[has_nulls].tolist()

for col in null_cols:
    column = df_train[col]
    print('data type of {} is {}'.format(col, str(column.dtype)))
    most_frequent = column.mode()[0]
    df_train[col] = column.replace(np.nan, most_frequent)
    print('Filled the null values of column {}'.format(col))
    print('--------------------------------------------')

In [None]:
# Impute each test column that still has missing values with its most frequent
# value. The mode is taken from the test frame itself, not from the training frame.
has_nulls = df_test.isna().any()
null_cols = df_test.columns[has_nulls].tolist()

for col in null_cols:
    column = df_test[col]
    print('data type of {} is {}'.format(col, str(column.dtype)))
    most_frequent = column.mode()[0]
    df_test[col] = column.replace(np.nan, most_frequent)
    print('Filled the null values of column {}'.format(col))
    print('--------------------------------------------')

In [None]:
# Preview the first rows of the training frame after the null-filling step.
df_train.head()

In [None]:
# Separate the target column (isFraud) from the feature columns.
y_train = df_train['isFraud']
x_train = df_train.drop(columns='isFraud')

In [None]:
# Partition the feature columns by dtype: `object` columns are treated as
# categorical, everything else as numerical.
# NOTE(review): columns with dtype `category` would land in the numerical group —
# confirm reduce_mem_usage does not convert strings to `category`.
cat_data = x_train.select_dtypes(include=['object'])
num_data = x_train.select_dtypes(exclude=['object'])

cat_cols, num_cols = cat_data.columns.values, num_data.columns.values

for label, cols in (('Categorical Columns : ', cat_cols), ('Numerical Columns : ', num_cols)):
    print(label, cols)

In [None]:
# Replace TransactionAmt with its natural logarithm in both feature frames.
for frame in (x_train, df_test):
    frame['TransactionAmt'] = frame['TransactionAmt'].apply(np.log)

In [None]:
# Integer-encode every categorical column. Each encoder is fitted on the train
# and test values together, so labels that occur only in the test set can still
# be transformed.
for col in tqdm(cat_cols):
    train_values = list(x_train[col].values)
    test_values = list(df_test[col].values)

    encoder = LabelEncoder()
    encoder.fit(train_values + test_values)

    x_train[col] = encoder.transform(train_values)
    df_test[col] = encoder.transform(test_values)

In [None]:
# Preview the first rows of the feature frame after label encoding.
x_train.head()

In [None]:
# Count plot of every label-encoded categorical feature except P_emaildomain.
# NOTE(review): P_emaildomain is presumably skipped because it has too many
# distinct values to plot readably — confirm.
plot_cols = [col for col in cat_cols if col != 'P_emaildomain']

# Size the grid from the number of plotted columns instead of a fixed 3x3 layout,
# which raised ValueError as soon as more than nine columns had to be drawn.
n_grid_cols = 3
n_grid_rows = max(1, (len(plot_cols) + n_grid_cols - 1) // n_grid_cols)

# 5 inches of height per row keeps the original 20x15 size for a three-row grid.
fig, axes = plt.subplots(
    n_grid_rows,
    n_grid_cols,
    figsize=(20, 5 * n_grid_rows),
    squeeze=False,  # always get a 2-D array of axes, even for a single row
)
flat_axes = axes.ravel()

for ax, col in zip(flat_axes, plot_cols):
    sns.countplot(x=x_train[col], palette='winter_r', ax=ax)

# Hide the grid cells that have no column to show.
for ax in flat_axes[len(plot_cols):]:
    ax.set_visible(False)

plt.show()

In [None]:
# Count plot of the target column (isFraud).
target_ax = sns.countplot(x=y_train, palette='gist_rainbow')
target_ax.set_title('Fraud or Not')
plt.show()

In [None]:
# Remove the TransactionDT column from both the training features and the test frame.
x_train = x_train.drop(columns='TransactionDT')
df_test = df_test.drop(columns='TransactionDT')

In [None]:
# No feature selection is applied: the final frames are the full feature frames.
# The column-subset variant (x_train[final_columns] / df_test[final_columns]) is
# disabled; `final_columns` is not defined in the visible cells.
x_train_final, test_final = x_train, df_test
print(test_final.shape)

In [None]:
import pickle

from sklearn import linear_model

# File the fitted model is serialized to.
filename = 'model.pkl'

# Fit a logistic regression with scikit-learn's default settings on all
# remaining feature columns.
logistic_model = linear_model.LogisticRegression()
logistic_model.fit(x_train_final, y_train)

# Persist the fitted model. The context manager closes the file handle even if
# pickling fails; the previous bare open() call never closed it.
with open(filename, 'wb') as model_file:
    pickle.dump(logistic_model, model_file)

In [None]:
# Print the number of columns of the test frame, then of the training frame.
for frame in (test_final, x_train_final):
    print(len(frame.columns))

In [None]:
# Predict a class label for every test row (`predict` returns labels, not probabilities).
# NOTE(review): if the submission is scored on a ranking metric such as AUC,
# `predict_proba(test_final)[:, 1]` may be what is wanted — confirm.
logistic_predictions = logistic_model.predict(test_final)

In [None]:
# Load the sample submission and overwrite its isFraud column with the predictions.
# NOTE(review): values are assigned by position, so this assumes test_final has
# the same row order as the sample submission — confirm.
sample_submission = pd.read_csv('dataset/sample_submission.csv')
sub = sample_submission.assign(isFraud=logistic_predictions)
sub.head()

In [None]:
# Write the submission to submission.csv without the DataFrame index column.
sub.to_csv('submission.csv', index=False)