In [34]:


import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as sci
import sys, os, warnings, random, datetime
from sys import platform

#
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from tqdm import tqdm

#Gradient boosting models
import xgboost as xgb
import lightgbm as lgb

In [27]:
# ANSI escape sequences written to stdout to colour/style console messages.
# Each one stays in effect until RESET is written.
RED   = "\033[1;31m"    # bold red
BLUE  = "\033[1;34m"    # bold blue
RESET = "\033[0;0m"     # back to the terminal's default attributes
BOLD    = "\033[;1m"    # bold
REVERSE = "\033[;7m"    # reverse video (swap foreground/background)


def _load_merged_frames(data_path):
    """Read the four raw competition CSVs under ``data_path`` and left-join identity onto transaction."""
    train_ident = pd.read_csv(os.path.join(data_path, 'train_identity.csv'))
    test_ident = pd.read_csv(os.path.join(data_path, 'test_identity.csv'))
    train_transaction = pd.read_csv(os.path.join(data_path, 'train_transaction.csv'))
    test_transaction = pd.read_csv(os.path.join(data_path, 'test_transaction.csv'))
    #Merging our 4 data sets into 2:
    train = train_transaction.merge(train_ident, on='TransactionID', how='left')
    test = test_transaction.merge(test_ident, on='TransactionID', how='left')
    return train, test


def gen_data():
    """Load the train/test frames (and their 10% samples) for the current machine.

    The machine is recognised from ``sys.platform`` and the login name:

    * Windows / ``zaine``  - full raw CSVs are merged; samples come from the repo.
    * Windows / ``Avery``  - only the sampled CSVs are used.
    * macOS                - only the sampled CSVs are used.
    * anything else        - the user is asked whether to regenerate the samples
      from raw CSVs; ``y`` regenerates, ``n`` gives up, any other answer exits.

    Returns
    -------
    tuple or None
        ``(train, test, train_sample, test_sample)`` as DataFrames, or ``None``
        when the user answers ``n`` at the regeneration prompt.

    Raises
    ------
    SystemExit
        If the regeneration prompt is answered with anything but ``y``/``n``.
    FileNotFoundError
        Propagated from ``pd.read_csv`` when an expected CSV is absent.
    """
    # os.getlogin() raises OSError when the process has no controlling terminal;
    # treat that as "unknown user" so the fallback branch can run.
    try:
        user = os.getlogin()
    except OSError:
        user = None

    #setting path
    # Both names start as None so an unrecognised machine reaches the fallback
    # instead of failing on an unbound local.
    data_path = None
    ScaleData = None
    if platform == "win32" and user == 'zaine':
        data_path = 'C:/Users/zaine/OneDrive/Desktop/School/STAT/Reg_Proj/ieee-fraud-detection/'
        ScaleData = 0
    elif platform == "win32" and user == 'Avery':
        data_path = "C:/Users/Avery/Desktop/Applied_Regression/Datasets for reg/"
        ScaleData = 1
    elif platform == "darwin":
        data_path = "/Users/zain/Desktop/School/Reg_Proj/ieee-fraud-detection/"
        ScaleData = 2

    # Sampled CSVs live in <parent of cwd>/dsProject; os.path.join picks the
    # right separator and avoids the invalid "\d" escape of a hand-built path.
    cur_working = os.path.dirname(os.getcwd())
    sample_dir = os.path.join(cur_working, "dsProject")
    fptrain = os.path.join(sample_dir, "train_sample.csv")
    fptest = os.path.join(sample_dir, "test_sample.csv")

    #a bunch of logic related to what system we are on
    if ScaleData == 0:
        train, test = _load_merged_frames(data_path)
        train_sample = pd.read_csv(fptrain)
        test_sample = pd.read_csv(fptest)
    elif ScaleData == 1:
        train = pd.read_csv(os.path.join(data_path, "train_sample.csv")) #make this exact path to the csv
        test = pd.read_csv(os.path.join(data_path, "test_sample.csv")) #make this exact path to the csv
        train_sample = pd.read_csv(fptrain)
        test_sample = pd.read_csv(fptest)
    elif ScaleData == 2:
        train = pd.read_csv(os.path.join(data_path, "train_sample.csv"))
        test = pd.read_csv(os.path.join(data_path, "test_sample.csv"))
        train_sample = pd.read_csv(os.path.join(data_path, "train_sample.csv"))
        # Previously this re-read train_sample.csv, so the "test" sample was train data.
        test_sample = pd.read_csv(os.path.join(data_path, "test_sample.csv"))
    else:
        sys.stdout.write(RED)
        error_code= input("Error: Data path not found do you want to regenearte the train and test sets in the current working directory?[y/n]: ")
        if error_code == 'y':
            print("Warning: Data path not found regenerating data in current script working directory!")
            sys.stdout.write(RESET)
            print("\n==============================================================")
            # NOTE(review): no data path is known on this machine, so the raw CSVs
            # are looked up in the current working directory - confirm that is
            # where they are expected to be.
            data_path = os.getcwd()
            print("Merging Data")
            train, test = _load_merged_frames(data_path)
            print("Sampling Data")
            train_sample = train.sample(frac = 0.1, random_state = 120)
            test_sample = test.sample(frac = 0.1, random_state=120)
            print("Exporting Data To Git Path:"+ str(cur_working))
            print("Generating csvs @:\n" + fptrain+"\n"+ fptest )
            # The export directory may not exist on a fresh checkout.
            os.makedirs(sample_dir, exist_ok=True)
            # NOTE(review): the index is written too, so reading the file back
            # yields an extra "Unnamed: 0" column; kept as-is because the other
            # branches read files produced this way.
            train_sample.to_csv(fptrain)
            test_sample.to_csv(fptest)
            print("Setting Pandas Objects at above csv directories")
            train = pd.read_csv(fptrain)
            test = pd.read_csv(fptest)
            sys.stdout.write(BLUE)
            print("\n Process Complete")
            sys.stdout.write(RESET)
            print("==============================================================")
        elif error_code == 'n':
            print("Fatal Error Rerun Chunk")
            # Do not leave the console stuck in red.
            sys.stdout.write(RESET)
            return None
        else:
            sys.stdout.write(RESET)
            sys.exit(0)

    # Hand the frames back; previously they were discarded when the function ended.
    return train, test, train_sample, test_sample

def set_seed(s=0):
    """Seed Python's ``random``, NumPy's global RNG and ``PYTHONHASHSEED`` with ``s``.

    Parameters
    ----------
    s : int, default 0
        Seed value; its string form is stored in ``os.environ['PYTHONHASHSEED']``.
    """
    os.environ['PYTHONHASHSEED'] = str(s)
    # Same seed for both global generators.
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(s)

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that holds their range.

    Integer columns are tried against int8/int16/int32/int64 and float columns
    against float16/float32 (falling back to float64), using each column's
    min and max. Bounds are inclusive, so a value sitting exactly on a dtype's
    limit (e.g. 127 for int8) still fits that dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place and also returned.
    verbose : bool, default True
        Print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with downcast columns.

    Notes
    -----
    Only the value *range* is checked. Casting to float16/float32 can round
    values that need more precision than the target dtype carries.
    """
    # int8 is not listed: such columns are already as small as they can get.
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Smallest-first; stop at the first type whose range covers the data.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf (all comparisons False).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage: a frame can report zero bytes before conversion.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


In [48]:
#import data

set_seed(s=1)

# Dataset directory, defined once. Set the IEEE_FRAUD_DATA_DIR environment
# variable to point elsewhere; the default is the original author's location.
DATA_DIR = os.environ.get(
    "IEEE_FRAUD_DATA_DIR",
    "C:\\Users\\zaine\\OneDrive\\Desktop\\School\\STAT\\Reg_Proj\\ieee-fraud-detection",
)

train_transaction = pd.read_csv(os.path.join(DATA_DIR, "train_transaction.csv"), index_col='TransactionID')
test_transaction = pd.read_csv(os.path.join(DATA_DIR, "test_transaction.csv"), index_col='TransactionID')

train_identity = pd.read_csv(os.path.join(DATA_DIR, "train_identity.csv"), index_col='TransactionID')
test_identity = pd.read_csv(os.path.join(DATA_DIR, "test_identity.csv"), index_col='TransactionID')

# Left join on the TransactionID index: every transaction is kept, identity
# columns are NaN where no identity row exists.
train = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)
test = test_transaction.merge(test_identity, how='left', left_index=True, right_index=True)

print("Train Shape:", train.shape)
print("Test Shape:", test.shape)


y_train = train['isFraud'].copy()
# NOTE(review): this is a second copy of the TRAIN labels. `test` is used below
# without an 'isFraud' column, so these labels do not line up with predictions
# made on F_Test and cannot be used to score them.
y_test = train['isFraud'].copy()

#Drop target from train set and fill in Null values
F_Train = train.drop('isFraud', axis =1)
F_Test  = test.copy()

# -1 is the missing-value sentinel; the classifier is configured with missing=-1.
# Text columns receive the integer -1 as well, so they hold mixed types afterwards.
F_Train = F_Train.fillna(-1)
F_Test = F_Test.fillna(-1)
print("NA Varaibles Filled")

# Identity columns id 01..38 are removed from both frames. The train frame
# spells them with an underscore, the test frame with a hyphen.
train_id_cols = ['id_{:02d}'.format(n) for n in range(1, 39)]
test_id_cols = ['id-{:02d}'.format(n) for n in range(1, 39)]
F_Train = F_Train.drop(columns=train_id_cols)
F_Test = F_Test.drop(columns=test_id_cols)


print("Dropped id cols")
F_Train = reduce_mem_usage(F_Train)
F_Test = reduce_mem_usage(F_Test)

#remove those pesky big sets from cache
del train_transaction, train_identity, test_transaction, test_identity
del train, test
print("Intial df's removed from cache")

Train Shape: (590540, 433)
Test Shape: (506691, 432)
NA Varaibles Filled
Dropped id cols
Mem. usage decreased to 569.17 Mb (68.3% reduction)
Mem. usage decreased to 498.38 Mb (67.7% reduction)
Intial df's removed from cache


In [29]:

# Label-encode every column that is text (object dtype) in either frame.
# One encoder per column is fitted on the train and test values together, so
# the same category gets the same integer code in both frames.
for i in list(F_Train.columns):
    # Check membership before indexing F_Test. A missing column used to end in
    # either a silent sys.exit(0) or a bare KeyError depending on the train
    # dtype; now it always fails loudly and names the column.
    if i not in F_Test.columns:
        raise KeyError("Column %r is present in F_Train but missing from F_Test" % (i,))
    if F_Train[i].dtype =='object' or F_Test[i].dtype =='object':
        lab = preprocessing.LabelEncoder()
        lab.fit(list(F_Train[i].values) + list(F_Test[i].values))
        F_Train[i] = lab.transform(list(F_Train[i].values))
        F_Test[i] = lab.transform(list(F_Test[i].values))

In [30]:



# XGBoost classifier configuration, then a fit on the full training frame.
gBoostClassifier = xgb.XGBClassifier(
    n_estimators=500,        # number of boosting rounds
    max_depth=9,             # maximum depth of each tree
    learning_rate=0.5,       # step-size shrinkage applied per round
    subsample=0.9,           # fraction of rows sampled per tree
    colsample_bytree=0.9,    # fraction of columns sampled per tree
    missing=-1,              # value treated as missing (NaNs were filled with -1)
    random_state=1,
)

# fit() returns the estimator itself, so gradModel is the fitted gBoostClassifier.
gradModel = gBoostClassifier.fit(F_Train, y_train)



In [57]:
# Class predictions for the test frame. These cannot be scored here: F_Test
# has no label column, and y_test is a copy of the train labels
# (590540 rows vs 506691 predictions), so accuracy_score(y_test, preds)
# fails on inconsistent sample counts.
preds =  gradModel.predict(F_Test)
print(preds)

# Score on the labelled training frame instead. This is an in-sample figure
# (the model was fitted on these exact rows), so read it as an optimistic
# sanity check; a held-out split of F_Train is needed for a generalisation estimate.
train_preds = gradModel.predict(F_Train)
accuracy = accuracy_score(y_train, train_preds)

print("Training accuracy: %.2f%%" % (accuracy * 100.0))

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:ProductCD: object, card4: object, card6: object, P_emaildomain: object, R_emaildomain: object, M1: object, M2: object, M3: object, M4: object, M5: object, M6: object, M7: object, M8: object, M9: object, DeviceType: object, DeviceInfo: object