# Notebook for building XGBoost and random forest models for the SendGrid sign-up dataset
***
**Jake Mitchell, Scott Schubert**

Initially using ______

In [57]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification


In [68]:
# Locations of the labelled training CSV and the CSV we must predict on.
train_path = 'data/signup_train_data.csv'
test_path = 'data/signup_test_data.csv'

# Columns excluded from modelling for now; the original note describes them
# as non-numerical data that is only being set aside temporarily.
frames_to_remove = [
    'zip', 'city', 'website', 'state', 'country', 'registration_ip',
    'company', 'multifactor_country_code', 'created_at', 'lead_source',
    'marketing_channel', 'volume', 'user_persona', 'initial_package',
    'employee_count', 'geolocation_notes', 'name_notes', 'ip_notes',
    'community_notes', 'email_notes', 'activity_notes', 'fingerprint_notes',
]

# Read both files into DataFrames.
df = pd.read_csv(train_path, low_memory=False)
test_df = pd.read_csv(test_path, low_memory=False)

# Separate the target: `pop` removes "label" from df and returns it.
dfLabels = df.pop("label")

# Apply the same column filter to both frames so their features line up.
df = df.drop(columns=frames_to_remove)
test_df = test_df.drop(columns=frames_to_remove)

df.head()


Unnamed: 0,id,banned_ip,ip_count,is_authy_verified,banned_email,is_transactional,is_marketing,is_behavioral,is_oem,risk,...,ip_cluster,email_cluster,mfa_required,mfa_completed,whitelabel_required,whitelabel_completed,payment_required,payment_completed,profile_completed,email_completed
0,0,6,8,0,1,0,0,0,0,-100,...,False,False,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,1,5,5,0,1,1,0,0,0,-90,...,False,False,,,,,,,,
2,2,0,1,0,0,1,0,0,0,-44,...,False,False,,,,,,,,
3,3,4,4,0,2,0,1,0,0,-72,...,False,False,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,4,0,1,0,0,1,0,0,0,-34,...,False,False,,,,,,,,


In [65]:
def xgb_basic(seed_in, TO_TEST):
    """Fit a default XGBClassifier on half of the training data and print its accuracy.

    Uses the notebook-level ``df`` (features), ``dfLabels`` (targets) and
    ``test_df`` (rows to predict for the submission file).

    Args:
        seed_in: ``random_state`` handed to ``train_test_split``.
        TO_TEST: when truthy, additionally predict a label for every row of
            ``test_df`` and write ``id``/``label`` pairs to 'predictions.csv'.
    """
    # Hold out 50% of the labelled rows for validation.
    features_fit, features_val, labels_fit, labels_val = train_test_split(
        df, dfLabels, test_size=0.5, random_state=seed_in
    )

    # Train an XGBoost classifier with library defaults on the training half.
    booster = XGBClassifier()
    booster.fit(features_fit, labels_fit)

    # Score the model on the held-out half.
    val_labels = list(map(round, booster.predict(features_val)))
    val_accuracy = accuracy_score(labels_val, val_labels)
    print("xgb - Seed: %i    -  Accuracy: %.2f%%" % (seed_in, val_accuracy * 100.0))

    if not TO_TEST:
        return

    # Predict the unlabelled submission rows and save them next to their ids.
    submission_labels = list(map(round, booster.predict(test_df)))
    submission = pd.DataFrame(data={'id': test_df['id'], 'label': submission_labels})
    submission.to_csv('predictions.csv', index=False)

In [70]:
# Random forest approach

def rf(seed, TO_TEST):
    """Fit a shallow random forest on 75% of the training data and print its accuracy.

    Uses the notebook-level ``df`` (features), ``dfLabels`` (targets) and
    ``test_df`` (rows to predict for the submission file). Missing values in
    both feature frames are replaced with 0 before they reach the forest.

    Args:
        seed: ``random_state`` handed to ``train_test_split``.
        TO_TEST: when truthy, additionally predict a label for every row of
            ``test_df`` and write ``id``/``label`` pairs to 'predictions.csv'.
    """
    # fillna returns a new frame, so the shared `df` is left untouched.
    features = df.fillna(0)
    X_train, X_test, y_train, y_test = train_test_split(
        features, dfLabels, test_size=0.25, random_state=seed
    )

    # NOTE: the forest's own random_state is fixed at 0, so `seed` only
    # changes which rows land in the train/validation split.
    clf = RandomForestClassifier(max_depth=2, random_state=0)
    clf.fit(X_train, y_train)

    # Score the forest on the held-out quarter.
    # round() is kept from the original; presumably a no-op for integer labels.
    predictions = [round(value) for value in clf.predict(X_test)]
    accuracy = accuracy_score(y_test, predictions)
    print("RF - Seed: %i    -  Accuracy: %.2f%%" % (seed,accuracy * 100.0))

    if TO_TEST:
        # Predict the unlabelled submission rows (same NaN handling as above)
        # and save them next to their ids.
        submission_features = test_df.fillna(0)
        predictions = [round(value) for value in clf.predict(submission_features)]

        preds = pd.DataFrame(data={'id': test_df['id'], 'label': predictions})
        preds.to_csv('predictions.csv', index=False)

In [71]:
# Alternative runs kept for reference (disabled):
#   xgb_basic(31, False)
#   for x in range(1, 10):
#       xgb_basic(x, False)
#   for x in range(1, 10):
#       rf(x, False)

# Fit the random forest with split seed 1 and write predictions.csv.
rf(seed=1, TO_TEST=True)

RF - Seed: 1    -  Accuracy: 94.46%
