# Notebook for building an XGBoost model for the Sendgrid sign up dataset
***
**Jake Mitchell Scott Schubert**

Initially using XGBoost

In [1]:
#To do anything
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

#Xgboost
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#random forrest
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

#One-hot encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder


In [2]:
def one_hot_boi(col, expect_type):
    values = df[col]
    # integer encode
    label_encoder = LabelEncoder()
    values = label_encoder.fit_transform(values.astype(expect_type))
    integer_encoded = label_encoder.fit_transform(values)
    # binary encode
    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
    return onehot_encoded


In [11]:
train_path = 'data/signup_train_data.csv'
test_path = 'data/signup_test_data.csv'


# Load the data into a DataFrame 
df = pd.read_csv(train_path, low_memory=False)
test_df = pd.read_csv(test_path, low_memory=False)

# Split into X and Y
dfLabels = df.pop("label")


#Feature engineering
'''
    Giving up on one-hot-encoding for now.  Not converting to 
    a friendly data format for our pandas dataFrame and I'm not 
    convinced it will assist very much and would rather invest time
    into feature engineering specific columns
state_ohe = one_hot_boi('state', 'str')
aoeu = pd.DataFrame(state_ohe.T)
df['state_ohe'] = 0
count = 0
for row in state_ohe:
    aoeu = pd.DataFrame(row)
    df['state_ohe'].append(aoeu)
    print(aoeu)
'''


#Feature Engineering

for dataframe in [df, test_df]:
    
    dataframe['employee_count_high'] = 0
    dataframe.loc[df['employee_count']  == '1 - 500', 'employee_count_high'] = 500
    dataframe.loc[df['employee_count']  == '501 - 1,000', 'employee_count_high'] = 1000
    dataframe.loc[df['employee_count']  == '1,001 - 5,000', 'employee_count_high'] = 5000
    dataframe.loc[df['employee_count']  == '5,000+', 'employee_count_high'] = 10000
    dataframe['volume_high'] = 0
    dataframe.loc[df['volume']  == '1 - 100,000', 'volume_high'] = 100000
    dataframe.loc[dataframe['volume']  == '100,001 - 2,500,000', 'volume_high'] = 2500000
    dataframe.loc[dataframe['volume']  == '2,500,001 - 10,000,000', 'volume_high'] = 10000000
    dataframe.loc[dataframe['volume']  == '10,000,000+', 'volume_high'] = 20000000
    dataframe.loc[dataframe['volume']  == '0 to 40k', 'volume_high'] = 40000
    dataframe.loc[dataframe['volume']  == '40k to 100k', 'volume_high'] = 100000
    dataframe['developer'] = 0
    dataframe.loc[dataframe['user_persona']  == 'Developer', 'developer'] = 1
    dataframe['ceo'] = 0
    dataframe.loc[dataframe['user_persona']  == 'CEO', 'ceo'] = 1
    dataframe['other'] = 0
    dataframe.loc[dataframe['user_persona']  == 'Other', 'other'] = 1
    dataframe['marketing'] = 0
    dataframe.loc[dataframe['user_persona']  == 'Marketing', 'marketing'] = 1
    dataframe['technical_support'] = 0
    dataframe.loc[dataframe['user_persona']  == 'Technical Support', 'technical_support'] = 1
    
    dataframe['name_notes_inv'] = 0
    dataframe.loc[dataframe['name_notes']  == 'Invalid', 'name_notes_inv'] = 1
    dataframe['name_notes_firstlast'] = 0
    dataframe.loc[dataframe['name_notes']  == 'First equals Last', 'name_notes_firstlast'] = 1
    dataframe['name_notes_inv_firstlast'] = 0
    dataframe.loc[dataframe['name_notes']  == 'Invalid|First equals Last', 'name_notes_inv_firstlast'] = 1
    dataframe['name_notes_sus_firstlast'] = 0
    dataframe.loc[dataframe['name_notes']  == 'Suspect|First equals Last', 'name_notes_sus_firstlast'] = 1
    dataframe['name_notes_sus'] = 0
    dataframe.loc[dataframe['name_notes']  == 'Suspect', 'name_notes_sus'] = 1
    dataframe['name_notes_inv_sus_firstlast'] = 0
    dataframe.loc[dataframe['name_notes']  == 'Invalid|Suspect|First equals Last', 'name_notes_inv_sus_firstlast'] = 1
    dataframe['name_notes_inv_sus'] = 0
    dataframe.loc[dataframe['name_notes']  == 'Invalid|Suspect', 'name_notes_inv_sus'] = 1
    dataframe['name_notes_tagbad'] = 0
    dataframe.loc[dataframe['name_notes']  == 'Tagged Bad', 'name_notes_tagbad'] = 1


#alternate attepmt
#UPDATE:  Works as intended however cannot be read in during learning algo
# They need "int, float, or bool" therefore this is a bus
'''name_note_ohe = one_hot_boi('name_notes', 'str')
nn_df = pd.DataFrame(index=range(0,len(df)),columns=['name_note_ohe'])
for i in range(0,len(df)):
    nn_df['name_note_ohe'][i] = name_note_ohe[i]
    if i % 1000 == 1:
        print(i)
    
df['name_note_ohe'] = nn_df['name_note_ohe']
'''

    


#Temporarily parse non numerical data
frames_to_remove = ['risk','zip', 'city', 'website', 'state', 'country', 'registration_ip', 'company','multifactor_country_code','created_at','lead_source','marketing_channel','volume','user_persona','initial_package','employee_count','geolocation_notes','name_notes','ip_notes','community_notes','email_notes','activity_notes','fingerprint_notes']
df = df.drop(frames_to_remove, axis = 1)
test_df = test_df.drop(frames_to_remove, axis = 1)

print(df.head())


   id  banned_ip  ip_count  is_authy_verified  banned_email  is_transactional  \
0   0          6         8                  0             1                 0   
1   1          5         5                  0             1                 1   
2   2          0         1                  0             0                 1   
3   3          4         4                  0             2                 0   
4   4          0         1                  0             0                 1   

   is_marketing  is_behavioral  is_oem  geolocation_risk        ...          \
0             0              0       0                 0        ...           
1             0              0       0                 0        ...           
2             0              0       0               -16        ...           
3             1              0       0               -14        ...           
4             0              0       0               -16        ...           

   marketing  technical_support  name_

In [None]:
def get_label(X):
    '''Takes a row of X inputs and returns the predicted label'''
    if X["banned_ip"]: #this alone is ~92%
        return 1
    return 0

def bannedIP(seed_in, TO_TEST):
    # split data into train and test sets
    seed = seed_in
    test_size = 0.5
    X_train, X_test, y_train, y_test = train_test_split(df, dfLabels, test_size=test_size, random_state=seed)

    # fit model no training data

    # make predictions for test data
    y_pred = [get_label(x) for i,x in X_test.iterrows()]
    predictions = [round(value) for value in y_pred]

    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    print("xgb - Seed: %i    -  Accuracy: %.2f%%" % (seed,accuracy * 100.0))

    if TO_TEST:
        #make predictions for train data
        y_pred = [get_label(x) for i,x in test_df.iterrows()]
        predictions = [round(value) for value in y_pred]

        d = {'id':test_df['id'], 'label':predictions}
        preds = pd.DataFrame(data=d)

        preds.to_csv('predictions.csv', index=False)


In [None]:
#XGBoost approach 

def xgb(seed_in, TO_TEST):
    # split data into train and test sets
    seed = seed_in
    test_size = 0.7
    X_train, X_test, y_train, y_test = train_test_split(df, dfLabels, test_size=test_size, random_state=seed)

    # fit model no training data
    model = XGBClassifier(depth = 4, eta = 0.01, n_estimators = 120)
    model.fit(X_train, y_train)
    
    # make predictions for test data
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]

    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    print("xgb - Seed: %i    -  Accuracy: %.2f%%" % (seed,accuracy * 100.0))
    
    if TO_TEST:
        #make predictions for train data
        y_pred = model.predict(test_df)
        predictions = [round(value) for value in y_pred]

        d = {'id':test_df['id'], 'label':predictions}
        preds = pd.DataFrame(data=d)

        preds.to_csv('predictions.csv', index=False)
        return model.predict_proba(test_df)
        
    return model.predict_proba(X_test)

In [None]:
x = xgb(1, False)

In [None]:
#Random forrest approach 

def rf(seed, TO_TEST):
    seed = seed
    df2 = df
    df2 = df2.fillna(0)
    X_train, X_test, y_train, y_test = train_test_split(df2, dfLabels, test_size=0.7, random_state=seed)

    clf = RandomForestClassifier(max_depth=2, random_state=0)
    clf.fit(X_train, y_train)
    RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                max_depth=10, max_features='auto', max_leaf_nodes=None,
                min_impurity_decrease=0.0, min_impurity_split=None,
                min_samples_leaf=1, min_samples_split=2,
                min_weight_fraction_leaf=0.0, n_estimators=27, n_jobs=1,
                oob_score=False, random_state=0, verbose=0, warm_start=False)

    # Use the forest's predict method on the test data
    y_pred = clf.predict(X_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    print("RF - Seed: %i    -  Accuracy: %.2f%%" % (seed,accuracy * 100.0))
    
    if TO_TEST:
        test_df2 = test_df
        test_df2 = test_df2.fillna(0)
        
        #make predictions for train data
        y_pred2 = clf.predict(test_df2)
        predictions = [round(value) for value in y_pred2]

        d = {'id':test_df['id'], 'label':predictions}
        preds = pd.DataFrame(data=d)

        preds.to_csv('predictions.csv', index=False)
        return clf.predict_proba(test_df2)
    
    return (clf.predict_proba(X_test), y_test)


In [None]:
x = rf(1,False)

In [None]:
#Combine the prediction values of random forrest and xgboost 


def combine(seed, TO_TEST, rf_weight, xgb_weight):           
    if TO_TEST:
        x = rf(seed, TO_TEST)
    else:
        #Acquire training sample
        (x,y_test) = rf(seed, TO_TEST)
    rand_score = [b for a,b in x]
    
    xgb_score = xgb(seed, TO_TEST)
    xgb_score = [b for a,b in xgb_score]   

    predictions = []
    for i in range(0,len(rand_score)):
        val = rand_score[i]*rf_weight + xgb_score[i]*xgb_weight
        predictions.append(val)
        
    predictions = [int(round(value)) for value in predictions]
    
    if TO_TEST:
        d = {'id':test_df['id'], 'label':predictions}
        preds = pd.DataFrame(data=d)
        preds.to_csv('predictions.csv', index=False)
        
    else:
        accuracy = accuracy_score(y_test, predictions)
        print("Combine - Seed: %i    -  Accuracy: %.2f%%" % (seed,accuracy * 100.0))       
    
    
#seed, TO_TEST, rand forest weighting, xgboost weighting
#combine(1, True, 0.3, 0.7)



In [None]:
#UPDATE:  This was not helpful at all.  My approach of using two features (random forrest and xgb) led 
# to two different individual accuracies from these algorithms.  The combined 'prediciton' of the two was a linear
#relationship between the lower score and its algorithm's weighting and likewise for the higher scored algorithm
# ergo, this function is useless

def combine_findWeight(seed):           
    #Acquire training sample
    (x,y_test) = rf(seed, False)
    rand_score = [b for a,b in x]
    
    xgb_score = xgb(seed, False)
    xgb_score = [b for a,b in xgb_score]   

    print("SEED ------------ %i" % (seed))
    for rf_weight in range(1,100):
        rf_weight = float(rf_weight/100)
        xgb_weight = 1-rf_weight
        print(rf_weight)
        print(xgb_weight)
        predictions = []
        for i in range(0,len(rand_score)):
            val = rand_score[i]*rf_weight + xgb_score[i]*xgb_weight
            predictions.append(val)
        predictions = [int(round(value)) for value in predictions]
        accuracy = accuracy_score(y_test, predictions)
        print("rf . xgb = %f . %f  -  Accuracy: %.2f%%" % (rf_weight, xgb_weight, accuracy * 100.0))  
        print()

    
#seed, TO_TEST, rand forest weighting, xgboost weighting
combine_findWeight(1)
#for i in range(0,20):
    #combine_findWeight(i)



In [4]:
#XGBoost approach 

def xgbParam(seed_in, TO_TEST,depth=4,eta=0.01,n=120):
    # split data into train and test sets
    seed = seed_in
    test_size = 0.7
    X_train, X_test, y_train, y_test = train_test_split(df, dfLabels, test_size=test_size, random_state=seed)

    # fit model no training data
    model = XGBClassifier(depth = depth, eta = eta, n_estimators = n)
    model.fit(X_train, y_train)
    
    # make predictions for test data
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]

    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    print("xgb - Seed: %i    -  Accuracy: %.2f%%" % (seed,accuracy * 100.0))
    
    if TO_TEST:
        #make predictions for train data
        y_pred = model.predict(test_df)
        predictions = [round(value) for value in y_pred]

        d = {'id':test_df['id'], 'label':predictions}
        preds = pd.DataFrame(data=d)

        preds.to_csv('predictions.csv', index=False)
        return model.predict_proba(test_df)
        
    return model.predict_proba(X_test)

In [12]:
xgbParam(1,False,depth=7,eta=0.001,n=1000)

  if diff:


xgb - Seed: 1    -  Accuracy: 97.00%


array([[2.6226044e-06, 9.9999738e-01],
       [5.8851838e-03, 9.9411482e-01],
       [4.3940824e-01, 5.6059176e-01],
       ...,
       [7.4755549e-03, 9.9252445e-01],
       [4.0131211e-03, 9.9598688e-01],
       [5.3048134e-05, 9.9994695e-01]], dtype=float32)

In [None]:
96.99 - all
96.28 - remove banned_ip
97.00 - remove risk
- remove both


1- 2000
2- 1400
3- 900
4- 450