In [1]:
# import libraries
import pandas as pd
import numpy as np
import os
import re

import matplotlib.pyplot as plt

plt.style.use('fivethirtyeight')

# Feature Engineering
from sklearn.model_selection import train_test_split

# soundtrack - 
##############
# Inspiration (Theme) : Relax, inc.
##############
# Snakes & Ladders - Single - Mzuki & Himalia
# Situations - Escape This - Himalia
# Transitions - World Alight (feat. Rhea) - Himalia
# Kingdom (feat. Sakima) [Barefoot Remix] = Himalia
# Situations - Do You Wanna (feat. Laurelle) - Himalia
##############

# Where to save the figures (I adapted this from Aurelien Geron's code will fill in rest later)
PROJECT_ROOT_DIR = "."
PROJECT_ID = "relax"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", PROJECT_ID)
#MODEL_PATH = os.path.join(PROJECT_ROOT_DIR, 'models', PROJECT_ID)
#$os.makedirs(MODEL_PATH, exist_ok=True)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=72):
    """ 
    resolution quality
    300 high 
    150 medium
    72 low
    """
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [2]:
# read in table, set encoding to latin-1 
# see https://stackoverflow.com/questions/5552555/unicodedecodeerror-invalid-continuation-byte 

users = pd.read_csv('./takehome_users.csv', encoding='latin-1')
user_engagement = pd.read_csv('./takehome_user_engagement.csv')

table 1 "takehome_users" - 12,000 users who signed up for the product in the last two years:
    
    ● name: the user's name
    ● object_id: the user's id
    ● email: email address
    ● creation_source: how their account was created. This takes on one of 5 values:
        ○ PERSONAL_PROJECTS: invited to join another user's personal workspace
        ○ GUEST_INVITE: invited to an organization as a guest (limited permissions)
        ○ ORG_INVITE: invited to an organization (as a full member)
        ○ SIGNUP: signed up via the website
        ○ SIGNUP_GOOGLE_AUTH: signed up using Google Authentication (using a Google email account for their login id)
    ● creation_time: when they created their account
    ● last_session_creation_time: unix timestamp of last login
    ● opted_in_to_mailing_list: whether they have opted into receiving marketing emails
    ● enabled_for_marketing_drip: whether they are on the regular marketing email drip
    ● org_id: the organization (group of users) they belong to
    ● invited_by_user_id: which user invited them to join (if applicable).
    ● is_adopted: target feature created by calling function is_adopted() on object_id (done here)
    
table 2 A usage summary table ("takehome_user_engagement") that has a row for each day that a user logged into the product.

Task Defining an "adopted user" as a user who has logged into the product on three separate days in at least one seven-day period, identify which factors predict future user adoption.

### Cleanup/Data Normalization

In [3]:
# User engagement first

user_engagement['time_stamp'] = pd.to_datetime(user_engagement['time_stamp'], errors='raise', infer_datetime_format=True)
user_engagement.dtypes

time_stamp    datetime64[ns]
user_id                int64
visited                int64
dtype: object

In [4]:
user_engagement.isna().sum()

time_stamp    0
user_id       0
visited       0
dtype: int64

In [5]:
# save oldest date for login activity
today = user_engagement['time_stamp'].max()

In [6]:
user_engagement.shape

(207917, 3)

In [7]:
user_engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [8]:
# number of user ids
user_engagement['user_id'].nunique()

8823

In [9]:
# Users next

users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


In [10]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [11]:

users['creation_source'][users['last_session_creation_time'].isna()].value_counts()

PERSONAL_PROJECTS    1347
ORG_INVITE           1066
GUEST_INVITE          575
SIGNUP                189
Name: creation_source, dtype: int64

In [12]:
users['creation_source'].value_counts()

ORG_INVITE            4254
GUEST_INVITE          2163
PERSONAL_PROJECTS     2111
SIGNUP                2087
SIGNUP_GOOGLE_AUTH    1385
Name: creation_source, dtype: int64

In [13]:
# Cool, this was by accident but I'm resusing this for looking at percentages of missing values in a feature
users.isna().sum() / len(users)

object_id                     0.00000
creation_time                 0.00000
name                          0.00000
email                         0.00000
creation_source               0.00000
last_session_creation_time    0.26475
opted_in_to_mailing_list      0.00000
enabled_for_marketing_drip    0.00000
org_id                        0.00000
invited_by_user_id            0.46525
dtype: float64

Cleaning 

In [14]:
users['creation_time'] = pd.to_datetime(users['creation_time'], errors='raise', infer_datetime_format=True)
users['last_session_creation_time'] = pd.to_datetime(users['last_session_creation_time'], errors='raise', unit='s')
users['invited_by_user_id'] = users['invited_by_user_id'].fillna(value=0).astype(int)

In [15]:
users

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240
...,...,...,...,...,...,...,...,...,...,...
11995,11996,2013-09-06 06:14:15,Meier Sophia,SophiaMeier@gustr.com,ORG_INVITE,2013-09-06 06:14:15,0,0,89,8263
11996,11997,2013-01-10 18:28:37,Fisher Amelie,AmelieFisher@gmail.com,SIGNUP_GOOGLE_AUTH,2013-01-15 18:28:37,0,0,200,0
11997,11998,2014-04-27 12:45:16,Haynes Jake,JakeHaynes@cuvox.de,GUEST_INVITE,2014-04-27 12:45:16,1,1,83,8074
11998,11999,2012-05-31 11:55:59,Faber Annett,mhaerzxp@iuxiw.com,PERSONAL_PROJECTS,2012-06-02 11:55:59,0,0,6,0


In [16]:
# drop na for now
# users_na = users.dropna()

# Clean up columns
#users_na['creation_time'] = pd.to_datetime(users_na['creation_time'], errors='raise', infer_datetime_format=True)
#users_na['last_session_creation_time'] = pd.to_datetime(users_na['last_session_creation_time'], errors='raise', unit='s')
#users_na['invited_by_user_id'] = users_na['invited_by_user_id'].astype(int)

In [17]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   object_id                   12000 non-null  int64         
 1   creation_time               12000 non-null  datetime64[ns]
 2   name                        12000 non-null  object        
 3   email                       12000 non-null  object        
 4   creation_source             12000 non-null  object        
 5   last_session_creation_time  8823 non-null   datetime64[ns]
 6   opted_in_to_mailing_list    12000 non-null  int64         
 7   enabled_for_marketing_drip  12000 non-null  int64         
 8   org_id                      12000 non-null  int64         
 9   invited_by_user_id          12000 non-null  int64         
dtypes: datetime64[ns](2), int64(5), object(3)
memory usage: 937.6+ KB


## Target Feature 
Defining an "adopted user" as a user who has logged into the product on three separate days in at least one seven-day period, identify which factors predict future user adoption.

In [18]:
def adopted_user(user_id):
    
    """
    accepts user_id as an argument of type int. 
    returns true if in the span of the user's history
    if visits <= 3 for a period in the weekly frequency, it checks the span of that week in daily frequencies to check it each login occurs on an individual day in that week
    if all is true then user is given status of "adopted user" bool:True
    if visits <= 3, but there is more than one visit on a given day, move on to next date.
    If nothing found return bool False
    """
    # get user id history
    user_engagement['time_stamp'] = pd.to_datetime(user_engagement['time_stamp'], errors='raise', infer_datetime_format=True)
    history = user_engagement[user_engagement['user_id'] == user_id]
    #assert history['time_stamp'] != 'Inde
    # for debugging
    #history
    
    # set bool check
    is_good = True
    
    # downsample
    weekly = history.set_index('time_stamp').resample(rule='w').sum()
    daily = history.set_index('time_stamp').resample(rule='d').sum()
    
    # For each week over the number of total weeks
    for i in range(len(weekly)):
        
        # if there are three or more visits in a given week
        if weekly.iloc[i][1] >= 3:
        
        # Get the start day and end day of that week
            good_week_start = weekly.iloc[i].name
            good_week_end = good_week_start + pd.Timedelta('7 days')
        
        # Get the daily sample of the span of that week
            week = daily.loc[good_week_start:good_week_end]
        
        # For each day in that given week
            for j in range(len(week)):
            
            # if there is a day that has more than 1 visit (i.e. not distinct days)
            # CHECK THIS!! SOMETING FEELS WRONG!!
                if week.iloc[j][1] > 1:
                
                # Set off a flag
                    is_good = False
                # if that week met the criterion, return true
            if is_good == True:
                return True
        
            # Else reset the flag
            else:
                is_good = True
# at the end there was nothing
    return False

In [19]:
# Important!
users.set_index('object_id', inplace=True)

In [20]:
%%time
# Generate Target
# Takes ~ two minutes 
users['is_adopted'] = [adopted_user(user) for user in users.index]

CPU times: user 4min 43s, sys: 4.69 s, total: 4min 48s
Wall time: 5min 5s


In [21]:
# Save my work
#pd.to_pickle(users, './users_target.pkl')

## Set Aside Test Data
Need to check class imbalance to determine how to split the data

In [22]:
dataset = users.copy(deep=True)
X, y = dataset.drop('is_adopted', axis=1), dataset['is_adopted']

In [23]:
# 80/20 split leaves 289 of the minor class for the test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.8, stratify=y)
y_test.value_counts()

False    2111
True      289
Name: is_adopted, dtype: int64

In [24]:
# Save Test Data And No Peeking!
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

#pd.to_pickle(train, './train.pkl')
#pd.to_pickle(test, './test.pkl')

## Feature Engineering

In [25]:
# Break down strings

def decompose_email(address):
    user, domain = address.split('@')
    domain_name, ext = domain.split('.')
    return user, domain_name, ext

# Check is user_id has any red flags
def flag_user(user_id):
    flagged_id = train.loc[user_id]
    full_name = flagged_id['name'].split()
    username = flagged_id['username']
    
    for name in full_name:
        #print(name)
        if re.search(name, username):
            return 0
    return 1

def flag_domain(user_id):
    flagged_id =  train.loc[user_id]
    domain = flagged_id['domain']
    if domain =='flag':
        return 1
    return 0

In [26]:
# Feature Engineering

# Decomponose Emails Addresses

username, domain, extension = [], [], []
for i in range(len(train['email'])):
    address = decompose_email(train['email'].iloc[i])
    username.append(address[0])
    domain.append(address[1])
    extension.append(address[2])

train['username'] = username
train['domain'] = domain
train['extention'] = extension

# check for sketchy domains
valid_domains = train['domain'].value_counts()[:6].index
train['domain'] = train['domain'].apply(lambda x: x if x in valid_domains else 'flag')

# Suspicious emails
flags = []
for user_id in train.index:
    flags.append(flag_user(user_id))
 
# Create flags
train['flags_username'] = flags
train['flags_domain'] = train['domain'].apply(lambda x: 1 if x =='flag' else 0)
train['last_seen_active'] = today - train['last_session_creation_time']
train['account_age'] = today - train['creation_time']
#train[train['last_seen_active'].isna()]

#train['account_age_days'] = train['account_age'].dt.days.fillna(0.0).astype(int)

#days_old_flag = 550.0
#ratio = train['flags'][train['account_age_days'] >=days_old_flag].value_counts()[1] / train['flags'][train['account_age_days'] >=days_old_flag].value_counts()[0]

#print('flag to non-flag ratio {:.2f}'.format(ratio))

In [27]:
inactive_users = train[train['last_seen_active'].isnull()] 
active_users = train[train['last_seen_active'].notnull()] 

In [28]:
print('percentage of flags for inactive users')
print('flags_username', inactive_users['flags_username'].value_counts()[1] / inactive_users['flags_username'].value_counts()[0])
print('flags_domain', inactive_users['flags_domain'].value_counts()[1] / inactive_users['flags_domain'].value_counts()[0])


percentage of flags for inactive users
flags_username 0.13622152746761948
flags_domain 0.13470115967885815


In [29]:
print('percentage of flags for inactive users')
print('flags_username', active_users['flags_username'].value_counts()[1] / active_users['flags_username'].value_counts()[0])
print('flags_domain', active_users['flags_domain'].value_counts()[1] / active_users['flags_domain'].value_counts()[0])


percentage of flags for inactive users
flags_username 0.10198344526003436
flags_domain 0.10181136789506559


There is a ~5% increase in the ratio of flags for inative users to active ones for username, and a 3% increase from domains

In [30]:
inactive = train.loc[inactive_users.index]
active = train.loc[active_users.index]
inactive['last_seen_active'] = inactive['account_age'] 
inactive['flags_inactivity'] = 1
active['flags_inactivity'] = 0

train_new = pd.concat([active, inactive])
train_new['last_seen_active'] = train_new['last_seen_active'].dt.days
train_new['account_age'] = train_new['account_age'].dt.days

In [31]:
features = [
    'creation_source', 
              'opted_in_to_mailing_list', 
              'enabled_for_marketing_drip', 
              'org_id', 
              'invited_by_user_id', 
              'account_age',
              'last_seen_active',
              'flags_username', 
              'flags_domain',
              'flags_inactivity',
]

In [32]:
train_new[features].groupby('creation_source').sum()

Unnamed: 0_level_0,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,account_age,last_seen_active,flags_username,flags_domain,flags_inactivity
creation_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GUEST_INVITE,422,252,241473,10051543,562866,467769,196,195,460
ORG_INVITE,877,522,483021,20850003,1106205,965063,382,379,878
PERSONAL_PROJECTS,394,236,239985,0,558991,518728,190,190,1054
SIGNUP,417,253,238319,0,518396,443488,190,190,152
SIGNUP_GOOGLE_AUTH,275,170,156425,0,358873,294294,0,0,0


In [33]:
flags_total = train_new[features].groupby('org_id').sum()[['flags_username', 'flags_domain', 'flags_inactivity']].sum(axis=1)
org_id_flags = train_new[features].groupby('org_id').sum().sort_values(by='flags_inactivity', ascending=False)

# sum all flags
org_id_flags['flags_total'] = flags_total

# Using a threshold of 20.0 total flags for cutoff (8.6% of total organizations)
flagged_organizations = org_id_flags[org_id_flags.sort_values(by='flags_total', ascending=False)['flags_total'] < 20.0]
bad_orgs = flagged_organizations.index


  flagged_organizations = org_id_flags[org_id_flags.sort_values(by='flags_total', ascending=False)['flags_total'] < 20.0]


In [34]:
train_new['flags_org'] = 0
train_new['flags_org'] = train_new['org_id'].apply(lambda x:1 if x in bad_orgs else 0)
features.append('flags_org')
top_spammers = train_new[features]['invited_by_user_id'].value_counts()[train_new[features]['invited_by_user_id'].value_counts() > 5].index
train_new['flags_spammers'] = 0
features.append('flags_spammers')

In [35]:


train_new[features]['invited_by_user_id'].value_counts()

0        4439
2527       11
10481      10
10741      10
1525       10
         ... 
11285       1
5247        1
6966        1
4769        1
4290        1
Name: invited_by_user_id, Length: 2318, dtype: int64

In [36]:
for user_id in train_new.index:
    flagged_id = train_new.loc[user_id]
    org_id = flagged_id['org_id']
    if (user_id in top_spammers )| (org_id in top_spammers):
            train_new['flags_spammers'].loc[user_id] = 1
            
features.remove('invited_by_user_id')
features.remove('creation_source')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [37]:
features = ['creation_time', 'name', 'email', 'creation_source',
       'last_session_creation_time', 'opted_in_to_mailing_list',
       'enabled_for_marketing_drip', 'org_id', 'invited_by_user_id',
       'is_adopted', 'username', 'domain', 'extention', 'flags_username',
       'flags_domain', 'last_seen_active', 'account_age', 'flags_inactivity',
       'flags_org', 'flags_spammers']

In [38]:
 org_id_flags[org_id_flags.sort_values(by='flags_total', ascending=False)['flags_total'] < 20.0]

  org_id_flags[org_id_flags.sort_values(by='flags_total', ascending=False)['flags_total'] < 20.0]


Unnamed: 0_level_0,opted_in_to_mailing_list,enabled_for_marketing_drip,invited_by_user_id,account_age,last_seen_active,flags_username,flags_domain,flags_inactivity,flags_total
org_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
80,8,5,111982,9544,8763,3,3,12,18
49,8,5,90799,9964,8288,1,1,12,14
47,8,7,59099,11184,10161,2,2,12,16
95,12,6,133880,9146,7305,3,3,12,18
17,11,5,185472,18114,17158,2,2,12,16
...,...,...,...,...,...,...,...,...,...
150,3,2,29725,3079,2317,2,2,0,4
319,3,0,57128,2339,1832,0,0,0,0
354,1,2,18403,1489,1174,0,0,0,0
375,2,2,91540,4993,4441,3,3,0,6


## Modeling

In [39]:
features = ['opted_in_to_mailing_list',
 'enabled_for_marketing_drip',
 'org_id',
 'account_age',
 'last_seen_active',
 'flags_username',
 'flags_domain',
 'flags_inactivity',
 'flags_org',
 'flags_spammers']

In [40]:
X = pd.concat([train_new[features], pd.get_dummies(train_new['creation_source'])], axis=1)

In [41]:
y = train['is_adopted'].astype(int)

In [42]:
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, stratify=y, random_state=0)

In [43]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_validate
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
from sklearn.pipeline import make_pipeline

In [44]:
#rf_pipe.get_params()

#### Random Forest

In [45]:
%%time
rf_pipe = make_pipeline(StandardScaler(), RandomForestClassifier(n_jobs=-1, class_weight='balanced', random_state=0, verbose=0))

params_grid = {
 'randomforestclassifier__max_depth': [None, 2, 20, 200],
 'randomforestclassifier__max_leaf_nodes': [None, 1, 5, 10],
 'randomforestclassifier__max_samples': [None, 2, 10, 20],
 'randomforestclassifier__min_samples_leaf': [1, 4, 10],
 'randomforestclassifier__min_samples_split': [2, 5, 10],
 'randomforestclassifier__min_weight_fraction_leaf': [0.0, 0.2, 0.4],
 'randomforestclassifier__n_estimators': [10, 100, 500, 1000],
}

rf_rcv = RandomizedSearchCV(rf_pipe, param_distributions=params_grid, scoring = 'balanced_accuracy', n_jobs=-1, return_train_score=True, cv=5, n_iter=15, verbose=1)
rf_rcv.fit(X_train, y_train)
results = pd.DataFrame(rf_rcv.cv_results_)[['mean_train_score', 'mean_test_score', 'rank_test_score']].sort_values(by='rank_test_score')
results

Fitting 5 folds for each of 15 candidates, totalling 75 fits


5 fits failed out of a total of 75.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 442, in fit
    trees = Parallel(
  File "/Library/Frameworks/Python.framewo

CPU times: user 1.92 s, sys: 494 ms, total: 2.41 s
Wall time: 47.4 s


Unnamed: 0,mean_train_score,mean_test_score,rank_test_score
0,0.5,0.5,1
1,0.5,0.5,1
2,0.5,0.5,1
3,0.5,0.5,1
4,0.5,0.5,1
5,0.5,0.5,1
6,0.5,0.5,1
8,0.5,0.5,1
9,0.5,0.5,1
10,0.5,0.5,1


In [46]:
# results
y_pred = rf_rcv.predict(X_val)
print(classification_report(y_val, y_pred))


              precision    recall  f1-score   support

           0       0.88      1.00      0.94      1689
           1       0.00      0.00      0.00       231

    accuracy                           0.88      1920
   macro avg       0.44      0.50      0.47      1920
weighted avg       0.77      0.88      0.82      1920



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [47]:
%%time
lgr_pipe = make_pipeline(StandardScaler(), LogisticRegression(n_jobs=-1, class_weight='balanced', random_state=0, verbose=0))
lgr_cv = cross_validate(lgr_pipe, X_train, y_train, cv=5, return_train_score=True, scoring='balanced_accuracy')

CPU times: user 281 ms, sys: 47.8 ms, total: 328 ms
Wall time: 309 ms


In [48]:
lgr_pipe.get_params()

{'memory': None,
 'steps': [('standardscaler', StandardScaler()),
  ('logisticregression',
   LogisticRegression(class_weight='balanced', n_jobs=-1, random_state=0))],
 'verbose': False,
 'standardscaler': StandardScaler(),
 'logisticregression': LogisticRegression(class_weight='balanced', n_jobs=-1, random_state=0),
 'standardscaler__copy': True,
 'standardscaler__with_mean': True,
 'standardscaler__with_std': True,
 'logisticregression__C': 1.0,
 'logisticregression__class_weight': 'balanced',
 'logisticregression__dual': False,
 'logisticregression__fit_intercept': True,
 'logisticregression__intercept_scaling': 1,
 'logisticregression__l1_ratio': None,
 'logisticregression__max_iter': 100,
 'logisticregression__multi_class': 'auto',
 'logisticregression__n_jobs': -1,
 'logisticregression__penalty': 'l2',
 'logisticregression__random_state': 0,
 'logisticregression__solver': 'lbfgs',
 'logisticregression__tol': 0.0001,
 'logisticregression__verbose': 0,
 'logisticregression__warm_st

In [49]:
svm_penalties = np.logspace(-1,5,7)
svm_gammas = np.logspace(-6,-3,3)
svm_scalers = [MinMaxScaler()]
smv_kernels = ['rbf', 'poly', 'sigmoid', 'linear']

In [50]:
%%time
svm_pipe = make_pipeline(StandardScaler(), SVC( class_weight='balanced', random_state=0, verbose=3))

param_grid = { 
 'svc__C':svm_penalties,
 'svc__gamma': svm_gammas,
 'svc__kernel': ['sigmoid', 'rbf', 'poly'],
 'svc__degree':[2,3,4,5,6]
}

svm_rcv = RandomizedSearchCV(svm_pipe, 
                              param_distributions=param_grid, 
                              scoring='balanced_accuracy', 
                              return_train_score=True,
                                    random_state=0,
                            verbose=3
                             )
svm_rcv.fit(X_train, y_train)
results = pd.DataFrame(svm_rcv.cv_results_)[['mean_train_score', 'mean_test_score', 'rank_test_score']].sort_values(by='rank_test_score')
results

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[LibSVM][CV 1/5] END svc__C=1.0, svc__degree=4, svc__gamma=1e-06, svc__kernel=poly;, score=(train=0.500, test=0.500) total time=   2.1s
[LibSVM][CV 2/5] END svc__C=1.0, svc__degree=4, svc__gamma=1e-06, svc__kernel=poly;, score=(train=0.500, test=0.500) total time=   2.0s
[LibSVM][CV 3/5] END svc__C=1.0, svc__degree=4, svc__gamma=1e-06, svc__kernel=poly;, score=(train=0.500, test=0.500) total time=   2.1s
[LibSVM][CV 4/5] END svc__C=1.0, svc__degree=4, svc__gamma=1e-06, svc__kernel=poly;, score=(train=0.500, test=0.500) total time=   2.0s
[LibSVM][CV 5/5] END svc__C=1.0, svc__degree=4, svc__gamma=1e-06, svc__kernel=poly;, score=(train=0.500, test=0.500) total time=   1.9s
[LibSVM][CV 1/5] END svc__C=100000.0, svc__degree=6, svc__gamma=0.001, svc__kernel=poly;, score=(train=0.500, test=0.500) total time=   3.0s
[LibSVM][CV 2/5] END svc__C=100000.0, svc__degree=6, svc__gamma=0.001, svc__kernel=poly;, score=(train=0.500, test=0.5

Unnamed: 0,mean_train_score,mean_test_score,rank_test_score
5,0.54308,0.503069,1
0,0.5,0.5,2
1,0.5,0.5,2
3,0.5,0.5,2
4,0.5,0.5,2
6,0.5,0.5,2
7,0.5,0.5,2
8,0.5,0.5,2
9,0.522228,0.49144,9
2,0.523765,0.490056,10


In [53]:
%%time
gb_pipe = make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=0, verbose=0))
gb_cv = cross_validate(gb_pipe, X_train, y_train, cv=5, return_train_score=True, scoring='balanced_accuracy')

CPU times: user 3.28 s, sys: 81 ms, total: 3.36 s
Wall time: 3.73 s


In [54]:
gb_cv

{'fit_time': array([0.62946892, 0.77502203, 0.67113209, 0.81107521, 0.73235297]),
 'score_time': array([0.00744009, 0.00461698, 0.00441885, 0.00784588, 0.00494385]),
 'test_score': array([0.50159241, 0.5       , 0.49888971, 0.50196251, 0.50429512]),
 'train_score': array([0.50878378, 0.51013514, 0.50810811, 0.51081081, 0.50945946])}

In [56]:
# Looks like there are some smaps organizations