In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
people = pd.read_csv('people.csv.zip')
act_train = pd.read_csv('act_train.csv.zip')
act_test = pd.read_csv('act_test.csv.zip')

In [3]:
submission_ids = act_test['activity_id']

In [4]:
people.columns

Index(['people_id', 'char_1', 'group_1', 'char_2', 'date', 'char_3', 'char_4',
       'char_5', 'char_6', 'char_7', 'char_8', 'char_9', 'char_10', 'char_11',
       'char_12', 'char_13', 'char_14', 'char_15', 'char_16', 'char_17',
       'char_18', 'char_19', 'char_20', 'char_21', 'char_22', 'char_23',
       'char_24', 'char_25', 'char_26', 'char_27', 'char_28', 'char_29',
       'char_30', 'char_31', 'char_32', 'char_33', 'char_34', 'char_35',
       'char_36', 'char_37', 'char_38'],
      dtype='object')

In [5]:
people_columns = ['char_1', 'group_1', 'char_2', 'date', 'char_3', 'char_4',
       'char_5', 'char_6', 'char_7', 'char_8', 'char_9', 'char_10', 'char_11',
       'char_12', 'char_13', 'char_14', 'char_15', 'char_16', 'char_17',
       'char_18', 'char_19', 'char_20', 'char_21', 'char_22', 'char_23',
       'char_24', 'char_25', 'char_26', 'char_27', 'char_28', 'char_29',
       'char_30', 'char_31', 'char_32', 'char_33', 'char_34', 'char_35',
       'char_36', 'char_37']

print('People DataFrame')
for c in people_columns:
    print(f'Number of unique values for {c:10}:{people[c].nunique()}')

People DataFrame
Number of unique values for char_1    :2
Number of unique values for group_1   :34224
Number of unique values for char_2    :3
Number of unique values for date      :1196
Number of unique values for char_3    :43
Number of unique values for char_4    :25
Number of unique values for char_5    :9
Number of unique values for char_6    :7
Number of unique values for char_7    :25
Number of unique values for char_8    :8
Number of unique values for char_9    :9
Number of unique values for char_10   :2
Number of unique values for char_11   :2
Number of unique values for char_12   :2
Number of unique values for char_13   :2
Number of unique values for char_14   :2
Number of unique values for char_15   :2
Number of unique values for char_16   :2
Number of unique values for char_17   :2
Number of unique values for char_18   :2
Number of unique values for char_19   :2
Number of unique values for char_20   :2
Number of unique values for char_21   :2
Number of unique values for ch

In [6]:
act_train.columns

Index(['people_id', 'activity_id', 'date', 'activity_category', 'char_1',
       'char_2', 'char_3', 'char_4', 'char_5', 'char_6', 'char_7', 'char_8',
       'char_9', 'char_10', 'outcome'],
      dtype='object')

In [7]:
act_columns = ['date', 'activity_category', 'char_1',
       'char_2', 'char_3', 'char_4', 'char_5', 'char_6', 'char_7', 'char_8',
       'char_9', 'char_10', 'outcome']

print('Action DataFrame')
for c in act_columns:
    print(f'Number of unique values for {c:20}:{act_train[c].nunique()}')

Action DataFrame
Number of unique values for date                :411
Number of unique values for activity_category   :7
Number of unique values for char_1              :51
Number of unique values for char_2              :32
Number of unique values for char_3              :11
Number of unique values for char_4              :7
Number of unique values for char_5              :7
Number of unique values for char_6              :5
Number of unique values for char_7              :8
Number of unique values for char_8              :18
Number of unique values for char_9              :19
Number of unique values for char_10             :6515
Number of unique values for outcome             :2


In [8]:
labels = act_train['outcome']

In [9]:
labels.value_counts()

0    1221794
1     975497
Name: outcome, dtype: int64

In [75]:
# Defining functions for preprocessing the Data

def preprocessing_actions(df, train_data=True):
    #removing the activity_id
    df.drop('activity_id', axis=1, inplace=True)
    
    #the outcome shall be stored prior to preprocessing to be used in model training step
    if (train_data):
        df.drop('outcome', axis=1, inplace=True)
    
    #simplifying the people_id column
    df['people_id'] = df['people_id'].apply(lambda x: x.split('_')[1])
    df['people_id'] = pd.to_numeric(df['people_id']).astype(int)
    
    #extracting the year from the date
    df['date'] = pd.to_datetime(df['date'])
    df['year_action'] = df['date'].apply(lambda x: x.year)
    df.drop('date', axis=1, inplace=True)
    
    columns = list(df.columns)
    columns = [c for c in columns if c not in ('year_action','people_id')]
    
    for c in columns:
        df[c] = df[c].fillna('type 0')
        df[c] = df[c].apply(lambda x: x.split(' ')[1])
        df[c] = pd.to_numeric(df[c]).astype(int)
    return df


def preprocessing_people(df):
    
    #simplifying the people_id column
    df['people_id'] = df['people_id'].apply(lambda x: x.split('_')[1])
    df['people_id'] = pd.to_numeric(df['people_id']).astype(int)
    
    #extracting the year from the date
    df['date'] = pd.to_datetime(df['date'])
    df['year_people'] = df['date'].apply(lambda x: x.year)
    df.drop('date', axis=1, inplace=True)
    
    columns = list(df.columns)
    strings = columns[1:11]
    bools = columns[11:39]
    
    for c in bools:
        df[c] = pd.to_numeric(df[c]).astype(int)
    
    for s in strings:
        df[s] = df[s].fillna('type 0')
        df[s] = df[s].apply(lambda x: x.split(' ')[1])
        df[s] = pd.to_numeric(df[s]).astype(int)
    return df

In [76]:
people_processed = preprocessing_people(people)
act_train_processed = preprocessing_actions(act_train)
act_test_processed = preprocessing_actions(act_test, train_data=False)

In [77]:
#merging the DataFrames for model training

train = pd.merge(act_train_processed, people_processed, on='people_id')
test = pd.merge(act_test_processed, people_processed, on='people_id')

In [78]:
train.head()

Unnamed: 0,people_id,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,char_8_x,...,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38,year_people
0,100,4,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,1,0,36,2021
1,100,2,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,1,0,36,2021
2,100,2,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,1,0,36,2021
3,100,2,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,1,0,36,2021
4,100,2,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,1,0,36,2021


In [79]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2197291 entries, 0 to 2197290
Data columns (total 53 columns):
people_id            int32
activity_category    int32
char_1_x             int32
char_2_x             int32
char_3_x             int32
char_4_x             int32
char_5_x             int32
char_6_x             int32
char_7_x             int32
char_8_x             int32
char_9_x             int32
char_10_x            int32
year_action          int64
char_1_y             int32
group_1              int32
char_2_y             int32
char_3_y             int32
char_4_y             int32
char_5_y             int32
char_6_y             int32
char_7_y             int32
char_8_y             int32
char_9_y             int32
char_10_y            int32
char_11              int32
char_12              int32
char_13              int32
char_14              int32
char_15              int32
char_16              int32
char_17              int32
char_18              int32
char_19              in

In [80]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train, labels, test_size=0.2, random_state=42)

In [81]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [84]:
test_predictions = rfc.predict_proba(X_test)[:,1]

In [88]:
test_predictions[0:50]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       1.        , 1.        , 0.        , 1.        , 0.17356942,
       1.        , 0.        , 0.        , 1.        , 0.        ,
       1.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 1.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 1.        , 0.        ,
       0.43327468, 0.        , 0.        , 0.        , 1.        ,
       0.        , 1.        , 0.        , 0.8       , 0.        ,
       1.        , 1.        , 1.        , 0.8       , 0.        ,
       0.        , 0.        , 0.        , 0.        , 1.        ])

In [90]:
from sklearn.metrics import accuracy_score, roc_auc_score
score = roc_auc_score(y_test, test_predictions)

In [91]:
score

0.9976002769978195

In [94]:
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold

In [93]:
clf2 = RandomForestClassifier()

In [95]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=42)

In [96]:
scores = cross_val_score(clf2, train, labels, cv=cv)





In [97]:
print(f'scores mean: {scores.mean()}, scores std: {scores.std()}')

scores mean: 0.9828844701145889, scores std: 0.00027585324026966826


In [98]:
train_copy = train.copy()

In [99]:
train.head()

Unnamed: 0,people_id,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,char_8_x,...,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38,year_people
0,100,4,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,1,0,36,2021
1,100,2,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,1,0,36,2021
2,100,2,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,1,0,36,2021
3,100,2,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,1,0,36,2021
4,100,2,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,1,0,36,2021


In [100]:
from sklearn.decomposition import PCA

In [101]:
pca = PCA(n_components=15, random_state=42)

In [108]:
pca_output = pca.fit_transform(train_copy)

In [106]:
train_copy.head()

Unnamed: 0,people_id,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,char_8_x,...,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38,year_people
0,100,4,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,1,0,36,2021
1,100,2,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,1,0,36,2021
2,100,2,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,1,0,36,2021
3,100,2,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,1,0,36,2021
4,100,2,0,0,0,0,0,0,0,0,...,1,1,0,0,1,1,1,0,36,2021


In [103]:
pca.explained_variance_ratio_

array([9.92841698e-01, 7.05790800e-03, 1.00271333e-04, 9.27456339e-08,
       2.34466579e-08, 3.07768620e-09, 1.19817937e-09, 8.38230305e-10,
       3.79217335e-10, 1.81489345e-10, 1.69118787e-10, 1.59029852e-10,
       1.10914270e-10, 8.37171092e-11, 7.91084744e-11])

In [111]:
columns1 = []
for i in range(1,16):
    name = str('PCA'+str(i))
    columns1.append(name)

In [112]:
columns1

['PCA1',
 'PCA2',
 'PCA3',
 'PCA4',
 'PCA5',
 'PCA6',
 'PCA7',
 'PCA8',
 'PCA9',
 'PCA10',
 'PCA11',
 'PCA12',
 'PCA13',
 'PCA14',
 'PCA15']

In [119]:
train_pca = pd.DataFrame(data=pca_output, columns=columns1)

In [120]:
train_pca.head()

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,PCA11,PCA12,PCA13,PCA14,PCA15
0,202281.531308,1767.324456,-415.305491,-13.706327,-17.967382,0.208457,-1.486449,-0.147715,-2.309344,0.24256,0.974003,-1.031512,0.004406,-0.002873,-0.423428
1,202281.523246,1767.246411,-490.306464,-13.571537,-17.963188,0.227499,-1.29377,-0.147442,-2.258142,0.074289,0.789912,-1.010286,0.022281,0.295432,1.163596
2,202281.523246,1767.246411,-490.306464,-13.571537,-17.963188,0.227499,-1.29377,-0.147442,-2.258142,0.074289,0.789912,-1.010286,0.022281,0.295432,1.163596
3,202281.523246,1767.24641,-490.306528,-13.570072,-17.965343,0.238962,-1.291745,-0.144606,-2.249785,0.063045,0.768677,-1.019273,0.029432,0.290461,1.222273
4,202281.523246,1767.24641,-490.306528,-13.570072,-17.965343,0.238962,-1.291745,-0.144606,-2.249785,0.063045,0.768677,-1.019273,0.029432,0.290461,1.222273


In [125]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_pca = pd.DataFrame(scaler.fit_transform(train_pca.values), columns=columns1)

In [126]:
train_pca.head()

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,PCA11,PCA12,PCA13,PCA14,PCA15
0,1.744206,0.180742,-0.356337,-0.386684,-1.008152,0.032284,-0.368953,-0.043835,-1.018887,0.154695,0.643496,-0.702775,0.003594,-0.002698,-0.409024
1,1.744206,0.180734,-0.420689,-0.382881,-1.007916,0.035233,-0.321128,-0.043754,-0.996297,0.047379,0.521872,-0.688314,0.018177,0.277417,1.124016
2,1.744206,0.180734,-0.420689,-0.382881,-1.007916,0.035233,-0.321128,-0.043754,-0.996297,0.047379,0.521872,-0.688314,0.018177,0.277417,1.124016
3,1.744206,0.180734,-0.420689,-0.38284,-1.008037,0.037008,-0.320625,-0.042913,-0.992609,0.040207,0.507843,-0.694436,0.024011,0.272749,1.180697
4,1.744206,0.180734,-0.420689,-0.38284,-1.008037,0.037008,-0.320625,-0.042913,-0.992609,0.040207,0.507843,-0.694436,0.024011,0.272749,1.180697


In [127]:
X_train, X_test, y_train, y_test = train_test_split(train_pca, labels, test_size=0.2, random_state=42)
rfc3 = RandomForestClassifier()
rfc3.fit(X_train,y_train)
test_predictions3 = rfc3.predict_proba(X_test)[:,1]
score3 = roc_auc_score(y_test, test_predictions3)



In [128]:
score3

0.9963722846637973