### Import

In [5]:
import numpy as np
import pandas as pd
import xgboost
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import *
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from IPython.core.display import Image 
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.tree import export_graphviz
import io
from sklearn.preprocessing import Imputer
import pydot
from sklearn import preprocessing
import lightgbm as lgb
from scipy.stats import mode
import re
from datetime import datetime
from lightgbm import plot_importance
import warnings
warnings.filterwarnings('ignore')

---

### Data read

In [1]:
# Load every raw table from CSV files located in the current working directory.

# User tables (later concatenated into a single frame).
train_users_2 = pd.read_csv("train_users_2.csv")
test_users = pd.read_csv("test_users.csv")

# Session table.
sessions = pd.read_csv("sessions.csv")

# Remaining auxiliary tables.
age_gender_bkts = pd.read_csv("age_gender_bkts.csv")
countries = pd.read_csv("countries.csv")
sample_submission_NDF = pd.read_csv("sample_submission_NDF.csv")

---

### Data setting

In [2]:
def pre_age_set_data(train=None, test=None):
    """Combine the train and test user tables and derive date-based features.

    Parameters
    ----------
    train, test : pandas.DataFrame, optional
        User tables to combine. When omitted, the notebook-level
        ``train_users_2`` and ``test_users`` frames are used, so existing
        zero-argument calls keep working.

    Returns
    -------
    (check, pre_age) : tuple of pandas.DataFrame
        ``check`` is the concatenated frame (fresh 0..n-1 index) with:
          * missing ``first_affiliate_tracked`` replaced by ``"untracked"``;
          * ``date_account_created`` / ``timestamp_first_active`` parsed to
            datetimes;
          * ``lag_days``: negated day component of
            ``timestamp_first_active - date_account_created``;
          * ``lag_seconds``: seconds component of the same difference;
          * ``faithless_sign``: 0 when ``age < 120`` and ``gender`` is not
            ``'-unknown-'``, otherwise 1 (a missing age compares False, so it
            yields 1).
        ``pre_age`` is ``check`` without ``date_first_booking``, with both
        datetime columns replaced by ``*_y`` / ``*_m`` / ``*_d`` columns.
    """
    if train is None:
        train = train_users_2
    if test is None:
        test = test_users

    check = pd.concat([train, test], ignore_index=True)

    check["first_affiliate_tracked"] = check["first_affiliate_tracked"].fillna("untracked")

    check["date_account_created"] = pd.to_datetime(check["date_account_created"], format="%Y-%m-%d")
    check["timestamp_first_active"] = pd.to_datetime(check["timestamp_first_active"], format="%Y%m%d%H%M%S")

    # Vectorised ``.dt`` accessors replace the former per-row Python lambdas.
    s_lag = check["timestamp_first_active"] - check["date_account_created"]
    check["lag_days"] = -s_lag.dt.days
    check["lag_seconds"] = s_lag.dt.seconds

    # A row is "trusted" only with a plausible age and a declared gender.
    trusted = (check["age"] < 120) & (check["gender"] != "-unknown-")
    check["faithless_sign"] = (~trusted).astype("int64")

    pre_age = check.drop(columns="date_first_booking")

    # Expand each datetime column into year / month / day parts, then drop it.
    datetime_cols = ("date_account_created", "timestamp_first_active")
    for col in datetime_cols:
        pre_age[col + "_y"] = pre_age[col].dt.year
        pre_age[col + "_m"] = pre_age[col].dt.month
        pre_age[col + "_d"] = pre_age[col].dt.day
    pre_age = pre_age.drop(columns=list(datetime_cols))

    return check, pre_age

# Build the combined user frame (`check`) and its date-expanded variant (`pre_age`).
check, pre_age = pre_age_set_data()

---

# Gender

### Gender predict data set

In [3]:
def pre_gen_predict_data():
    """Split the notebook-level ``pre_age`` frame into gender train / predict sets.

    The selected feature columns are one-hot encoded with ``pd.get_dummies``
    and joined column-wise with ``age``, ``country_destination``, ``id`` and
    ``gender``. Gender values ``'-unknown-'`` and ``'OTHER'`` are treated as
    missing; rows with a missing gender form the "mission" (to-predict) set and
    the remaining rows form the training set.

    Returns
    -------
    tuple of six DataFrames, in this order:
        mission rows without the ``index`` column,
        training rows without the ``index`` column,
        mission rows including the original row position in ``index``,
        training rows including the original row position in ``index``,
        mission feature matrix (no id / age / country_destination / gender),
        training feature matrix (same columns removed).
    """
    feature_cols = [
        'affiliate_channel', 'affiliate_provider',
        'first_affiliate_tracked', 'first_browser', 'first_device_type',
        'language', 'signup_app', 'signup_flow',
        'signup_method', 'date_account_created_y', 'date_account_created_m',
        'date_account_created_d', 'timestamp_first_active_y',
        'timestamp_first_active_m', 'timestamp_first_active_d',
        "lag_days", "lag_seconds", "faithless_sign",
    ]
    carried_cols = ['age', 'country_destination', 'id', 'gender']
    non_feature_cols = ['id', 'age', 'country_destination', "gender"]

    # ``filter(items=...)`` silently skips names that are absent from the frame.
    encoded = pd.get_dummies(pre_age.filter(items=feature_cols))
    combined = pd.concat([encoded, pre_age.filter(items=carried_cols)], axis=1)
    combined["gender"] = combined["gender"].replace(['-unknown-', 'OTHER'], np.nan)

    gender_missing = combined["gender"].isna()

    # ``reset_index()`` keeps the previous row labels in an ``index`` column.
    pre_gen_mission = combined[gender_missing].reset_index()
    pre_gen_train = combined[~gender_missing].reset_index()

    pre_gen_mission_test = pre_gen_mission.drop(columns="index")
    pre_gen_train_test = pre_gen_train.drop(columns="index")

    pre_gen_mission_test_drop = pre_gen_mission_test.drop(columns=non_feature_cols)
    pre_gen_train_test_drop = pre_gen_train_test.drop(columns=non_feature_cols)

    return (
        pre_gen_mission_test,
        pre_gen_train_test,
        pre_gen_mission,
        pre_gen_train,
        pre_gen_mission_test_drop,
        pre_gen_train_test_drop,
    )
    
# Materialise the six gender frames as notebook-level variables for the cells below.
pre_gen_mission_test, pre_gen_train_test, pre_gen_mission, pre_gen_train, \
            pre_gen_mission_test_drop, pre_gen_train_test_drop = pre_gen_predict_data()

### Gender predict LightGBM

In [6]:
def predict_gen_LightGBM():
    """Fit a LightGBM classifier on known genders and predict the missing ones.

    Reads the notebook-level frames ``pre_gen_train_test_drop`` (features),
    ``pre_gen_train_test`` (source of the ``gender`` target) and
    ``pre_gen_mission_test_drop`` (rows to predict).

    Prints a classification report computed on the training rows themselves
    (an in-sample score, not a held-out evaluation).

    Returns
    -------
    pandas.DataFrame
        Single-column frame holding the predicted gender for every row of
        ``pre_gen_mission_test_drop``.
    """
    features = pre_gen_train_test_drop
    target = pre_gen_train_test["gender"]

    classifier = lgb.LGBMClassifier(nthread=3)
    classifier.fit(features, target)

    in_sample_pred = classifier.predict(features)
    print(classification_report(target, in_sample_pred))

    predicted_gender = classifier.predict(pre_gen_mission_test_drop)
    return pd.DataFrame(predicted_gender)

# Despite the name, `model_gen_lgb` holds the predicted-gender DataFrame, not the fitted model.
model_gen_lgb = predict_gen_LightGBM()

             precision    recall  f1-score   support

     FEMALE       0.59      0.73      0.65     77524
       MALE       0.58      0.42      0.49     68209

avg / total       0.58      0.58      0.57    145733



### Gender predict data make CSV

In [7]:
# Persist the gender predictions; index=False leaves the row index out of the file.
model_gen_lgb.to_csv("model_gen_lgb.csv", index=False)

---

# Age

### Age predict data set

In [8]:
def pre_age_predict_data():
    """Split the notebook-level ``pre_age`` frame into age train / predict sets.

    The selected feature columns are one-hot encoded with ``pd.get_dummies``
    and joined column-wise with ``age``, ``country_destination`` and ``id``.
    Rows whose age is missing form the "mission" (to-predict) set; rows with an
    age form the training set.

    Side effect: the ``age`` column of the notebook-level ``pre_age`` frame is
    overwritten so that missing ages become ``-1``.

    Returns
    -------
    tuple of six DataFrames, in this order:
        mission rows without the ``index`` column,
        training rows without the ``index`` column,
        mission rows including the original row position in ``index``,
        training rows including the original row position in ``index``,
        mission feature matrix (no id / age / country_destination),
        training feature matrix (same columns removed).
    """
    # NOTE(review): this writes into the shared `pre_age` frame, so every cell
    # run afterwards sees -1 instead of NaN in `age`. Kept as-is because other
    # cells may rely on it — confirm before removing.
    pre_age['age'] = pre_age['age'].fillna(-1)

    feature_cols = [
        'affiliate_channel', 'affiliate_provider',
        'first_affiliate_tracked', 'first_browser', 'first_device_type',
        'language', 'signup_app', 'signup_flow',
        'signup_method', 'date_account_created_y', 'date_account_created_m',
        'date_account_created_d', 'timestamp_first_active_y',
        'timestamp_first_active_m', 'timestamp_first_active_d',
        "lag_days", "lag_seconds", "faithless_sign",
    ]
    carried_cols = ['age', 'country_destination', 'id']
    non_feature_cols = ['id', 'age', 'country_destination']

    # ``filter(items=...)`` silently skips names that are absent from the frame.
    encoded = pd.get_dummies(pre_age.filter(items=feature_cols))
    combined = pd.concat([encoded, pre_age.filter(items=carried_cols)], axis=1)

    # Turn the -1 sentinel back into NaN on the local copy only.
    combined["age"] = combined["age"].replace(-1, np.nan)

    age_missing = combined["age"].isna()

    # ``reset_index()`` keeps the previous row labels in an ``index`` column.
    pre_age_mission = combined[age_missing].reset_index()
    pre_age_train = combined[~age_missing].reset_index()

    pre_age_mission_test = pre_age_mission.drop(columns="index")
    pre_age_train_test = pre_age_train.drop(columns="index")

    pre_age_mission_test_drop = pre_age_mission_test.drop(columns=non_feature_cols)
    pre_age_train_test_drop = pre_age_train_test.drop(columns=non_feature_cols)

    return (
        pre_age_mission_test,
        pre_age_train_test,
        pre_age_mission,
        pre_age_train,
        pre_age_mission_test_drop,
        pre_age_train_test_drop,
    )
    
# Materialise the six age frames as notebook-level variables for the cells below.
pre_age_mission_test, pre_age_train_test, pre_age_mission, pre_age_train, \
            pre_age_mission_test_drop, pre_age_train_test_drop = pre_age_predict_data()

In [9]:
def pre_age_predict_data_cat():
    """Bucket the training ages into five labelled groups.

    Reads ``age`` from the notebook-level ``pre_age_train`` frame and bins it
    with ``pd.cut`` using right-inclusive intervals:
    (0, 15], (15, 25], (25, 35], (35, 60], (60, 9999].

    Returns
    -------
    pandas.DataFrame
        One categorical column named ``age`` holding the group label per row.
    """
    age_edges = [0, 15, 25, 35, 60, 9999]
    # Korean labels, roughly: minor, youth, middle-aged, mature adult, elderly.
    group_names = ["미성년자", "청년", "중년", "장년", "노년"]

    age_groups = pd.cut(pre_age_train['age'], age_edges, labels=group_names)
    return age_groups.to_frame()

# `cats` is a one-column DataFrame of age-group labels, used as the target by the age models below.
cats = pre_age_predict_data_cat()

### Age predict LightGBM

In [10]:
def predict_age_LightGBM():
    """Fit a LightGBM classifier on the age groups and predict the missing ones.

    Reads the notebook-level objects ``pre_age_train_test_drop`` (features),
    ``cats`` (age-group labels) and ``pre_age_mission_test_drop`` (rows to
    predict).

    Prints a classification report computed on the training rows themselves
    (an in-sample score, not a held-out evaluation).

    Returns
    -------
    pandas.DataFrame
        Single-column frame holding the predicted age group for every row of
        ``pre_age_mission_test_drop``.
    """
    X = pre_age_train_test_drop
    # `cats` is a one-column DataFrame; estimators expect a 1-d target, and a
    # column vector triggers scikit-learn's column_or_1d conversion warning.
    y = np.ravel(cats)

    classifier = lgb.LGBMClassifier(nthread=3)
    classifier.fit(X, y)

    print(classification_report(y, classifier.predict(X)))

    predicted_groups = classifier.predict(pre_age_mission_test_drop)
    return pd.DataFrame(predicted_groups)

# Despite the name, `model_age_lgb` holds the predicted age-group DataFrame, not the fitted model.
model_age_lgb = predict_age_LightGBM()

             precision    recall  f1-score   support

         노년       0.78      0.01      0.01      9993
       미성년자       0.98      0.74      0.84        68
         장년       0.49      0.39      0.43     55518
         중년       0.50      0.79      0.61     70900
         청년       0.56      0.04      0.07     22202

avg / total       0.52      0.50      0.44    158681



### Age predict data make CSV

In [11]:
# Persist the LightGBM age-group predictions; index=False leaves the row index out of the file.
model_age_lgb.to_csv("model_age_lgb.csv", index=False)

---

### Age (xgboost, ExtraTrees, LightGBM) predict

In [6]:
def predict_age_xgboost():
    """Fit an XGBoost classifier on the age groups and predict the missing ones.

    Reads the notebook-level objects ``pre_age_train_test_drop`` (features),
    ``cats`` (age-group labels) and ``pre_age_mission_test_drop`` (rows to
    predict).

    The string labels are flattened to 1-d and integer-encoded before fitting,
    then decoded again, so the printed report and the returned frame still
    contain the original label strings. Recent XGBoost releases require class
    labels to be the integers 0..n_classes-1; older releases accept the
    integer codes as well.

    Prints a classification report computed on the training rows themselves
    (an in-sample score, not a held-out evaluation).

    Returns
    -------
    pandas.DataFrame
        Single-column frame holding the predicted age group for every row of
        ``pre_age_mission_test_drop``.
    """
    X = pre_age_train_test_drop

    # `cats` is a one-column DataFrame; flatten it so the target is 1-d.
    labels = np.ravel(cats)
    # sort=True gives codes that follow the sorted order of the distinct labels.
    codes, classes = pd.factorize(labels, sort=True)

    def decode(predicted_codes):
        # Map integer class codes back to the original label strings.
        return classes[np.asarray(predicted_codes).astype(int)]

    classifier = XGBClassifier(nthread=3)
    classifier.fit(X, codes)

    print(classification_report(labels, decode(classifier.predict(X))))

    predicted_groups = decode(classifier.predict(pre_age_mission_test_drop))
    return pd.DataFrame(predicted_groups)

# Despite the name, `model_age_xg` holds the predicted age-group DataFrame, not the fitted model.
model_age_xg = predict_age_xgboost()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  if diff:
  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

         노년       1.00      0.00      0.00      9993
       미성년자       0.00      0.00      0.00        68
         장년       0.47      0.38      0.42     55518
         중년       0.49      0.79      0.61     70900
         청년       0.51      0.02      0.04     22202

avg / total       0.52      0.49      0.42    158681



  if diff:


In [7]:
def predict_age_ExtraTreesClassifier():
    """Fit an ExtraTrees classifier on the age groups and predict the missing ones.

    Reads the notebook-level objects ``pre_age_train_test_drop`` (features),
    ``cats`` (age-group labels) and ``pre_age_mission_test_drop`` (rows to
    predict). The forest uses 250 trees and ``random_state=0``.

    Prints a classification report computed on the training rows themselves
    (an in-sample score, not a held-out evaluation).

    Returns
    -------
    pandas.DataFrame
        Single-column frame holding the predicted age group for every row of
        ``pre_age_mission_test_drop``.
    """
    X = pre_age_train_test_drop
    # `cats` is a one-column DataFrame; estimators expect a 1-d target, and a
    # column vector triggers scikit-learn's column-vector conversion warning.
    y = np.ravel(cats)

    forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
    forest.fit(X, y)

    print(classification_report(y, forest.predict(X)))

    predicted_groups = forest.predict(pre_age_mission_test_drop)
    return pd.DataFrame(predicted_groups)

# Despite the name, `model_age_forest` holds the predicted age-group DataFrame, not the fitted model.
model_age_forest = predict_age_ExtraTreesClassifier()

  import sys


             precision    recall  f1-score   support

         노년       0.74      0.87      0.80      9993
       미성년자       0.91      0.91      0.91        68
         장년       0.80      0.89      0.84     55518
         중년       0.85      0.85      0.85     70900
         청년       0.91      0.60      0.72     22202

avg / total       0.84      0.83      0.83    158681



In [8]:
def predict_age_LightGBM():
    """Fit a LightGBM classifier on the age groups and predict the missing ones.

    NOTE(review): a function with this same name is already defined in the
    earlier "Age predict LightGBM" cell; this later definition shadows it.
    Consider keeping only one of the two.

    Reads the notebook-level objects ``pre_age_train_test_drop`` (features),
    ``cats`` (age-group labels) and ``pre_age_mission_test_drop`` (rows to
    predict).

    Prints a classification report computed on the training rows themselves
    (an in-sample score, not a held-out evaluation).

    Returns
    -------
    pandas.DataFrame
        Single-column frame holding the predicted age group for every row of
        ``pre_age_mission_test_drop``.
    """
    train_features = pre_age_train_test_drop
    # `cats` is a one-column DataFrame; flattening it hands the estimator the
    # 1-d target it expects and avoids the column_or_1d conversion warning.
    age_labels = np.ravel(cats)

    booster = lgb.LGBMClassifier(nthread=3)
    booster.fit(train_features, age_labels)

    print(classification_report(age_labels, booster.predict(train_features)))

    return pd.DataFrame(booster.predict(pre_age_mission_test_drop))

# Rebinds `model_age_lgb` (also assigned in an earlier cell) to a fresh prediction DataFrame.
model_age_lgb = predict_age_LightGBM()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  if diff:


             precision    recall  f1-score   support

         노년       0.68      0.00      0.01      9993
       미성년자       0.89      0.57      0.70        68
         장년       0.48      0.41      0.44     55518
         중년       0.50      0.78      0.61     70900
         청년       0.57      0.04      0.08     22202

avg / total       0.51      0.50      0.44    158681



  if diff:


### Age (xgboost, ExtraTrees, LightGBM) predict data make CSV

In [9]:
# Persist the age-group predictions of all three models; index=False leaves the row index out.
model_age_xg.to_csv("model_age_xg.csv", index=False)
model_age_forest.to_csv("model_age_forest.csv", index=False)
# NOTE(review): "model_age_lgb.csv" is also written by an earlier cell; this call overwrites that file.
model_age_lgb.to_csv("model_age_lgb.csv", index=False)

---