In [51]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import datetime
from dateutil.parser import parse
from re import sub
from sklearn.model_selection import cross_validate

In [2]:
def _money_to_float(raw):
    """Convert one money cell to a float.

    Strings are stripped of every character that is not a digit or a decimal
    point (so "$", ",", spaces and also any minus sign are removed). Missing
    cells and strings with no digits count as 0.0; cells that are already
    numeric are passed through unchanged.
    """
    if isinstance(raw, str):
        digits = sub(r'[^\d.]', '', raw)
        return float(digits) if digits else 0.0
    return 0.0 if pd.isna(raw) else float(raw)


def _days_between(later, earlier):
    """Return whole days from ``earlier`` to ``later``, or None if a date cannot be parsed."""
    try:
        return (parse(later) - parse(earlier)).days
    except (TypeError, ValueError, OverflowError):
        # Missing (non-string) or malformed dates; the caller imputes these rows.
        return None


def handle_Xdata(data_df:pd.DataFrame) -> pd.DataFrame:
    """Turn a raw loan table into the feature frame used for modelling.

    Steps, in order:
      1. Convert the currency-formatted money columns to floats.
      2. Add ``diff_days``: whole days from ``ApprovalDate`` to
         ``DisbursementDate``. Rows whose dates cannot be parsed receive the
         rounded mean of the rows that can (0 if no row can be parsed).
      3. Drop ``Id``, ``Name`` and the two raw date columns.
      4. Cast the categorical columns to pandas ``category`` dtype.

    The input frame is not modified; all work happens on a copy, so the
    function can safely be called more than once on the same frame.

    Parameters
    ----------
    data_df : pd.DataFrame
        Raw table containing the money columns, ``ApprovalDate``,
        ``DisbursementDate``, ``Id``, ``Name`` and every categorical column
        listed in the body.

    Returns
    -------
    pd.DataFrame
        A new frame with the transformed columns.
    """
    frame = data_df.copy()

    # handle money
    money_f = ["DisbursementGross","BalanceGross","GrAppv","SBA_Appv"]
    for f in money_f:
        frame[f] = [_money_to_float(cell) for cell in frame[f].values]

    # diff day: parse each date pair once; unparseable pairs become NaN.
    day_gaps = pd.Series(
        [_days_between(d1, d2)
         for d1, d2 in zip(frame.DisbursementDate, frame.ApprovalDate)],
        dtype="float64",
    )
    mean_gap = day_gaps.mean()  # NaN entries are skipped
    # use mean to fill nan; fall back to 0 when nothing could be parsed
    fill_gap = 0 if pd.isna(mean_gap) else round(mean_gap)
    # Assign positionally so a non-default index on the input cannot misalign rows.
    frame['diff_days'] = day_gaps.fillna(fill_gap).astype("int64").to_numpy()

    # drop
    columns_to_drop = ["Id","Name","ApprovalDate", "DisbursementDate"]
    frame = frame.drop(columns=columns_to_drop)

    # change type
    category_features = ['City', 'State',"Zip", 'Bank',
                     'BankState',"NAICS", "FranchiseCode",
                     'ApprovalFY', 'RevLineCr', "UrbanRural",
                     "NewExist",'LowDoc']
    for f in category_features:
        frame[f] = frame[f].astype("category")
    return frame

In [3]:
# Number of distinct NAICS codes in the raw training table.
data["NAICS"].value_counts().size

NameError: name 'data' is not defined

In [35]:
# Row count per bank state, most frequent first.
data["BankState"].value_counts()

CA    7257
NC    5590
IL    4029
OH    3072
SD    2955
RI    2679
TX    2451
VA    2363
NY    2148
DE    1565
UT    1032
MN     828
MO     769
WI     751
PA     736
FL     726
OR     677
GA     639
AL     637
MA     561
CT     514
IA     470
CO     463
WA     441
KS     426
NJ     414
SC     382
IN     351
OK     343
AR     328
MT     305
MI     300
MD     298
NH     271
MS     258
NE     254
TN     242
LA     239
ND     230
KY     220
NV     201
ID     195
VT     191
NM     188
DC     176
ME     175
AZ     161
HI     141
WV     105
WY      97
AK      70
PR       9
Name: BankState, dtype: int64

In [5]:
# Load the raw training features and labels from the working directory.
# NOTE(review): cells that use `data` appear above this one in the notebook;
# on a fresh kernel this cell has to run before them.
data = pd.read_csv("Xtrain.csv")
data_y = pd.read_csv("Ytrain.csv")

In [4]:
# Per column: does it contain at least one missing value?
data.isna().any()

Id                   False
Name                  True
City                 False
State                False
Zip                  False
Bank                  True
BankState             True
NAICS                False
ApprovalDate         False
ApprovalFY           False
Term                 False
NoEmp                False
NewExist              True
CreateJob            False
RetainedJob          False
FranchiseCode        False
UrbanRural           False
RevLineCr             True
LowDoc                True
DisbursementDate      True
DisbursementGross    False
BalanceGross         False
GrAppv               False
SBA_Appv             False
dtype: bool

In [6]:
# Build the model-ready feature frame from the raw training table.
data_X = handle_Xdata(data)
# Keep only the label column(s). errors='ignore' lets this line be re-run
# after Id has already been dropped instead of raising KeyError.
data_y = data_y.drop(columns=['Id'], errors='ignore')

In [6]:
# Check the column dtypes produced by handle_Xdata for the training features.
data_X.dtypes

City                 category
State                category
Zip                  category
Bank                 category
BankState            category
NAICS                category
ApprovalFY           category
Term                    int64
NoEmp                   int64
NewExist             category
CreateJob               int64
RetainedJob             int64
FranchiseCode        category
UrbanRural           category
RevLineCr            category
LowDoc               category
DisbursementGross     float64
BalanceGross          float64
GrAppv                float64
SBA_Appv              float64
diff_days               int64
dtype: object

In [36]:
# Columns passed to LightGBM as categorical features.
category_features = [
    'City', 'State', "Zip", 'Bank',
    'BankState', "NAICS", "FranchiseCode",
    'ApprovalFY', 'RevLineCr', "UrbanRural",
    "NewExist", 'LowDoc',
]

# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable.
trainX, testX, trainY, testY = train_test_split(
    data_X, data_y, test_size=0.3, random_state=0
)

train_data = lgb.Dataset(
    data=trainX,
    label=trainY,
    categorical_feature=category_features,
)
# The validation set names the training set as its reference.
val_data = lgb.Dataset(
    data=testX,
    label=testY,
    reference=train_data,
    categorical_feature=category_features,
)

In [37]:
# LightGBM training parameters for the binary classifier trained below.
# NOTE(review): 'metric' is a Python set, so its iteration order is not
# guaranteed; consider a list if the metric order matters.
params = {'num_leaves': 128, # strongly affects final performance: larger is generally better, but too large overfits
          'min_data_in_leaf': 30,
          'objective': 'binary', # objective function
          'max_depth': -1,
          'learning_rate': 0.03,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction": 0.9,  # fraction of features sampled
          "bagging_freq": 1,
          "bagging_fraction": 0.8,
          "bagging_seed": 11,
#           "lambda_l1": 0.1,             # L1 regularization
          'lambda_l2': 0.001,     # L2 regularization
          "verbosity": -1,
          "nthread": -1,                # number of threads; -1 means all threads (more threads run faster)
          'metric': {'binary_logloss', 'auc'},  # evaluation metrics
          "random_state": 0, # random seed, keeps results consistent between runs
          }

In [49]:
# Unpack the dict so each entry becomes its own keyword argument.
# Passing `params=params` would hand the estimator a single keyword named
# "params" and leave every hyperparameter at its default.
a = lgb.LGBMClassifier(**params)

In [50]:
# NOTE(review): a bare name only displays the function object; no
# cross-validation is run by this cell.
cross_validate

TypeError: fit() missing 2 required positional arguments: 'X' and 'y'

In [38]:
# Train for up to 3000 rounds, logging metrics every 10 rounds on both the
# training and validation sets, and stop once the validation scores have not
# improved for 500 rounds.
res = lgb.train(
    params,
    train_set=train_data,
    num_boost_round=3000,
    valid_sets=[train_data, val_data],
    early_stopping_rounds=500,
    verbose_eval=10,
)

Training until validation scores don't improve for 500 rounds
[10]	training's auc: 0.975773	training's binary_logloss: 0.523261	valid_1's auc: 0.969951	valid_1's binary_logloss: 0.528475
[20]	training's auc: 0.977566	training's binary_logloss: 0.422421	valid_1's auc: 0.971022	valid_1's binary_logloss: 0.43174
[30]	training's auc: 0.978996	training's binary_logloss: 0.352699	valid_1's auc: 0.971853	valid_1's binary_logloss: 0.365715
[40]	training's auc: 0.980187	training's binary_logloss: 0.303057	valid_1's auc: 0.972678	valid_1's binary_logloss: 0.319498
[50]	training's auc: 0.981426	training's binary_logloss: 0.266996	valid_1's auc: 0.97332	valid_1's binary_logloss: 0.286961
[60]	training's auc: 0.982665	training's binary_logloss: 0.241354	valid_1's auc: 0.973837	valid_1's binary_logloss: 0.264853
[70]	training's auc: 0.983702	training's binary_logloss: 0.220404	valid_1's auc: 0.974161	valid_1's binary_logloss: 0.247597
[80]	training's auc: 0.985019	training's binary_logloss: 0.204147

Early stopping, best iteration is:
[169]	training's auc: 0.993759	training's binary_logloss: 0.12662	valid_1's auc: 0.976551	valid_1's binary_logloss: 0.197037


In [10]:
# Load the unlabeled test table and run it through the same preprocessing
# as the training features.
Xtest = pd.read_csv("Xtest.csv")
input_data = handle_Xdata(Xtest)

  interactivity=interactivity, compiler=compiler, result=result)


In [11]:
def replace_v(v):
    """Return ``v`` unchanged when it equals 0 or 1; map every other value to 1."""
    is_zero_or_one = (v == 1 or v == 0)
    return v if is_zero_or_one else 1
# Collapse every test-set FranchiseCode other than 0/1 to 1, then restore the
# category dtype that the list assignment discards.
# NOTE(review): in the visible cells this is applied to the test features only,
# not to the training features -- confirm that is intended.
input_data['FranchiseCode'] = [
    replace_v(code) for code in input_data['FranchiseCode'].values
]
input_data['FranchiseCode'] = input_data['FranchiseCode'].astype("category")

In [15]:
# Check the column dtypes of the processed test features.
input_data.dtypes

City                 category
State                category
Zip                  category
Bank                 category
BankState            category
NAICS                category
ApprovalFY           category
Term                    int64
NoEmp                   int64
NewExist             category
CreateJob               int64
RetainedJob             int64
FranchiseCode        category
UrbanRural           category
RevLineCr            category
LowDoc               category
DisbursementGross     float64
BalanceGross          float64
GrAppv                float64
SBA_Appv              float64
diff_days               int64
dtype: object

In [43]:
# Exploration only: list the attributes available on the trained Booster.
dir(res)

['_Booster__attr',
 '_Booster__boost',
 '_Booster__get_eval_info',
 '_Booster__higher_better_inner_eval',
 '_Booster__init_predictor',
 '_Booster__inner_eval',
 '_Booster__inner_predict',
 '_Booster__inner_predict_buffer',
 '_Booster__is_predicted_cur_iter',
 '_Booster__name_inner_eval',
 '_Booster__need_reload_eval_info',
 '_Booster__num_class',
 '_Booster__num_dataset',
 '_Booster__num_inner_eval',
 '_Booster__set_objective_to_none',
 '__class__',
 '__copy__',
 '__deepcopy__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_free_buffer',
 '_to_predictor',
 '_train_data_name',
 'add_valid',
 'attr',
 'best_iteration',
 'best_score',
 

In [48]:
# Importance score per feature, returned as a plain array without feature names.
res.feature_importance()

array([ 199,  825,   17,  564,  705,  610, 1015, 7507, 1199,  456,  361,
        686,   79,  595,  116,   11, 1567,    0,  927, 1745, 2279],
      dtype=int32)

In [45]:
# Exploration only: display the trained Booster object's repr.
res

<lightgbm.basic.Booster at 0x7fc6f65a74a8>

In [39]:
THRESHOLD = 0.5 # to be tuned

# Predicted values for the test rows; anything below THRESHOLD is labelled 0,
# everything else 1.
pred = res.predict(input_data)
pred_label = [0 if v<THRESHOLD else 1 for v in pred]

# One Id per prediction, numbered from 0. Deriving the count from `pred`
# (instead of a hard-coded 100000) keeps the frame valid for any test-set size.
# NOTE(review): assumes submission Ids are the row positions 0..n-1 -- confirm
# against the Id column of Xtest.
Id_list = list(range(len(pred)))
submit_df = pd.DataFrame({"Id":Id_list,
                         "ChargeOff":pred_label})

In [40]:
# Preview the submission table (Id, ChargeOff).
submit_df

Unnamed: 0,Id,ChargeOff
0,0,1
1,1,0
2,2,1
3,3,0
4,4,1
...,...,...
99995,99995,0
99996,99996,1
99997,99997,0
99998,99998,0


In [41]:
# Write the submission to the working directory, without the DataFrame index column.
submit_df.to_csv("submit_team666_gbdt128.csv",encoding='utf-8',index=False)

In [30]:
import os

In [31]:
# Exploration only: show the directory that the relative CSV paths resolve against.
os.getcwd()

'/Users/renxinyu/Documents/NUS/CS5228_DATA_MINING/Final Project/cs5228-2021s1_v1'