In [58]:
import pandas as pd  # 导入pandas库，用于数据处理和分析
import numpy as np  # 导入numpy库，用于科学计算和数值操作
from sklearn.metrics import roc_auc_score  # 从sklearn.metrics库中导入roc_auc_score函数，用于计算ROC-AUC指标

In [59]:
train = pd.read_csv('./task-data/train.csv')
test = pd.read_csv('./task-data/test.csv')

In [60]:
test_label = pd.read_csv('./task-data/ground_truth.csv')

In [61]:
train.head(2)

Unnamed: 0,id,loan_amnt,term,int_rate,installment,grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,default,purpose,title,addr_state
0,0,12000,36 months,11.53,395.89,B,Sales Operations Manager,< 1 year,RENT,85000.0,Source Verified,0,debt_consolidation,Debt consolidation,CO
1,1,16000,60 months,14.65,377.71,F,Clerk Typist 1-2,1 year,MORTGAGE,47800.0,Source Verified,0,debt_consolidation,Debt consolidation,SC


In [62]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   10000 non-null  int64  
 1   loan_amnt            10000 non-null  int64  
 2   term                 10000 non-null  object 
 3   int_rate             10000 non-null  float64
 4   installment          10000 non-null  float64
 5   grade                10000 non-null  object 
 6   emp_title            9301 non-null   object 
 7   emp_length           9346 non-null   object 
 8   home_ownership       10000 non-null  object 
 9   annual_inc           10000 non-null  float64
 10  verification_status  10000 non-null  object 
 11  default              10000 non-null  int64  
 12  purpose              10000 non-null  object 
 13  title                9861 non-null   object 
 14  addr_state           10000 non-null  object 
dtypes: float64(3), int64(3), object(9)
me

In [56]:
train.describe(include='all')

Unnamed: 0,id,loan_amnt,term,int_rate,installment,grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,default,purpose,title,addr_state
count,10000.0,10000.0,10000,10000.0,10000.0,10000,9301,9346,10000,10000.0,10000,10000.0,10000,9861,10000
unique,,,2,,,7,5985,11,6,,3,,14,882,43
top,,,36 months,,,D,Teacher,10+ years,MORTGAGE,,Source Verified,,debt_consolidation,Debt consolidation,GA
freq,,,7592,,,1955,168,3281,5590,,3829,,5837,4952,642
mean,4999.5,14223.6125,,13.170193,431.989699,,,,,73788.82,,0.1923,,,
std,2886.89568,8663.482432,,4.7569,259.788371,,,,,70672.25,,0.394127,,,
min,0.0,900.0,,5.31,20.22,,,,,0.0,,0.0,,,
25%,2499.75,7700.0,,9.71,243.49,,,,,45000.0,,0.0,,,
50%,4999.5,12000.0,,12.73,371.28,,,,,63000.0,,0.0,,,
75%,7499.25,20000.0,,15.99,573.125,,,,,90000.0,,0.0,,,


In [57]:
from sklearn.preprocessing import LabelEncoder

for col in train.columns:
    if train[col].dtype == object:
        lbl = LabelEncoder().fit(list(train[col]) + list(test[col]))
        train[col] = lbl.transform(train[col])
        test[col] = lbl.transform(test[col])

In [49]:
train['home_ownership'].value_counts()

MORTGAGE    5590
RENT        3304
OWN         1102
OTHER          2
NONE           1
ANY            1
Name: home_ownership, dtype: int64

In [50]:
test['home_ownership'].value_counts()

RENT        4647
MORTGAGE    4267
OWN         1084
NONE           1
ANY            1
Name: home_ownership, dtype: int64

In [63]:
# 'months'替换为空字符串，然后将结果转换为整数类型。
train['term'] = train['term'].str.replace('months', '').astype(int)
test['term'] = test['term'].str.replace('months', '').astype(int)

# grade列的值映射为等级列表['G', 'F', 'E', 'D', 'C', 'B', 'A']
train['grade'] = train['grade'].apply(lambda x: ['G', 'F', 'E', 'D', 'C', 'B', 'A'].index(x))
test['grade'] = test['grade'].apply(lambda x: ['G', 'F', 'E', 'D', 'C', 'B', 'A'].index(x))

# 用于将emp_length列的年份值进行转换。根据不同的情况，将年份转换为对应的数值
def convert_year(year):
    if year is None:
        return np.nan
    
    if '<' in str(year):
        return 0
    elif '+' in str(year):
        return 10
    elif str(year) == 'nan':
        return np.nan
    elif len(str(year)) > 1:
        return int(str(year).replace(' years', '').replace(' year', ''))
    else:
        return np.nan

train['emp_length'] = train['emp_length'].map(convert_year)
test['emp_length'] = test['emp_length'].map(convert_year)

# 将train和test中的home_ownership列中的特定值进行替换。将'OTHER'、'NONE'和'ANY'替换为'MORTGAGE'。
train['home_ownership'] = train['home_ownership'].replace('OTHER', 'MORTGAGE')
train['home_ownership'] = train['home_ownership'].replace('NONE', 'MORTGAGE')
train['home_ownership'] = train['home_ownership'].replace('ANY', 'MORTGAGE')

test['home_ownership'] = test['home_ownership'].replace('OTHER', 'MORTGAGE')
test['home_ownership'] = test['home_ownership'].replace('NONE', 'MORTGAGE')
test['home_ownership'] = test['home_ownership'].replace('ANY', 'MORTGAGE')

# 将train和test中的purpose列中的特定值进行替换。将'wedding'、'renewable_energy'和'educational'替换为'debt_consolidation'。
for x in ['wedding', 'renewable_energy', 'educational']:
    train['purpose'] = train['purpose'].replace(x, 'debt_consolidation')
    test['purpose'] = test['purpose'].replace(x, 'debt_consolidation')
    
# 将train和test中的title列中只出现一次的值替换为'Other'。首先计算train['title']列中只出现一次的值，然后依次替换
outlier_title = train['title'].value_counts()[train['title'].value_counts() == 1].index
for x in outlier_title:
    train['title'] = train['title'].replace(x, 'Other')
    test['title'] = test['title'].replace(x, 'Other')

In [64]:
from sklearn.preprocessing import LabelEncoder
for col in ['emp_title', 'home_ownership', 'verification_status', 'purpose', 
                         'title', 'addr_state']:
    
    lbl = LabelEncoder().fit(list(train[col]) + list(test[col]))
    train[col] = lbl.transform(train[col])  # 对train中的当前列进行标签编码转换
    test[col] = lbl.transform(test[col])  # 对test中的当前列进行标签编码转换

In [67]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf.fit(
    train.drop(['default', 'id'], axis=1).fillna(0),
    train['default'],
)

# 测试AUC计算
roc_auc_score(test_label['default'], clf.predict_proba(test.drop('id', axis=1).fillna(0))[:, 1])

0.5505585761823524

In [70]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=25)
clf.fit(
    train.drop(['default', 'id'], axis=1).fillna(0),
    train['default'],
)

# 测试AUC计算
roc_auc_score(test_label['default'], clf.predict_proba(test.drop('id', axis=1).fillna(0))[:, 1])

0.6647793676210575

In [89]:
from lightgbm import LGBMClassifier

# 模型训练
clf = LGBMClassifier(max_depth=10, n_estimators=50)
clf.fit(
    train.drop(['default', 'id'], axis=1),
    train['default'],
    # categorical_feature=['emp_title', 'home_ownership', 'verification_status', 'purpose', 
    #                      'title', 'addr_state']
)

# 测试AUC计算
roc_auc_score(test_label['default'], clf.predict_proba(test.drop('id', axis=1))[:, 1])

0.701441218446277

In [91]:
from lightgbm import LGBMClassifier

# 模型训练
clf_lgb = LGBMClassifier(max_depth=11, n_estimators=250)
clf_lgb.fit(
    train.drop(['default', 'id'], axis=1),
    train['default'],
    categorical_feature=['emp_title', 'home_ownership', 'verification_status', 'purpose', 
                         'title', 'addr_state']
)

# 测试AUC计算
roc_auc_score(test_label['default'], clf_lgb.predict_proba(test.drop('id', axis=1))[:, 1])



0.6941916939282815

In [20]:
from catboost import CatBoostClassifier 

# 模型训练
clf = CatBoostClassifier(max_depth=5, n_estimators=200)
clf.fit(
    train.drop(['default', 'id'], axis=1),
    train['default'],
)

# 测试AUC计算
roc_auc_score(test_label['default'], clf.predict_proba(test.drop('id', axis=1))[:, 1])

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


Learning rate set to 0.120473
0:	learn: 0.6297756	total: 4.19ms	remaining: 835ms
1:	learn: 0.5852960	total: 8.77ms	remaining: 868ms
2:	learn: 0.5495187	total: 13.2ms	remaining: 864ms
3:	learn: 0.5256593	total: 16.8ms	remaining: 822ms
4:	learn: 0.5051292	total: 20.4ms	remaining: 796ms
5:	learn: 0.4904332	total: 28.5ms	remaining: 920ms
6:	learn: 0.4789683	total: 33.4ms	remaining: 922ms
7:	learn: 0.4694409	total: 42.7ms	remaining: 1.02s
8:	learn: 0.4619165	total: 47.8ms	remaining: 1.01s
9:	learn: 0.4557965	total: 51.6ms	remaining: 980ms
10:	learn: 0.4507077	total: 55.2ms	remaining: 949ms
11:	learn: 0.4466052	total: 58.6ms	remaining: 919ms
12:	learn: 0.4433313	total: 61.8ms	remaining: 890ms
13:	learn: 0.4412009	total: 65.3ms	remaining: 867ms
14:	learn: 0.4387951	total: 68.8ms	remaining: 848ms
15:	learn: 0.4368940	total: 72.1ms	remaining: 829ms
16:	learn: 0.4356078	total: 75.9ms	remaining: 817ms
17:	learn: 0.4338879	total: 80.1ms	remaining: 810ms
18:	learn: 0.4325164	total: 83.4ms	remaining

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


0.7099427627322975

In [92]:
from catboost import CatBoostClassifier 

# 模型训练
clf1 = CatBoostClassifier(max_depth=5, n_estimators=500, random_seed=100)
clf1.fit(
    train.drop(['default', 'id'], axis=1),
    train['default'],
    cat_features=[5,7,9, 10]
)

# 模型是否能使用多核？
# 线性模型、深度学习模型 可以使用多核 
# lightgbm、catboost、xgboost 可以使用多核
# sklearn 决策树、随机森林 不能使用多核

# 测试AUC计算
roc_auc_score(test_label['default'], clf1.predict_proba(test.drop('id', axis=1))[:, 1])

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


Learning rate set to 0.051997
0:	learn: 0.6664978	total: 54.1ms	remaining: 27s
1:	learn: 0.6433883	total: 69.7ms	remaining: 17.4s
2:	learn: 0.6203407	total: 95.5ms	remaining: 15.8s
3:	learn: 0.6007294	total: 121ms	remaining: 15.1s
4:	learn: 0.5844673	total: 128ms	remaining: 12.7s
5:	learn: 0.5707289	total: 140ms	remaining: 11.5s
6:	learn: 0.5568666	total: 159ms	remaining: 11.2s
7:	learn: 0.5464453	total: 170ms	remaining: 10.4s
8:	learn: 0.5371010	total: 192ms	remaining: 10.5s
9:	learn: 0.5271063	total: 232ms	remaining: 11.4s
10:	learn: 0.5177997	total: 262ms	remaining: 11.6s
11:	learn: 0.5104599	total: 280ms	remaining: 11.4s
12:	learn: 0.5040027	total: 300ms	remaining: 11.2s
13:	learn: 0.4996506	total: 311ms	remaining: 10.8s
14:	learn: 0.4932673	total: 391ms	remaining: 12.6s
15:	learn: 0.4869122	total: 409ms	remaining: 12.4s
16:	learn: 0.4818863	total: 426ms	remaining: 12.1s
17:	learn: 0.4785413	total: 441ms	remaining: 11.8s
18:	learn: 0.4738891	total: 465ms	remaining: 11.8s
19:	learn:

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


0.7175194387518076

In [94]:
from catboost import CatBoostClassifier 

# 模型训练
clf2 = CatBoostClassifier(max_depth=5, n_estimators=500, random_seed=200)
clf2.fit(
    train.drop(['default', 'id'], axis=1),
    train['default'],
    cat_features=[5,7,9, 10]
)

# 模型是否能使用多核？
# 线性模型、深度学习模型 可以使用多核 
# lightgbm、catboost、xgboost 可以使用多核
# sklearn 决策树、随机森林 不能使用多核

# 测试AUC计算
roc_auc_score(test_label['default'], clf2.predict_proba(test.drop('id', axis=1))[:, 1])

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


Learning rate set to 0.051997
0:	learn: 0.6670847	total: 23.9ms	remaining: 11.9s
1:	learn: 0.6432030	total: 38.7ms	remaining: 9.63s
2:	learn: 0.6211522	total: 51.2ms	remaining: 8.48s
3:	learn: 0.6013435	total: 63.4ms	remaining: 7.87s
4:	learn: 0.5835132	total: 75ms	remaining: 7.42s
5:	learn: 0.5679213	total: 95.4ms	remaining: 7.86s
6:	learn: 0.5541948	total: 164ms	remaining: 11.5s
7:	learn: 0.5419520	total: 194ms	remaining: 11.9s
8:	learn: 0.5339371	total: 222ms	remaining: 12.1s
9:	learn: 0.5254609	total: 242ms	remaining: 11.9s
10:	learn: 0.5163711	total: 258ms	remaining: 11.5s
11:	learn: 0.5094000	total: 272ms	remaining: 11s
12:	learn: 0.5018661	total: 288ms	remaining: 10.8s
13:	learn: 0.4956816	total: 307ms	remaining: 10.7s
14:	learn: 0.4900293	total: 382ms	remaining: 12.3s
15:	learn: 0.4846990	total: 412ms	remaining: 12.5s
16:	learn: 0.4796424	total: 437ms	remaining: 12.4s
17:	learn: 0.4755464	total: 462ms	remaining: 12.4s
18:	learn: 0.4725757	total: 473ms	remaining: 12s
19:	learn: 

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


0.7175745398662927

In [95]:
roc_auc_score(test_label['default'], 
              clf1.predict_proba(test.drop('id', axis=1))[:, 1] * 0.5 + clf2.predict_proba(test.drop('id', axis=1))[:, 1] * 0.5)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


0.718534838937442

In [38]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_validate
# 模型训练
clf_lgb = LGBMClassifier(max_depth=10, n_estimators=50, learning_rate=0.05)

cv = cross_validate(
    clf_lgb,
    train.drop(['default', 'id'], axis=1),
    train['default'],
    # categorical_feature=['emp_title', 'home_ownership', 'verification_status', 'purpose', 
    #                      'title', 'addr_state'],
    return_estimator=True
)

# 测试AUC计算
# roc_auc_score(test_label['default'], clf_lgb.predict_proba(test.drop('id', axis=1))[:, 1])

In [41]:
cv

{'fit_time': array([0.23169899, 0.08210301, 0.21644616, 0.0629108 , 0.09427023]),
 'score_time': array([0.00626421, 0.00749922, 0.00465083, 0.00495386, 0.00457096]),
 'estimator': [LGBMClassifier(learning_rate=0.05, max_depth=10, n_estimators=50),
  LGBMClassifier(learning_rate=0.05, max_depth=10, n_estimators=50),
  LGBMClassifier(learning_rate=0.05, max_depth=10, n_estimators=50),
  LGBMClassifier(learning_rate=0.05, max_depth=10, n_estimators=50),
  LGBMClassifier(learning_rate=0.05, max_depth=10, n_estimators=50)],
 'test_score': array([0.8075, 0.8105, 0.8155, 0.817 , 0.8085])}