# 赛题理解

# EDA

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss

import category_encoders as ce
warnings.filterwarnings('ignore')

In [2]:
# 解决现实不出中文的问题
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus'] = False  #解决负数坐标显示问题
plt.rcParams['figure.figsize'] = (16, 4)   # 设置图形大小
plt.rcParams.update({"font.size":20})    #设置标题标注和字体大小 #此处必须添加此句代码方可改变标题字体大小
# 调用matplotlib.pyplot的绘图函数时，或者生成一个figure画布的时候，可以直接在python console里面生成图像。
%matplotlib inline  

In [3]:
# Load the training set and the "A" test set from the working directory.
train, testA = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'testA.csv'))

In [4]:
# Passing chunksize makes read_csv return an iterator over the file, so the
# amount of data handled per iteration is controlled by that parameter.
chunker = pd.read_csv("./train.csv", chunksize=8000)
for chunk in chunker:
    # Each piece is an ordinary DataFrame; print its type and its row count
    # on separate lines.
    print(type(chunk), len(chunk), sep="\n")

<class 'pandas.core.frame.DataFrame'>
8000
<class 'pandas.core.frame.DataFrame'>
8000
<class 'pandas.core.frame.DataFrame'>
8000
<class 'pandas.core.frame.DataFrame'>
8000
<class 'pandas.core.frame.DataFrame'>
8000
<class 'pandas.core.frame.DataFrame'>
8000
<class 'pandas.core.frame.DataFrame'>
8000
<class 'pandas.core.frame.DataFrame'>
8000
<class 'pandas.core.frame.DataFrame'>
8000
<class 'pandas.core.frame.DataFrame'>
8000
<class 'pandas.core.frame.DataFrame'>
8000
<class 'pandas.core.frame.DataFrame'>
8000
<class 'pandas.core.frame.DataFrame'>
8000
<class 'pandas.core.frame.DataFrame'>
8000
<class 'pandas.core.frame.DataFrame'>
8000
<class 'pandas.core.frame.DataFrame'>
8000
<class 'pandas.core.frame.DataFrame'>
8000
<class 'pandas.core.frame.DataFrame'>
8000
<class 'pandas.core.frame.DataFrame'>
8000
<class 'pandas.core.frame.DataFrame'>
8000
<class 'pandas.core.frame.DataFrame'>
8000
<class 'pandas.core.frame.DataFrame'>
8000
<class 'pandas.core.frame.DataFrame'>
8000
<class 'pan

# 特征工程

特征工程与EDA之间有部分重合，是因为特征工程会用到EDA的结果。

## 数据预处理

### 去除唯一值

In [5]:
# Drop features that hold at most one distinct value: they cannot help to
# separate samples.
distinct_counts = train.nunique()
uniq_value_feas = distinct_counts[distinct_counts <= 1].index.tolist()
data_train = train.drop(columns=uniq_value_feas)

### 填充缺失值

In [6]:
# Collect the missing-value counts of a DataFrame.
def get_missing_feas(data):
    """Return a dict mapping each column with missing values to their count.

    Columns that contain no missing value are left out of the result.
    """
    null_counts = data.isna().sum()
    return null_counts.loc[null_counts.gt(0)].to_dict()

In [7]:
# Missing-value count per column of the training data (data_train), shown as
# the cell output.
train_missing  = get_missing_feas(data_train)
train_missing

{'employmentTitle': 1,
 'employmentLength': 46799,
 'postCode': 1,
 'dti': 239,
 'pubRecBankruptcies': 405,
 'revolUtil': 531,
 'title': 1,
 'n0': 40270,
 'n1': 40270,
 'n2': 40270,
 'n2.1': 40270,
 'n4': 33239,
 'n5': 40270,
 'n6': 40270,
 'n7': 40270,
 'n8': 40271,
 'n9': 40270,
 'n10': 33239,
 'n11': 69752,
 'n12': 40270,
 'n13': 40270,
 'n14': 40270}

In [8]:
# Missing-value count per column of the test data (testA), shown as the cell
# output.
test_missing = get_missing_feas(testA)
test_missing

{'employmentLength': 11742,
 'dti': 61,
 'pubRecBankruptcies': 116,
 'revolUtil': 127,
 'n0': 10111,
 'n1': 10111,
 'n2': 10111,
 'n2.1': 10111,
 'n2.2': 10111,
 'n2.3': 10111,
 'n4': 8394,
 'n5': 10111,
 'n6': 10111,
 'n7': 10111,
 'n8': 10111,
 'n9': 10111,
 'n10': 8394,
 'n11': 17575,
 'n12': 10111,
 'n13': 10111,
 'n14': 10111}

In [9]:
def found_different_col(data1, data2):
    """Report the columns that appear in only one of two DataFrames.

    Both differences are printed and also returned, so the caller can use
    them programmatically.

    Args:
        data1: First DataFrame.
        data2: Second DataFrame.

    Returns:
        A 2-tuple ``(only_in_data1, only_in_data2)`` of column-name lists,
        each kept in the column order of the frame it comes from.
    """
    data1_col = data1.columns
    data2_col = data2.columns
    dif_col_more1 = [col for col in data1_col if col not in data2_col]
    dif_col_more2 = [col for col in data2_col if col not in data1_col]
    print('数据1比数据2多的列：', dif_col_more1)
    print('数据2比数据1多的列：', dif_col_more2)
    # Return the lists themselves: returning the print() calls would hand the
    # caller (None, None), because print() always returns None.
    return dif_col_more1, dif_col_more2

In [10]:
# Compare the column sets of the training and test data; the differences are
# printed by the function.
dif_col = found_different_col(train,testA)

数据1比数据2多的列： ['isDefault']
数据2比数据1多的列： ['n2.2', 'n2.3']


In [12]:
# employmentTitle, postCode and title each have a single missing value, which
# is negligible next to the ~800k rows, so those rows are simply deleted.
# This statement removes the rows whose employmentTitle is missing.
has_employment_title = data_train['employmentTitle'].notna()
data_train = data_train.loc[has_employment_title]

In [13]:
# Columns whose rows with missing values are dropped.
mode_feas = ["employmentTitle", "postCode", "title"]
# Remove every row that has a missing value in any of those columns, then
# rebuild the index (the old index is discarded).
data_train = data_train.dropna(subset=mode_feas).reset_index(drop=True)

## 异常值处理

## 数据分箱

## 特征交互

## 特征编码

## 特征筛选

# 训练模型