# In [3]:
"""读取数据以及导入包"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression as LR
import pandas as pd
import seaborn as sns
# Load the training data. Two locations are tried so the same notebook works
# on different machines: first the local Kaggle dataset folder, then the
# current working directory.
try:
    # Every backslash is escaped explicitly. The previous literal relied on
    # "\k" and "\G" being unrecognised escape sequences, which Python now
    # warns about; the resulting string value is unchanged.
    path = "E:\\kaggledatabase\\GiveMeSomeCredit\\"
    # index_col=0: use the first CSV column as the row index.
    data = pd.read_csv(path + "cs-training.csv", index_col=0)
except FileNotFoundError:
    # The Kaggle folder is not present here; fall back to the working directory.
    data = pd.read_csv("cs-training.csv", index_col=0)
"""#去除重复值"""
# Drop rows that are exact duplicates of an earlier row; inplace=True
# modifies `data` itself instead of returning a new frame.
data.drop_duplicates(inplace=True)
# Dropping rows leaves gaps in the index, so renumber it 0..n-1.
data.index = range(data.shape[0])
"""#使用均值填补“家属”缺失值"""
# Fill missing NumberOfDependents with the column mean truncated to an int.
# The filled column is assigned back explicitly: calling
# fillna(inplace=True) on a column selection is chained assignment, which is
# deprecated and does not update `data` under pandas copy-on-write.
data['NumberOfDependents'] = data['NumberOfDependents'].fillna(
    int(data['NumberOfDependents'].mean())
)
"""使用随机森林填补缺失值"""

def fill_missing_rf(X, y, to_fill, *, n_estimators=100, random_state=None):
    """Predict the missing values of one feature with a random forest.

    A random-forest regressor is trained on the rows where ``to_fill`` is
    present, using every other column of ``X`` plus the label ``y`` as
    inputs, and is then used to predict ``to_fill`` for the rows where it
    is missing. ``X`` itself is not modified.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix containing the column to impute.
    y : pandas.Series or array-like
        Labels, complete (no missing values) and aligned with ``X``'s index.
    to_fill : str
        Name of the column in ``X`` whose missing values are predicted.
    n_estimators : int, keyword-only, default 100
        Number of trees in the forest.
    random_state : int or None, keyword-only, default None
        Seed forwarded to the regressor; set it for reproducible output.

    Returns
    -------
    numpy.ndarray
        One predicted value per missing row, in the row order of ``X``.
        An empty float array when ``to_fill`` has no missing values.

    Raises
    ------
    ValueError
        If every value of ``to_fill`` is missing, so nothing can be learned.
    """
    target = X.loc[:, to_fill]
    known = target[target.notnull()]
    unknown = target[target.isnull()]

    # Nothing to impute: return early instead of asking the model to predict
    # on zero rows, which scikit-learn rejects.
    if unknown.empty:
        return np.empty(0, dtype=float)
    if known.empty:
        raise ValueError(
            "column {!r} has no observed values to learn from".format(to_fill)
        )

    # New design matrix: all remaining features plus the label as a column.
    features = pd.concat(
        [X.loc[:, X.columns != to_fill], pd.DataFrame(y)], axis=1
    )
    train_features = features.loc[known.index, :]
    predict_features = features.loc[unknown.index, :]

    # Imported lazily so the module does not need scikit-learn until a model
    # actually has to be fitted.
    from sklearn.ensemble import RandomForestRegressor

    forest = RandomForestRegressor(
        n_estimators=n_estimators, random_state=random_state
    )
    forest.fit(train_features, known)
    return forest.predict(predict_features)

# The label is SeriousDlqin2yrs; the features are every column after the first.
y = data["SeriousDlqin2yrs"]
X = data.iloc[:, 1:]

# Predict the missing MonthlyIncome values from the other columns.
y_pred = fill_missing_rf(X, y, "MonthlyIncome")

# Rows whose MonthlyIncome is still missing (computed once, used twice below).
income_missing = data["MonthlyIncome"].isnull()

# Report when the number of predictions matches the number of missing rows.
if y_pred.shape == data.loc[income_missing, "MonthlyIncome"].shape:
    print("预测与缺失对得上！")

# Overwrite the missing entries with the predictions.
data.loc[income_missing, "MonthlyIncome"] = y_pred

"""异常值处理"""
# Keep only rows with a non-zero age and fewer than 90 occurrences of
# NumberOfTimes90DaysLate, then renumber the index 0..n-1.
valid_rows = (data['age'] != 0) & (data['NumberOfTimes90DaysLate'] < 90)
data = data[valid_rows].reset_index(drop=True)

"""处理不平衡数据"""
# Feature matrix: every column after the first.
X = data.iloc[:, 1:]
# Labels: the first column.
y = data.iloc[:, 0]
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)  # fixed seed so the resampling is repeatable
# fit_resample replaces fit_sample, which has been removed from
# imbalanced-learn.
X, y = sm.fit_resample(X, y)
n_sample = X.shape[0]
# Count each class once instead of calling value_counts() per class.
class_counts = y.value_counts()
n_1_sample = class_counts[1]
n_0_sample = class_counts[0]
print('SMOTE上采样后：样本个数:{};违约的占比{:.2%},不违约的占比{:.2%}'.format(n_sample,
                                                n_1_sample/n_sample,
                                                n_0_sample/n_sample))
"""分出测试集与训练集"""
from sklearn.model_selection import train_test_split

# Wrap the resampled features and labels as DataFrames so they can be
# concatenated column-wise below.
X = pd.DataFrame(X)
y = pd.DataFrame(y)

# 70% training / 30% validation, with a fixed seed for a repeatable split.
X_train, X_vali, Y_train, Y_vali = train_test_split(
    X, y, test_size=0.3, random_state=420
)

# Training set: label column first, then the features, side by side; the
# index is renumbered and the original column names are restored.
model_data = pd.concat([Y_train, X_train], axis=1).reset_index(drop=True)
model_data.columns = data.columns

# Validation set, assembled the same way.
vali_data = pd.concat([Y_vali, X_vali], axis=1).reset_index(drop=True)
vali_data.columns = data.columns

# Store both sets separately (without the index) for the later steps.
model_data.to_csv("./model_data.csv", index=None)
vali_data.to_csv("./vali_data.csv", index=None)


# Output:
# 预测与缺失对得上！
# SMOTE上采样后：样本个数:278584;违约的占比50.00%,不违约的占比50.00%
