In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression as LR
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import toad

In [2]:
df = pd.read_csv('rankingcard.csv', index_col=0)
print(df.shape)
df.info()

(150000, 11)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 150000 entries, 1 to 150000
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   SeriousDlqin2yrs                      150000 non-null  int64  
 1   RevolvingUtilizationOfUnsecuredLines  150000 non-null  float64
 2   age                                   150000 non-null  int64  
 3   NumberOfTime30-59DaysPastDueNotWorse  150000 non-null  int64  
 4   DebtRatio                             150000 non-null  float64
 5   MonthlyIncome                         120269 non-null  float64
 6   NumberOfOpenCreditLinesAndLoans       150000 non-null  int64  
 7   NumberOfTimes90DaysLate               150000 non-null  int64  
 8   NumberRealEstateLoansOrLines          150000 non-null  int64  
 9   NumberOfTime60-89DaysPastDueNotWorse  150000 non-null  int64  
 10  NumberOfDependents                    146076 non-null  

In [5]:
df.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [6]:
# 检查各个特征的缺失情况
df.isnull().sum()/df.shape[0]

SeriousDlqin2yrs                        0.000000
RevolvingUtilizationOfUnsecuredLines    0.000000
age                                     0.000000
NumberOfTime30-59DaysPastDueNotWorse    0.000000
DebtRatio                               0.000000
MonthlyIncome                           0.198207
NumberOfOpenCreditLinesAndLoans         0.000000
NumberOfTimes90DaysLate                 0.000000
NumberRealEstateLoansOrLines            0.000000
NumberOfTime60-89DaysPastDueNotWorse    0.000000
NumberOfDependents                      0.026160
dtype: float64

In [7]:
# 均值填充
df["NumberOfDependents"].fillna(df["NumberOfDependents"].mean(),inplace=True)

In [8]:
# 再次检查各个特征的缺失情况
df.isnull().sum()/df.shape[0]

SeriousDlqin2yrs                        0.000000
RevolvingUtilizationOfUnsecuredLines    0.000000
age                                     0.000000
NumberOfTime30-59DaysPastDueNotWorse    0.000000
DebtRatio                               0.000000
MonthlyIncome                           0.198207
NumberOfOpenCreditLinesAndLoans         0.000000
NumberOfTimes90DaysLate                 0.000000
NumberRealEstateLoansOrLines            0.000000
NumberOfTime60-89DaysPastDueNotWorse    0.000000
NumberOfDependents                      0.000000
dtype: float64

In [9]:
X = df.iloc[:,1:]
y = df["SeriousDlqin2yrs"]

df.loc[:,"MonthlyIncome"].isnull().sum()

29731

In [10]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

def fill_missing_rf(X, y, fillcolumn):
    df = X.copy()
    # 待预测列
    fill_df = df.loc[:, fillcolumn]
    # 去除预测列，将剩下的列与标签组成数据集
    df = pd.concat([df.loc[:, df.columns != fillcolumn], pd.DataFrame(y)], axis=1)
    # 训练集和测试集
    y_train = fill_df[fill_df.notnull()]
    y_test = fill_df[fill_df.isnull()]
    X_train = df.iloc[y_train.index, :]
    X_test = df.iloc[y_test.index, :]
    # 预测
    rfr = RandomForestRegressor(n_estimators=100).fit(X_train, y_train)
    y_pred = rfr.predict(X_test)

    return y_pred

y_pred = fill_missing_rf(X,y,"MonthlyIncome")
y_pred.shape

(29731,)

In [11]:
# 填充原来的数据
df.loc[df.loc[:, 'MonthlyIncome'].isnull(), 'MonthlyIncome'] = y_pred
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   SeriousDlqin2yrs                      150000 non-null  int64  
 1   RevolvingUtilizationOfUnsecuredLines  150000 non-null  float64
 2   age                                   150000 non-null  int64  
 3   NumberOfTime30-59DaysPastDueNotWorse  150000 non-null  int64  
 4   DebtRatio                             150000 non-null  float64
 5   MonthlyIncome                         150000 non-null  float64
 6   NumberOfOpenCreditLinesAndLoans       150000 non-null  int64  
 7   NumberOfTimes90DaysLate               150000 non-null  int64  
 8   NumberRealEstateLoansOrLines          150000 non-null  int64  
 9   NumberOfTime60-89DaysPastDueNotWorse  150000 non-null  int64  
 10  NumberOfDependents                    150000 non-null  float64
dtype

In [12]:
df.to_csv("no_null_rankingcard.csv",index=False)

In [13]:
#描述性统计
df.describe([0.01,0.1,0.25,0.5,0.75,0.9,0.99]).T

Unnamed: 0,count,mean,std,min,1%,10%,25%,50%,75%,90%,99%,max
SeriousDlqin2yrs,150000.0,0.06684,0.249746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
RevolvingUtilizationOfUnsecuredLines,150000.0,6.048438,249.755371,0.0,0.0,0.002969,0.029867,0.154181,0.559046,0.981278,1.092956,50708.0
age,150000.0,52.295207,14.771866,0.0,24.0,33.0,41.0,52.0,63.0,72.0,87.0,109.0
NumberOfTime30-59DaysPastDueNotWorse,150000.0,0.421033,4.192781,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,98.0
DebtRatio,150000.0,353.005076,2037.818523,0.0,0.0,0.030874,0.175074,0.366508,0.868254,1267.0,4979.04,329664.0
MonthlyIncome,150000.0,5422.731349,13271.547939,0.0,0.0,0.18,1800.0,4416.0,7400.0,10782.0,23200.5,3008750.0
NumberOfOpenCreditLinesAndLoans,150000.0,8.45276,5.145951,0.0,0.0,3.0,5.0,8.0,11.0,15.0,24.0,58.0
NumberOfTimes90DaysLate,150000.0,0.265973,4.169304,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,98.0
NumberRealEstateLoansOrLines,150000.0,1.01824,1.129771,0.0,0.0,0.0,0.0,1.0,2.0,2.0,4.0,54.0
NumberOfTime60-89DaysPastDueNotWorse,150000.0,0.240387,4.155179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,98.0


In [14]:
df = df[df['age']>0]
df = df[df['NumberOfTime30-59DaysPastDueNotWorse']<90]
df = df[df['NumberOfTime60-89DaysPastDueNotWorse']<90]
df = df[df['NumberOfTimes90DaysLate']<90]
df.reset_index(drop=True, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149730 entries, 0 to 149729
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   SeriousDlqin2yrs                      149730 non-null  int64  
 1   RevolvingUtilizationOfUnsecuredLines  149730 non-null  float64
 2   age                                   149730 non-null  int64  
 3   NumberOfTime30-59DaysPastDueNotWorse  149730 non-null  int64  
 4   DebtRatio                             149730 non-null  float64
 5   MonthlyIncome                         149730 non-null  float64
 6   NumberOfOpenCreditLinesAndLoans       149730 non-null  int64  
 7   NumberOfTimes90DaysLate               149730 non-null  int64  
 8   NumberRealEstateLoansOrLines          149730 non-null  int64  
 9   NumberOfTime60-89DaysPastDueNotWorse  149730 non-null  int64  
 10  NumberOfDependents                    149730 non-null  float64
dtype

In [15]:
df['SeriousDlqin2yrs'].value_counts()

0    139851
1      9879
Name: SeriousDlqin2yrs, dtype: int64

In [16]:
df.head(5)
print(df.shape)

(149730, 11)


In [17]:
df['SeriousDlqin2yrs'].value_counts()

0    139851
1      9879
Name: SeriousDlqin2yrs, dtype: int64

In [18]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=20,n_jobs=-1)
# 返回已经上采样后的数据和标签
X_old = df.iloc[:,1:]
y_old = df["SeriousDlqin2yrs"]
X,y = sm.fit_resample(X_old,y_old)
# 结果转为dataframe
X = pd.DataFrame(X) 
y = pd.DataFrame(y) 
y.value_counts()

SeriousDlqin2yrs
0                   139851
1                   139851
dtype: int64

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=20) 

train_data = pd.concat([y_train,X_train], axis=1)
train_data.reset_index(drop=True, inplace=True) 

test_data = pd.concat([y_test, X_test], axis=1)
test_data.reset_index(drop=True, inplace=True)

train_data.to_csv('train_data.csv',index = False)
test_data.to_csv('test_data.csv',index = False)

In [20]:
train_data['SeriousDlqin2yrs'].value_counts()

1    105055
0    104721
Name: SeriousDlqin2yrs, dtype: int64

In [21]:
test_data['SeriousDlqin2yrs'].value_counts()

0    35130
1    34796
Name: SeriousDlqin2yrs, dtype: int64

In [None]:
# 接下来进行Toad的相关操作