In [2]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,roc_auc_score

In [3]:
# 构建含缺失值的示例数据集
data = pd.DataFrame({
    'MonthlyCharges': [29.9, np.nan, 50.0, 70.0, np.nan, 90.0, 110.0, 130.0],
    'tenure': [1, 2, 3, 4, 5, 6, 7, 8],
    'Churn': [1, 1, 0, 0, 0, 0, 0, 1]
})

In [4]:
X = data.drop("Churn",axis=1)
y = data['Churn']
print("原始数据（含NAN）：",X)

原始数据（含NAN）：    MonthlyCharges  tenure
0            29.9       1
1             NaN       2
2            50.0       3
3            70.0       4
4             NaN       5
5            90.0       6
6           110.0       7
7           130.0       8


# 方法1：sklearn决策树+缺失值处理

In [5]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
X_filled = imputer.fit_transform(X)

dt = DecisionTreeClassifier(
    criterion='gini',
    max_depth=3,  # 最大深度
    min_samples_split=2,    # 内部节点最少样本数
    min_samples_leaf=1,     # 叶子节点最少样本数
    random_state=42
)
dt.fit(X_filled,y)
y_prob = dt.predict_proba(X_filled)
print('【sklearn+填充指示器】:',X_filled.shape)
print("【sklearn+中位数填充】预测结果：\n")
print(dt.predict(X_filled))

【sklearn+填充指示器】: (8, 2)
【sklearn+中位数填充】预测结果：

[1 1 0 0 0 0 0 1]


# 方法2：填充+缺失指示器

In [6]:
from sklearn.impute import MissingIndicator
from sklearn.pipeline import FeatureUnion
# 合并原始特征（填充后）和缺失指示器
union = FeatureUnion([
    ('features',SimpleImputer(strategy='median')),
    ('indicators',MissingIndicator())
])
X_enhanced = union.fit_transform(X)
dt1 = DecisionTreeClassifier(criterion='gini',random_state=42)
dt1.fit(X_enhanced,y)
print('【sklearn+填充指示器】:',X_enhanced.shape)
print('预测结果:',dt1.predict(X_enhanced))

【sklearn+填充指示器】: (8, 3)
预测结果: [1 1 0 0 0 0 0 1]


# 方法3：XGBoost（原生支持缺失值）

In [7]:
dtrain = xgb.DMatrix(X, label=y,missing=np.nan) # 显式指定缺失值标识

params = {
    'objective':'binary:logistic',
    'eval_metric':'logloss',
    'max_depth':3,
    'learning_rate':0.1,
    'seed':42
}
model_xgb = xgb.train(params,dtrain,num_boost_round=10)

pred_xgb = model_xgb.predict(dtrain)
print('【XGBoost原生支持缺失】预测概率：',pred_xgb)
print('预测类别：',(pred_xgb>0.5).astype(int))

【XGBoost原生支持缺失】预测概率： [0.37629378 0.37629378 0.37629378 0.37629378 0.37629378 0.37629378
 0.37629378 0.37629378]
预测类别： [0 0 0 0 0 0 0 0]


# 方法 4：LightGBM支持NaN

In [9]:
train_data = lgb.Dataset(X,label=y)

params_lgb = {
    'objective':'binary',
    'metric':'binary_logloss',
    'max_depth':3,
    'learning_rate':0.1,
    'verbose':-1,
    'seed':42
}
model_lgb = lgb.train(params_lgb,train_data,num_boost_round=10)
pred_lgb = model_lgb.predict(X)
print('【LightGBM原生支持缺失】预测概率：',pred_lgb)
print('预测类别',(pred_lgb>0.5).astype(int))

【LightGBM原生支持缺失】预测概率： [0.375 0.375 0.375 0.375 0.375 0.375 0.375 0.375]
预测类别 [0 0 0 0 0 0 0 0]


# 决策树预测用户流失

In [8]:
from sklearn.tree import DecisionTreeClassifier,export_text
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [9]:
df = pd.read_csv('E:\\桌面\\数据分析项目实战\\data\\user_loss.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [10]:
df['TotalCharges']=df['TotalCharges'].replace('^\s*$',np.nan,regex=True).pipe(pd.to_numeric,errors = 'coerce').fillna(df['MonthlyCharges']*df['tenure'])

In [11]:
(df['TotalCharges']==' ').sum()

0

In [17]:
df_clean = df.copy()
categorical_cols = [
    'gender','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity', 'OnlineBackup', 'DeviceProtection','TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
    'PaperlessBilling', 'PaymentMethod'
]
df_encoded = pd.get_dummies(df_clean,columns=categorical_cols,drop_first=True)
bool_cols = df_encoded.select_dtypes(include=['bool']).columns
if len(bool_cols) > 0:
    df_encoded[bool_cols] = df_encoded[bool_cols].astype(int)
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 32 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   customerID                             7043 non-null   object 
 1   SeniorCitizen                          7043 non-null   int64  
 2   tenure                                 7043 non-null   int64  
 3   MonthlyCharges                         7043 non-null   float64
 4   TotalCharges                           7043 non-null   float64
 5   Churn                                  7043 non-null   object 
 6   gender_Male                            7043 non-null   int32  
 7   Partner_Yes                            7043 non-null   int32  
 8   Dependents_Yes                         7043 non-null   int32  
 9   PhoneService_Yes                       7043 non-null   int32  
 10  MultipleLines_No phone service         7043 non-null   int32  
 11  Mult

In [22]:
X = df_encoded.drop(['customerID','Churn'],axis=1)
y = (df_encoded['Churn']=='Yes').astype(int)
print(f"最终维度：{X.shape}")
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,stratify=y,random_state=42)
dt = DecisionTreeClassifier(max_depth=3,random_state=42,class_weight='balanced')
dt.fit(X_train,y_train)

print(export_text(dt,feature_names=X_train.columns.tolist()))

最终维度：(7043, 30)
|--- Contract_Two year <= 0.50
|   |--- Contract_One year <= 0.50
|   |   |--- InternetService_Fiber optic <= 0.50
|   |   |   |--- class: 1
|   |   |--- InternetService_Fiber optic >  0.50
|   |   |   |--- class: 1
|   |--- Contract_One year >  0.50
|   |   |--- StreamingMovies_Yes <= 0.50
|   |   |   |--- class: 0
|   |   |--- StreamingMovies_Yes >  0.50
|   |   |   |--- class: 0
|--- Contract_Two year >  0.50
|   |--- MonthlyCharges <= 103.33
|   |   |--- PaymentMethod_Electronic check <= 0.50
|   |   |   |--- class: 0
|   |   |--- PaymentMethod_Electronic check >  0.50
|   |   |   |--- class: 0
|   |--- MonthlyCharges >  103.33
|   |   |--- TotalCharges <= 7759.77
|   |   |   |--- class: 0
|   |   |--- TotalCharges >  7759.77
|   |   |   |--- class: 0

