In [1]:
import pandas as pd
import numpy as np

import lightgbm as lgb
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.font_manager import _rebuild
import pprint

from sklearn.model_selection import cross_val_score,KFold,GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor


from sklearn.metrics import mean_squared_error

# from sklearn.metrics import mean_squared_error
#    指的是调用一个计算MES（平均方差的函数）
# lightgbm：
# xgboost：

# 显示所有列
pd.set_option('display.max_columns', None)

# 显示所有行
pd.set_option('display.max_rows', None)

In [2]:
# 导入数据
train_data = pd.read_csv("D:\\program_tools\\jupyter\\tianchi\\happiness_train_abbr.csv", encoding='ISO-8859-1')

test_data = pd.read_csv("D:\\program_tools\\jupyter\\tianchi\\happiness_test_abbr.csv", encoding='ISO-8859-1')

# 1. 数据清理

In [3]:
# 现将负值赋值为Nan   
cols = list(train_data.columns)
del cols[cols.index('survey_time')]  # 去掉时间行
for name in cols:
    train_data.loc[train_data[name]<0,name] = np.nan

# 将happiness中的异常值所在的行删除
train_data.dropna(subset=["happiness"], inplace=True, axis=0)
# 将连续型变量用0代替
train_data[['family_m', 'income', 'family_income', 'house']].fillna(0)
# 将离散型变量用中位数代替
train_data.fillna(train_data.median(), inplace=True)
# 查看处理后的数据
train_data[['family_m', 'income', 'family_income', 'house']].head(10)

Unnamed: 0,family_m,income,family_income,house
0,2.0,20000.0,60000.0,1.0
1,3.0,20000.0,40000.0,1.0
2,3.0,2000.0,8000.0,1.0
3,3.0,6420.0,12000.0,1.0
4,4.0,20000.0,40000.0,1.0
5,2.0,5000.0,5000.0,1.0
6,3.0,20000.0,40000.0,1.0
7,2.0,20000.0,40000.0,1.0
8,1.0,1600.0,1600.0,1.0
9,5.0,60000.0,60000.0,1.0


In [4]:
cols = list(train_data.columns)

del cols[cols.index("happiness")]# 测试集合中没有happiness，
del cols[cols.index('survey_time')]  # 去掉时间行
for name in cols:
    test_data.loc[test_data[name]<0,name] = np.nan
# 因为测试集中不存在异常值，所以无需处理
# 将连续型变量用0代替
test_data[['family_m', 'income', 'family_income', 'house']].fillna(0)
# 将离散型变量用中位数代替
test_data.fillna(test_data.median(), inplace=True)
# test_data[['family_m', 'income', 'family_income', 'house']].head(10)


#  2.特征列处理

In [5]:
d = {1:0,2:0,3:0,4:1,5:1}  # 创建为一个新的二分类特征happiness_
train_data['happiness_new'] = train_data.happiness.map(d)

In [6]:
# 训练集重新分类
train_data.province.replace([1,2,6,8,12,13,21,22,24,26,28,29,31],1,inplace = True)
train_data.province.replace([5,9,16,17,23,30],2,inplace = True)
train_data.province.replace([3,4,7,10,11,15,18,19,27],3,inplace = True)


# 测试集重新分类
test_data.province.replace([1,2,6,8,12,13,21,22,24,26,28,29,31],1,inplace = True)
test_data.province.replace([5,9,16,17,23,30],2,inplace = True)
test_data.province.replace([3,4,7,10,11,15,18,19,27],3,inplace = True)

train_data.province.value_counts()

1.0    3874
3.0    2724
2.0    1390
Name: province, dtype: int64

In [7]:
# edu的重分类
train_data.edu.replace([1,2,3,4,5,6], 1, inplace=True)
train_data.edu.replace([7,8,9,10,11,12], 2, inplace=True)
train_data.edu.replace([13], 3, inplace=True)
train_data.edu.replace([14], 4, inplace=True)


test_data.edu.replace([1,2,3,4,5,6], 1, inplace=True)
test_data.edu.replace([7,8,9,10,11,12], 2, inplace=True)
test_data.edu.replace([13], 3, inplace=True)
test_data.edu.replace([14], 4, inplace=True)

In [8]:
# 其他党派，人数太少，合并于4
train_data.political.replace([3.0], 4.0, inplace=True)
test_data.political.replace([3.0], 4.0, inplace=True)

In [9]:
# 新增年龄列
def age(data):
    data['survey_time'] = pd.to_datetime(data['survey_time'])
    data.survey_time = pd.DatetimeIndex(data.survey_time).year
    data['age']=data.survey_time-data.birth
    return data
train_data=age(train_data)
test_data=age(test_data)

In [10]:
# 分类处理,将10种人中结果相近的合并为同一个集合

train_data["class"].replace([1,2], 1, inplace=True) # 第一类 class={1,2}
train_data["class"].replace([3], 2, inplace=True)  # 第一类 class={3}
train_data["class"].replace([4], 3, inplace=True)  # 第一类 class={4}
train_data["class"].replace([5,6,7,8,9,10], 4, inplace=True)  # 第一类 class={5,6,7,8,9,10}

test_data["class"].replace([1,2], 1, inplace=True) # 第一类 class={1,2}
test_data["class"].replace([3], 2, inplace=True)  # 第一类 class={3}
test_data["class"].replace([4], 3, inplace=True)  # 第一类 class={4}
test_data["class"].replace([5,6,7,8,9,10], 4, inplace=True)  # 第一类 class={5,6,7,8,9,10}

In [11]:
# 将大于四套房子的人归为含有四套
list_temp = set(train_data[train_data.house>4].house) 
list_temp
train_data.house.replace(list(list_temp), 4, inplace=True)
train_data.house.value_counts()

1.0    6387
2.0     907
0.0     510
3.0     133
4.0      51
Name: house, dtype: int64

In [12]:
# 将数据量少的数据归为第一类
train_data.hukou.replace([3,6,7,8], 1, inplace=True)
test_data.hukou.replace([3,6,7,8], 1, inplace=True)

In [13]:
# 对人口分布进行重分布
train_data.family_m.replace([9,11,50,13], 8, inplace=True)
test_data.family_m.replace([9,11,50,13], 8, inplace=True)

# 3.特征工程

### 3.1 获得训练集的标签值

In [14]:
train_y = train_data.happiness
train_y.shape

(7988,)

### 3.2 特征列的扩展与删除

In [15]:
train_data.shape

(7988, 44)

#### 3.2.1特征组合

In [16]:
# 特征组合,对重点指标进行相乘处理
Ordered_data_feature=['health', 'health_problem', 
                      'depression','relax','edu',
                      'marital','family_status',
                    'class','equity','status_peer',
                      'view','inc_ability','status_3_before']

def Ordered_data_feature_merge(data, features=Ordered_data_feature):
    for i in features:
        for j in features:
            if i != j:
                data[i+"*"+j] = data[i]*data[j]
            else:
                pass
    return data

In [17]:
train_data = Ordered_data_feature_merge(train_data)
test_data = Ordered_data_feature_merge(test_data)

In [18]:
print(train_data.shape) # 修改特征列之后

(7988, 200)


#### 3.2.2 特征列的去重

In [19]:
# 因为采用的是笛卡尔积，需要删除重复的特征列
# 比如a*b   和  b*a是相同的
# 因此需要去重
# 新增列是从45列开始,
def drop_dup(data=train_data):
    """
    思路：
        1. 通过列的平均值进行评比
        2. 只要我们不是同一列，并且平均值相等，就表示这是两个对称的向量积
        3. 保存该列的名字到列表中
    Args：
        data： 需要进行去重的数据集
    Returns:
        drop_cols：返回需要删除的列名
    """
    drop_cols=[] # 保存需要去重的列
    for column in data.columns[44:]:
        for temp in data.columns[44:]:
            if column != temp:
                if data[column].mean() == data[temp].mean():
                    drop_cols.append(temp)
    
    # 此时结果存在156列，但是存在对称向量如a*b，b*a
    # 开始进一步去重
    index_s = [] # 保存所有需要删除的列的下标位置
    temp=0
    d = 12# 步长
    for i in range(1,13):
        for j in range(1,i+1):
            temp = i*d+j
            index_s.append(temp)
    drop_cols_result = []
    
    # 找到需要删除的列名
    for i in index_s:
        drop_cols_result.append(drop_cols[i-1])
    # 计算结构为
    return drop_cols_result 

In [20]:
drop_columns = drop_dup()

# 删除数据集合中的列
train_data.drop(columns=drop_columns, inplace=True)
test_data.drop(columns=drop_columns, inplace=True)

#### 3.2.3无用特征列的删除

In [21]:
#     根据第九个板块找出的数据异常
#     work_status        2951 non-null float64
#     work_yr            2951 non-null float64
#     work_type          2951 non-null float64
#     work_manage        2951 non-null float64
#     family_income      7999 non-null float64  # 存在0的值
# 删除相关性小，数据量小的特征列

# 已经利用非数值型列survey_time 和birth 进行了年龄的计算
# 因此这两列的使用价值无效，可以直接删除
def feature_drop(data):
    data.drop(columns=["id"], inplace=True)
    data.drop(columns=["city","county"], inplace=True)
    data.drop(columns=["work_status","work_yr","work_type","work_manage"], inplace=True)
    data.drop(columns=["survey_time","birth"], inplace=True)
    return data

In [22]:
train_data = feature_drop(train_data)
test_data = feature_drop(test_data)

In [23]:
train_data.head(5) # 查看操作结果

Unnamed: 0,happiness,survey_type,province,gender,nationality,religion,religion_freq,edu,income,political,floor_area,height_cm,weight_jin,health,health_problem,depression,hukou,socialize,relax,learn,equity,class,work_exper,family_income,family_m,family_status,house,car,marital,status_peer,status_3_before,view,inc_ability,happiness_new,age,health_problem*health,depression*health,depression*health_problem,relax*health,relax*health_problem,relax*depression,edu*health,edu*health_problem,edu*depression,edu*relax,marital*health,marital*health_problem,marital*depression,marital*relax,marital*edu,family_status*health,family_status*health_problem,family_status*depression,family_status*relax,family_status*edu,family_status*marital,class*health,class*health_problem,class*depression,class*relax,class*edu,class*marital,class*family_status,equity*health,equity*health_problem,equity*depression,equity*relax,equity*edu,equity*marital,equity*family_status,equity*class,status_peer*health,status_peer*health_problem,status_peer*depression,status_peer*relax,status_peer*edu,status_peer*marital,status_peer*family_status,status_peer*class,status_peer*equity,view*health,view*health_problem,view*depression,view*relax,view*edu,view*marital,view*family_status,view*class,view*equity,view*status_peer,inc_ability*health,inc_ability*health_problem,inc_ability*depression,inc_ability*relax,inc_ability*edu,inc_ability*marital,inc_ability*family_status,inc_ability*class,inc_ability*equity,inc_ability*status_peer,inc_ability*view,status_3_before*health,status_3_before*health_problem,status_3_before*depression,status_3_before*relax,status_3_before*edu,status_3_before*marital,status_3_before*family_status,status_3_before*class,status_3_before*equity,status_3_before*status_peer,status_3_before*view,status_3_before*inc_ability
0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,20000.0,1.0,45.0,176.0,155.0,3.0,2.0,5.0,5.0,2.0,4.0,3.0,3.0,2.0,1.0,60000.0,2.0,2.0,1.0,2.0,3.0,3.0,2.0,4.0,3.0,1,56.0,6.0,15.0,10.0,12.0,8.0,20.0,6.0,4.0,10.0,8.0,9.0,6.0,15.0,12.0,6.0,6.0,4.0,10.0,8.0,4.0,6.0,6.0,4.0,10.0,8.0,4.0,6.0,4.0,9.0,6.0,15.0,12.0,6.0,9.0,6.0,6.0,9.0,6.0,15.0,12.0,6.0,9.0,6.0,6.0,9.0,12.0,8.0,20.0,16.0,8.0,12.0,8.0,8.0,12.0,12.0,9.0,6.0,15.0,12.0,6.0,9.0,6.0,6.0,9.0,9.0,12.0,6.0,4.0,10.0,8.0,4.0,6.0,4.0,4.0,6.0,6.0,8.0,6.0
1,4.0,2.0,3.0,1.0,1.0,1.0,1.0,2.0,20000.0,1.0,110.0,170.0,110.0,5.0,4.0,3.0,1.0,2.0,4.0,3.0,3.0,4.0,1.0,40000.0,3.0,4.0,1.0,2.0,1.0,1.0,1.0,4.0,2.0,1,23.0,20.0,15.0,12.0,20.0,16.0,12.0,10.0,8.0,6.0,8.0,5.0,4.0,3.0,4.0,2.0,20.0,16.0,12.0,16.0,8.0,4.0,20.0,16.0,12.0,16.0,8.0,4.0,16.0,15.0,12.0,9.0,12.0,6.0,3.0,12.0,12.0,5.0,4.0,3.0,4.0,2.0,1.0,4.0,4.0,3.0,20.0,16.0,12.0,16.0,8.0,4.0,16.0,16.0,12.0,4.0,10.0,8.0,6.0,8.0,4.0,2.0,8.0,8.0,6.0,2.0,8.0,5.0,4.0,3.0,4.0,2.0,1.0,4.0,4.0,3.0,1.0,4.0,2.0
2,4.0,2.0,1.0,2.0,1.0,0.0,3.0,1.0,2000.0,1.0,120.0,160.0,122.0,4.0,4.0,5.0,1.0,3.0,4.0,2.0,4.0,4.0,2.0,8000.0,3.0,3.0,1.0,2.0,3.0,2.0,1.0,4.0,2.0,1,48.0,16.0,20.0,20.0,16.0,16.0,20.0,4.0,4.0,5.0,4.0,12.0,12.0,15.0,12.0,3.0,12.0,12.0,15.0,12.0,3.0,9.0,16.0,16.0,20.0,16.0,4.0,12.0,12.0,16.0,16.0,20.0,16.0,4.0,12.0,12.0,16.0,8.0,8.0,10.0,8.0,2.0,6.0,6.0,8.0,8.0,16.0,16.0,20.0,16.0,4.0,12.0,12.0,16.0,16.0,8.0,8.0,8.0,10.0,8.0,2.0,6.0,6.0,8.0,8.0,4.0,8.0,4.0,4.0,5.0,4.0,1.0,3.0,3.0,4.0,4.0,2.0,4.0,2.0
3,5.0,2.0,3.0,2.0,1.0,1.0,1.0,1.0,6420.0,1.0,78.0,163.0,170.0,4.0,4.0,4.0,1.0,2.0,4.0,4.0,4.0,4.0,4.0,12000.0,3.0,3.0,1.0,1.0,7.0,2.0,1.0,3.0,2.0,1,72.0,16.0,16.0,16.0,16.0,16.0,16.0,4.0,4.0,4.0,4.0,28.0,28.0,28.0,28.0,7.0,12.0,12.0,12.0,12.0,3.0,21.0,16.0,16.0,16.0,16.0,4.0,28.0,12.0,16.0,16.0,16.0,16.0,4.0,28.0,12.0,16.0,8.0,8.0,8.0,8.0,2.0,14.0,6.0,8.0,8.0,12.0,12.0,12.0,12.0,3.0,21.0,9.0,12.0,12.0,6.0,8.0,8.0,8.0,8.0,2.0,14.0,6.0,8.0,8.0,4.0,6.0,4.0,4.0,4.0,4.0,1.0,7.0,3.0,4.0,4.0,2.0,3.0,2.0
4,4.0,1.0,3.0,2.0,1.0,1.0,1.0,2.0,20000.0,2.0,70.0,165.0,110.0,5.0,5.0,3.0,2.0,4.0,3.0,4.0,2.0,1.0,6.0,40000.0,4.0,3.0,1.0,1.0,1.0,3.0,2.0,3.0,2.0,1,21.0,25.0,15.0,15.0,15.0,15.0,9.0,10.0,10.0,6.0,6.0,5.0,5.0,3.0,3.0,2.0,15.0,15.0,9.0,9.0,6.0,3.0,5.0,5.0,3.0,3.0,2.0,1.0,3.0,10.0,10.0,6.0,6.0,4.0,2.0,6.0,2.0,15.0,15.0,9.0,9.0,6.0,3.0,9.0,3.0,6.0,15.0,15.0,9.0,9.0,6.0,3.0,9.0,3.0,6.0,9.0,10.0,10.0,6.0,6.0,4.0,2.0,6.0,2.0,4.0,6.0,6.0,10.0,10.0,6.0,6.0,4.0,2.0,6.0,2.0,4.0,6.0,6.0,4.0


In [24]:
# 拆分数据集
train_x=train_data.drop(columns=['happiness','happiness_new'])

#### 3.2.4 特征列的筛选
- 使用基于随机森林的嵌入法来计算特征的重要程度，去除相关性较弱的特征。

In [None]:
clf = RandomForestRegressor()
clf = clf.fit(train_x, train_y) # 通过随机森林训练数据
model = SelectFromModel(clf,prefit=True,threshold="median")
# 通过给定字符串参数使用内置的启发式方法寻找合适的阈值 median 

In [None]:
feature_bool = model.get_support()  # 经过随机森林筛选之后，返回的相对较好的特征
feature_bool

In [27]:
# feature_bool保存了index，为True的显示
def get_feature_names(data):
    indexs=[]
    feature_names=[]
    # 找到所有被选的特征列所在的下标
    for index,value in enumerate(feature_bool):
        if value == True:
            indexs.append(index)
    # 通过下标获取特征列的名字
    for i in indexs:
        feature_names.append(data.columns[i])
    return feature_names

In [28]:
feature_columns = get_feature_names(train_data)
train_x = train_x[feature_columns]
test_data = test_data[feature_columns]

In [29]:
print(len(train_x.columns))  # 筛选特征列之后的总列数
print(len(test_data.columns))

# 4. 建模评分

In [30]:
# # 数据样例
# data = array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6,])
# # 准备交叉验证
# kfold = KFold(n_splits=5, shuffle=True, random_state=3)
# # 枚举一下每一个子模型
# for train , test in kfold.split(data):
#     print(train)
#     print(test)

In [31]:
""" KFold方法详解：
    
    n_splits： 需要分成组的个数
    shuffle：   对数据进行洗牌
    random_state: 随机种子
"""

    
""" cross_val_score方法详解:

    clf: 使用的模型
    train_x：训练特征样本
    train_y： 训练标签
"""

k_folds =  10  # 指定分组个数
def get_rmse(model,train_x=train_x,train_y=train_y):
    """通过交叉验证，计算所有交叉组合的均值方差

    """
    kfold = KFold(n_splits=k_folds, shuffle=True, random_state=10)
    score = cross_val_score(clf, train_x, train_y, scoring="neg_mean_squared_error", cv=kfold)
    return score

In [32]:
models = []
models.append(["Ridge",Ridge()])
models.append(["SVR",SVR(gamma="auto")])
models.append(["RandomForestRegressor",RandomForestRegressor(n_estimators=100)])
models.append(["GradientBoostingRegressor", GradientBoostingRegressor()]) 



for model_name,clf in models:
    print("{}{}".format(model_name,":"), end=" ")
    print(np.mean(get_rmse(clf)))


Ridge: -0.48258847550530265
SVR: -0.6704540937317058
RandomForestRegressor: -0.4836113663382487
GradientBoostingRegressor: -0.47426235582315057


- 通过均值方差的评比可以看出GradientBoostingRegressor的效果相对最好
- 因此选同GradientBoostingRegressor，进行参数调优

# 5. 参数调优

In [33]:
""" GBR参数详解：
    learning_rate: 学习速率
    n_estimators: 指GBR学习的算法（弱学习器）的数量，默认值为 100
    max_depth: 每一个弱学习器都是一个决策树，max_depth决定了树生成的结点数目
                - 更多的叶子节点数会产生过拟合的现象
                - 默认值是 3
    loss: 选择损失函数，默认值为ls(least squres)最小二乘法
    min_samples_leaf: 可以划分内部结点的最小样本数， 默认为2
    min_samples_split: 叶结点所需的最小样本数
    max_features：单个随机森林允许使用的特征数量额最大数量
        - Auto/None ：简单地选取所有特征，每颗树都可以利用他们。
                       这种情况下，每颗树都没有任何的限制。
        - sqrt ：此选项是每颗子树可以利用总特征数的平方根个。
                 例如，如果变量（特征）的总数是100，所以每颗子树只能取其中的10个。
        - 0.2 ： 此选项允许每个随机森林的子树可以利用变量（特征）数的20％
    subsample: 样本采样率
    
"""
params = {
    'learning_rate':0.01, 
    'n_estimators':600,  
    'max_depth':6,
    'min_samples_leaf':18,
    'min_samples_split':180,
    'max_features':'sqrt', 
    'subsample':0.8, 
}

In [34]:
# 这里使用GridSearchCV 来进行参数的自动调节  -- 适用于小数量集合

#### 5.1 调整迭代次数（n_estimators）
- 本小节主要负责调整n_estimators，初始学习速率设置为0.1，用于快速迭代

In [35]:
reg_params = {
    'learning_rate':0.1, 
    'n_estimators':600,  
    'max_depth':8,
    'min_samples_leaf':20,
    'min_samples_split':300,
    'max_features':'sqrt', 
    'subsample':0.8, 
    'random_state':10
}

In [36]:
model_gbdt = GradientBoostingRegressor(**reg_params) # 根据指定参数创建GBR模型
params_1 = {'n_estimators': range(100, 200, 10)}# 设置决策树的数量
kfold = KFold(n_splits=10, shuffle=True, random_state=2019)

gsearch = GridSearchCV(estimator=model_gbdt, param_grid=params_1, scoring='neg_mean_squared_error', cv=kfold, iid=False)
gsearch.fit(train_x, train_y) # 开始搜索，训练查找最佳参数值

print('best_params_：{}'.format(gsearch.best_params_)) # best_params_ : dict
print('best_score_: {}'.format(gsearch.best_score_))

best_params_：{'n_estimators': 100}
best_score_: -0.4739305207868319


#### 5.2 调整决策树的最大深度（max_depth）和 叶结点所需的最小样本数（min_samples_split）

In [37]:
model_gbdt = GradientBoostingRegressor(**reg_params) # 
params_2 = {'max_depth':range(3,8,1), 'min_samples_split':range(100,200,20)}
kfold = KFold(n_splits=10, shuffle=True, random_state=2018)

gsearch = GridSearchCV(estimator=model_gbdt, param_grid=params_2, scoring='neg_mean_squared_error', cv=kfold, iid=False )
gsearch.fit(train_x, train_y) 

print('best_params_：{}'.format(gsearch.best_params_)) # best_params_ : dict
print('best_score_: {}'.format(gsearch.best_score_))

best_params_：{'min_samples_split': 180, 'max_depth': 3}
best_score_: -0.48267970954617984


#### 5.3 调整决策树划分内部结点的最小样本数（min_samples_leaf）

In [38]:
model_gbdt = GradientBoostingRegressor(**reg_params) # 
params_3 = {'min_samples_leaf':range(14,20,2), 'min_samples_split':range(100,200,10)}
kfold = KFold(n_splits=10, shuffle=True, random_state=2018)

gsearch = GridSearchCV(estimator=model_gbdt, param_grid=params_3, scoring='neg_mean_squared_error', cv=kfold, iid=False )
gsearch.fit(train_x, train_y) 

print('best_params_：{}'.format(gsearch.best_params_)) # best_params_ : dict
print('best_score_: {}'.format(gsearch.best_score_))

best_params_：{'min_samples_leaf': 16, 'min_samples_split': 160}
best_score_: -0.5152558277350138


In [39]:
reg_params = {
    'learning_rate':0.01, 
    'n_estimators':600, 
    'max_depth':5,
    'min_samples_leaf':15,
    'min_samples_split':180,
    'max_features':'sqrt', 
    'subsample':0.75, 
    'random_state':10
}
model_gbdt = GradientBoostingRegressor(**reg_params)
model_gbdt.fit(train_x, train_y)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.01, loss='ls', max_depth=5,
                          max_features='sqrt', max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=15, min_samples_split=180,
                          min_weight_fraction_leaf=0.0, n_estimators=600,
                          n_iter_no_change=None, presort='auto',
                          random_state=10, subsample=0.75, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [40]:
y_pre = model_gbdt.predict(test_data)
happiness_submit = pd.read_csv('D:\\program_tools\\jupyter\\tianchi\\happiness_submit.csv')
happiness_submit['happiness'] = y_pre
happiness_submit.head()

Unnamed: 0,id,happiness
0,8001,3.794637
1,8002,2.60926
2,8003,3.499188
3,8004,4.341772
4,8005,3.284166


In [42]:
happiness_submit.to_csv("2019-11-23-version1_happiness1.csv",index=False)