## ライブラリのインポート

In [431]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
import wxparams as wx
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## データの読み込み

In [432]:
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
df = pd.concat([train, test], axis=0, ignore_index=True)

## 特徴量生成
降水量のカラムを作りたい

In [433]:
df = df.sort_values('datetime')
df['datetime'] = pd.to_datetime(df['datetime'])
df['year'] = pd.to_datetime(df['datetime']).dt.strftime('%Y')
df['year_month'] = pd.to_datetime(df['datetime']).dt.strftime('%Y%m')
df['month'] = pd.to_datetime(df['datetime']).dt.strftime('%m').astype('int64')
df['day'] = pd.to_datetime(df['datetime']).dt.strftime('%d')
df['weekday'] = pd.to_datetime(df['datetime']).dt.strftime('%a')
df['hour'] = pd.to_datetime(df['datetime']).dt.strftime('%H')

df.loc[df['windspeed']==0, 'windspeed'] = df['windspeed'].median()

df['casual_ratio'] = df['casual']/df['count']
df['registered_ratio'] = df['registered']/df['count']
df['campaign_flg'] = 0
df.loc[df['casual_ratio']>0.95,'campaign_flg'] = 1
# df['casual_ratio'].loc[df['casual_ratio']>0.95].count()/df['casual_ratio'].count()

df['td'] = wx.RH_to_Td(df['temp'], df['humidity'], formula="Bolton") #露点温度
df['t_td'] = wx.T_Td(df['temp'], df['td']) #湿数(飽和水蒸気量に達するまでの温度差。0に近いと雨が降りやすい)
df['humidity_abs'] = wx.Absolute_Humidity(df['temp'], df['td'], formula="Bolton") #絶対湿度

# 月平均
# month_ave = df.groupby('month').mean()[['casual','registered','count']]
# df['month_ave_casual'] = 0
# df['month_ave_registered'] = 0
# df['month_ave_count'] = 0
# for month in range(1,13):
#     df.loc[df['month']==month, 'month_ave_casual'] = month_ave.iloc[month-1,0]
#     df.loc[df['month']==month, 'month_ave_registered'] = month_ave.iloc[month-1,1]
#     df.loc[df['month']==month, 'month_ave_count'] = month_ave.iloc[month-1,2]

# display(month_ave)
# display(df.head())
# display(df.tail(1000))
# df['moving_ave_3'] = df['count'].rolling(window= 3, center=True).mean() # 7日間移動平均
# df.dropna(inplace=True) #移動平均がNaNになった行を削除
# display(df.iloc[400:450,:]) #1/19と1/20の境目

## カラムの選択/ダミー変数化

In [434]:
df = df[[
    # 'datetime',
    'temp',    
    # 'humidity',
    # 'windspeed',
    'holiday',
    'workingday',
    'weather',
    'year',
    # 'year_month',
    # 'month_ave_casual',
    # 'month_ave_registered',
    # 'month',
    # 'weekday',
    # 't_td',
    # 'td',
    # 'humidity_abs',
    'hour',
    'casual',
    'registered',
    'count',
    'campaign_flg',
    # 'moving_ave_3',
    # 'moving_ave_14',   
    # 'casual_ratio',
    # 'registered_ratio',
]]

df = pd.get_dummies(df, columns=[
    # 'month',
    # 'weekday',
    # 'year_month',
    'year',
    'hour',
    'weather',
])

# 対数変換
# df['count'] = np.log(df['count']+1)
# df['casual'] = np.log(df['casual']+1)
# df['registered'] = np.log(df['registered']+1)

display(df.columns)

Index(['temp', 'holiday', 'workingday', 'casual', 'registered', 'count',
       'campaign_flg', 'year_2011', 'year_2012', 'hour_00', 'hour_01',
       'hour_02', 'hour_03', 'hour_04', 'hour_05', 'hour_06', 'hour_07',
       'hour_08', 'hour_09', 'hour_10', 'hour_11', 'hour_12', 'hour_13',
       'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19',
       'hour_20', 'hour_21', 'hour_22', 'hour_23', 'weather_1', 'weather_2',
       'weather_3', 'weather_4'],
      dtype='object')

## モデル構築

In [435]:
def pred(df, train, test, col_name):
    df = df.sort_index()
    train = df.iloc[:len(train),:].copy()
    test = df.iloc[len(train):,:].copy()

    train['casual'] = train['casual'].astype('int64')
    train['registered'] = train['registered'].astype('int64')
    train['count'] = train['count'].astype('int64')

    test.drop(['casual','registered','count'], axis=1, inplace=True)
    test = test.reset_index(drop=True)

    y = train.loc[:,[col_name]].values
    tmp = train[col_name]
    train.drop(['casual','registered','count'], axis=1, inplace=True)
    X = train.values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
    y_train = y_train.ravel()   # 1    2d array -> 1d array 
    y_test  = y_test.ravel()    # 2    2d array -> 1d array 

    # データセットを生成する
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)


    
    # LightGBM のハイパーパラメータ
    max_depth = 10 # 木の深さ
    min_data_in_leaf = 5 # 葉っぱに落ちるデータの最小値
    feature_fraction = 0.9 # 木を作るときに使うパラメータの割合(1=全て)
    
    params = {
        'boosting_type': 'gbdt',
        'task': 'train',
        'objective': 'regression',

        'learning_rate': 0.1,
        'early_stopping_round': 50,
        'verbose': -1,
        'random_state': 0,

        'num_iterations': 1000,
        'max_depth': max_depth,
        'num_leaves': int(0.7*2**max_depth),
        'min_data_in_leaf': min_data_in_leaf,
        'feature_fraction': feature_fraction,
    }
    # num_iterationsはearlystoppingするから気にしなくていい
    #　max_depthは3〜8くらい。7くらいが無難
    # num_leaves 最重要。複雑さに直結。max_depthの自乗よりは大きくならない

    # 上記のパラメータでモデルを学習する
    model = lgb.train(params, lgb_train, valid_sets=lgb_eval)

    # テストデータを予測する
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    
    test_pred = model.predict(test)
    test_pred = pd.DataFrame(test_pred, columns=[col_name])
    
    #対数変換を戻す
    # test_pred[col_name] = np.exp(test_pred[col_name]) - 1

    # print(y_pred.max())
    return test_pred

In [436]:
# # 散布図を描画(真値 vs 予測値)
# plt.plot(y_test, y_test, color = 'red', label = 'x=y') # 直線y = x (真値と予測値が同じ場合は直線状に点がプロットされる)
# plt.scatter(y_test, y_pred) # 散布図のプロット
# plt.xlabel('y_train') # x軸ラベル
# plt.ylabel('y_test_pred') # y軸ラベル
# plt.title('y_train vs y_test_pred') # グラフタイトル
# plt.show()

In [437]:
# スコアを表示。小さいほどよい
def rmsle(real, predicted):
    sum=0.0
    for x in range(len(predicted)):
        if predicted[x]<0 or real[x]<0: #check for negative values
            continue
        p = np.log(predicted[x]+1)
        r = np.log(real[x]+1)
        sum = sum + (p - r)**2
    return (sum/len(predicted))**0.5
# print("test score: ",rmsle(y_test, model.predict(X_test)))
# print("train score: ",rmsle(y_train, model.predict(X_train)))
casual = pred(df, train, test, 'casual')
registered = pred(df, train, test, 'registered')
count = pred(df, train, test, 'count')



[1]	valid_0's l2: 2198.85
Training until validation scores don't improve for 50 rounds
[2]	valid_0's l2: 1951.75
[3]	valid_0's l2: 1707.98
[4]	valid_0's l2: 1518.61
[5]	valid_0's l2: 1373.29
[6]	valid_0's l2: 1239.08
[7]	valid_0's l2: 1152.23
[8]	valid_0's l2: 1052.84
[9]	valid_0's l2: 966.098
[10]	valid_0's l2: 894.865
[11]	valid_0's l2: 833.675
[12]	valid_0's l2: 781.319
[13]	valid_0's l2: 736.855
[14]	valid_0's l2: 694.583
[15]	valid_0's l2: 658.95
[16]	valid_0's l2: 630.701
[17]	valid_0's l2: 601.283
[18]	valid_0's l2: 582.458
[19]	valid_0's l2: 567.929
[20]	valid_0's l2: 547.787
[21]	valid_0's l2: 530.52
[22]	valid_0's l2: 516.225
[23]	valid_0's l2: 501.972
[24]	valid_0's l2: 485.78
[25]	valid_0's l2: 477.258
[26]	valid_0's l2: 464.916
[27]	valid_0's l2: 458.03
[28]	valid_0's l2: 447.685
[29]	valid_0's l2: 442.549
[30]	valid_0's l2: 437.311
[31]	valid_0's l2: 430.702
[32]	valid_0's l2: 426.774
[33]	valid_0's l2: 421.454
[34]	valid_0's l2: 418.647
[35]	valid_0's l2: 414.002
[36]	va

In [438]:
submission  = pd.read_csv('input/sampleSubmission.csv')
datetime = pd.DataFrame(submission['datetime'])
result = pd.concat([datetime, casual, registered, count], axis=1)

result['casual'] = result['casual'].astype('int64')
result['registered'] = result['registered'].astype('int64')

result['casual_plus_registered'] = result['casual'] + result['registered']
result['diff'] = result['count'] - result['casual_plus_registered']
# result.describe()

result['count'] = result['count'].astype('int64')
# result['casual_plus_registered'] = result['casual_plus_registered'].astype('int64')
result.loc[result['count']<0, 'count'] = 0
result.loc[result['casual_plus_registered']<0, 'casual_plus_registered'] = 0
# result.head(10)

submission = result[['datetime','casual_plus_registered']]
submission.rename(columns={'casual_plus_registered':'count'},inplace=True)
submission.to_csv('output/submission.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [439]:
submission.head()

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,23
1,2011-01-20 01:00:00,10
2,2011-01-20 02:00:00,7
3,2011-01-20 03:00:00,7
4,2011-01-20 04:00:00,6
