In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import root_mean_squared_error
import lightgbm as lgb
import pickle

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [35]:
df_train = pd.read_csv('./data/clean/diamondsdlfe2xso37.csv',)
df_test = pd.read_csv('./data/clean/testdl123fexydatdso3.7x2.csv',)

y = df_train['price'].values[1:]
X = np.array(df_train.drop(['Unnamed: 0','price'], axis=1))[1:]
X_test = np.array(df_test.drop('Unnamed: 0',axis=1))[1:]

In [36]:
df_train.columns

Index(['Unnamed: 0', 'price', 'carat', 'depth', 'table', 'x', 'y', 'z', 'cut',
       'color', 'clarity', 'city', 'x/y', 'td', 'ad'],
      dtype='object')

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2,random_state=62)

In [16]:
X_train

array([['0.29', '62.7', '62.0', ..., '0.9905437352245863',
        '0.9888357256778308', '0.0061978302977642825'],
       ['1.52', '61.2', '60.0', ..., '1.0137174211248285',
        '0.9803921568627451', '0.006283842340892668'],
       ['0.3', '61.9', '58.0', ..., '1.0070093457943923',
        '0.9369951534733442', '0.006113903489343302'],
       ...,
       ['0.5', '61.8', '59.0', ..., '1.0079522862823063',
        '0.9546925566343043', '0.006284047598292306'],
       ['1.35', '60.0', '57.0', ..., '1.0055944055944055', '0.95',
        '0.006107034602684245'],
       ['1.22', '62.0', '54.0', ..., '0.9971014492753623',
        '0.8709677419354839', '0.0060185853916895375']], dtype=object)

In [17]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)
lgb_X = lgb.Dataset(X)

In [39]:
lgb_model = lgb.LGBMRegressor(
    task = 'predict',
    application = 'regression',
    objective = 'root_mean_squared_error',
    boosting_type="gbdt",
    num_iterations = 2500,
    learning_rate = 0.05,
    num_leaves=15,
    tree_learner='feature',
    max_depth =10,
    min_data_in_leaf=7,
    bagging_fraction = 1,
    bagging_freq = 100,
    reg_sqrt='True',
    metric ='rmse',
    feature_fraction = 0.6,
    random_state=42)

In [40]:
lgb_model.fit(X_train,y_train) 


preds_lgb_model = lgb_model.predict(X_val)
rmse_lgb = root_mean_squared_error(y_val, preds_lgb_model)
print(" RMSE: %f" % (rmse_lgb ))



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000325 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1975
[LightGBM] [Info] Number of data points in the train set: 32364, number of used features: 12
[LightGBM] [Info] Start training from score 55.712754
 RMSE: 520.319498




In [31]:
print('Starting training...')
# train
gbm = lgb.train(
                lgb_train,
                num_boost_round=200,
                valid_sets=lgb_eval,
                callbacks=[lgb.early_stopping(stopping_rounds=15)])

Starting training...


AttributeError: 'LGBMRegressor' object has no attribute 'train'

In [70]:
gridParams_ext = {
    'learning_rate': [0.005, 0.01],
    'n_estimators': [8,16],
    'num_leaves': [6,8], # large num_leaves helps improve accuracy but might lead to over-fitting
    'boosting_type' : ['gbdt', 'dart'], # for better accuracy -> try dart
    'objective' : ['regression'],
    'max_bin':[255], # large max_bin helps improve accuracy but might slow down training progress
    'random_state' : [500],
    'colsample_bytree' : [0.65],
    'subsample' : [0.7,0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2],
    'metric': ['rmse'],
    'force_col_wise': [True]}
gridParams_quick = {
    'task' : ['predict'],
    'application' : ['regression'],
    'objective' : ['root_mean_squared_error'],
    'boosting_type' : ["gbdt"],
    'num_iterations' : [2500],
    'learning_rate' : [0.05],
    'num_leaves' : [15],
    'tree_learner' : ['feature'],
    'max_depth' : [10],
    'min_data_in_leaf':[7],
    'bagging_fraction' : [1],
    'bagging_freq' : [100],
    'reg_sqrt' : ['True'],
    'metric' : ['rmse'],
    'feature_fraction' : [0.6],
    'force_col_wise': [True]}
gridParams_art={
        'num_leaves':[64],
          'min_child_samples':[6],
          'objective':['regression'],
          'learning_rate':[0.01],
          'boosting_type':['gbdt'],
          'metric':['rmse'],
          'max_depth':[6]}
gridParams_xgb= {'colsample_bytree': [0.8,0.9], 'learning_rate': [0.005,0.015], 'max_depth': [6,7,8], 'n_estimators': [800,1600], 'subsample': [0.6,0.9],
    'force_col_wise': [True]}

In [71]:
mdl = lgb.LGBMRegressor()

In [72]:
grid = GridSearchCV(mdl,gridParams_xgb, verbose=-1, cv=3, n_jobs=-1, scoring='neg_root_mean_squared_error')
# Run the grid
grid.fit(X, y)

best_params = grid.best_params_
best_score = grid.best_score_

print("Best parameters found: ", best_params)
print("Best score found: ", best_score)


Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV 2/3; 1/48] START colsample_bytree=0.8, force_col_wise=True, learning_rate=0.005, max_depth=6, n_estimators=800, subsample=0.6
[CV 2/3; 4/48] START colsample_bytree=0.8, force_col_wise=True, learning_rate=0.005, max_depth=6, n_estimators=1600, subsample=0.9
[CV 3/3; 3/48] START colsample_bytree=0.8, force_col_wise=True, learning_rate=0.005, max_depth=6, n_estimators=1600, subsample=0.6
[CV 1/3; 1/48] START colsample_bytree=0.8, force_col_wise=True, learning_rate=0.005, max_depth=6, n_estimators=800, subsample=0.6
[CV 3/3; 2/48] START colsample_bytree=0.8, force_col_wise=True, learning_rate=0.005, max_depth=6, n_estimators=800, subsample=0.9
[CV 1/3; 2/48] START colsample_bytree=0.8, force_col_wise=True, learning_rate=0.005, max_depth=6, n_estimators=800, subsample=0.9
[CV 2/3; 6/48] START colsample_bytree=0.8, force_col_wise=True, learning_rate=0.005, max_depth=7, n_estimators=800, subsample=0.9[CV 3/3; 1/48] START colsam

In [None]:
print('Starting predicting...')
# predict
y_pred = gbm.predict(np.array(X_val))
# eval
rmse_test = root_mean_squared_error(y_val, y_pred) 
print(f'The RMSE of prediction is: {rmse_test}')

Starting predicting...
The RMSE of prediction is: 520.4485025239875


In [151]:
preds = gbm.predict(np.array(X_test))
output = pd.DataFrame({'id': list(range(len(preds))),
                       'price': preds.squeeze()})

In [152]:
output

Unnamed: 0,id,price
0,0,2866.552767
1,1,5711.512721
2,2,9709.331458
3,3,4027.653518
4,4,1567.500332
...,...,...
13480,13480,1674.217339
13481,13481,2437.809262
13482,13482,3044.305921
13483,13483,2068.585185


In [153]:
output.to_csv('./submissions/lgbdefdl123fexydatdso3x2.csv', index=False)

In [155]:
gbm.params

{'boosting_type': 'gbdt',
 'objective': 'regression',
 'metric': 'rmse',
 'num_iterations': 200}

In [95]:
train_data = lgb.Dataset('./data/clean/diamondsdlfe2xso.csv', label='price',feature_name=['price', 'carat', 'depth', 'table', 'x', 'y', 'z', 'cut', 'color',
       'clarity', 'city', 'x/y', 'td', 'ad'], categorical_feature= ['cut', 'color','clarity'])

In [86]:
train_data

<lightgbm.basic.Dataset at 0x7f99568b7670>

In [87]:
validation_data = train_data.create_valid('testdl123fexydatdso3x2.csv')

In [88]:
param = {'metric': 'rmse'}
num_round = 10

In [25]:
bst = lgb.train(param, train_data, num_round, valid_sets=[validation_data])

[LightGBM] [Info] Construct bin mappers from text data time 0.02 seconds


TypeError: Wrong type(str) for label.
It should be list, numpy 1-D array or pandas Series