In [156]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import root_mean_squared_error
import lightgbm as lgb
import pickle

In [169]:
df_train = pd.read_csv('./data/clean/diamondsdlfe2xso.csv', header=None)
df_test = pd.read_csv('./data/clean/testdl123fexydatdso3x2.csv', header=None)

y = df_train[1]
X = df_train.drop([0,1], axis=1)
X_test = df_test.drop(0,axis=1)

In [170]:
y

0        price
1         4268
2          505
3         2686
4          738
         ...  
40451    10070
40452    12615
40453     5457
40454      456
40455     6232
Name: 1, Length: 40456, dtype: object

In [171]:
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2,random_state=62)

In [172]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)
lgb_X = lgb.Dataset(X)

In [173]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse'}

In [174]:
print('Starting training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=200,
                valid_sets=lgb_eval,
                callbacks=[lgb.early_stopping(stopping_rounds=15)])

Starting training...


ValueError: pandas dtypes must be int, float or bool.
Fields with bad pandas dtypes: 2: object, 3: object, 4: object, 5: object, 6: object, 7: object, 8: object, 9: object, 10: object, 11: object, 12: object, 13: object, 14: object

In [145]:
gridParams_ext = {
    'learning_rate': [0.005, 0.01],
    'n_estimators': [8,16],
    'num_leaves': [6,8], # large num_leaves helps improve accuracy but might lead to over-fitting
    'boosting_type' : ['gbdt', 'dart'], # for better accuracy -> try dart
    'objective' : ['regression'],
    'max_bin':[255], # large max_bin helps improve accuracy but might slow down training progress
    'random_state' : [500],
    'colsample_bytree' : [0.65],
    'subsample' : [0.7,0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2],
    'metric': ['rmse'],
    'force_col_wise': [True]}
gridParams_quick = {
    'learning_rate': [ 0.01],
    'n_estimators': [8],
    'num_leaves': [6,8], # large num_leaves helps improve accuracy but might lead to over-fitting
    'boosting_type' : ['gbdt'], # for better accuracy -> try dart
    'objective' : ['regression'],
    'max_bin':[255], # large max_bin helps improve accuracy but might slow down training progress
    'random_state' : [500],
    'colsample_bytree' : [0.65],
    'subsample' : [0.7],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2],
    'metric': ['rmse'],
    'force_col_wise': [True]}

In [146]:
mdl = lgb.LGBMRegressor()

In [147]:
grid = GridSearchCV(mdl, gridParams_quick, verbose=10, cv=4, n_jobs=-1, scoring='neg_root_mean_squared_error')
# Run the grid
grid.fit(lgb_X, y.values)

best_params = grid.best_params_
best_score = grid.best_score_

print("Best parameters found: ", best_params)
print("Best score found: ", best_score)

# Evaluate the model on the test set using the best parameters
best_model = grid.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Test score with best parameters: ", test_score)

TypeError: Singleton array array(<lightgbm.basic.Dataset object at 0x7f99c54f46a0>, dtype=object) cannot be considered a valid collection.

In [148]:
print('Starting predicting...')
# predict
y_pred = gbm.predict(np.array(X_val))
# eval
rmse_test = root_mean_squared_error(y_val, y_pred) 
print(f'The RMSE of prediction is: {rmse_test}')

Starting predicting...
The RMSE of prediction is: 520.4485025239875


In [151]:
preds = gbm.predict(np.array(X_test))
output = pd.DataFrame({'id': list(range(len(preds))),
                       'price': preds.squeeze()})

In [152]:
output

Unnamed: 0,id,price
0,0,2866.552767
1,1,5711.512721
2,2,9709.331458
3,3,4027.653518
4,4,1567.500332
...,...,...
13480,13480,1674.217339
13481,13481,2437.809262
13482,13482,3044.305921
13483,13483,2068.585185


In [153]:
output.to_csv('./submissions/lgbdefdl123fexydatdso3x2.csv', index=False)

In [155]:
gbm.params

{'boosting_type': 'gbdt',
 'objective': 'regression',
 'metric': 'rmse',
 'num_iterations': 200}

In [95]:
train_data = lgb.Dataset('./data/clean/diamondsdlfe2xso.csv', label='price',feature_name=['price', 'carat', 'depth', 'table', 'x', 'y', 'z', 'cut', 'color',
       'clarity', 'city', 'x/y', 'td', 'ad'], categorical_feature= ['cut', 'color','clarity'])

In [86]:
train_data

<lightgbm.basic.Dataset at 0x7f99568b7670>

In [87]:
validation_data = train_data.create_valid('testdl123fexydatdso3x2.csv')

In [88]:
param = {'metric': 'rmse'}
num_round = 10

In [25]:
bst = lgb.train(param, train_data, num_round, valid_sets=[validation_data])

[LightGBM] [Info] Construct bin mappers from text data time 0.02 seconds


TypeError: Wrong type(str) for label.
It should be list, numpy 1-D array or pandas Series