In [1]:
import pandas as pd
import matplotlib as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.metrics import roc_auc_score

In [2]:
df = pd.read_csv('apple_quality.csv')
df.head(5)

Unnamed: 0,A_id,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,0.0,-3.970049,-2.512336,5.34633,-1.012009,1.8449,0.32984,-0.491590483,good
1,1.0,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.86753,-0.722809367,good
2,2.0,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636473,bad
3,3.0,-0.657196,-2.271627,1.324874,-0.097875,3.63797,-3.413761,0.790723217,good
4,4.0,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984036,good


In [3]:
mas = []

for val in df['Quality']:
    if val == 'good':
        mas.append(1)
        
    else:
        mas.append(0)
        
df['bool_quality'] = mas

df.corr()

Unnamed: 0,A_id,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,bool_quality
A_id,1.0,-0.028911,-0.00573,-0.002378,-0.013111,0.006179,0.000742,0.004875
Size,-0.028911,1.0,-0.170702,-0.32468,0.169868,-0.018892,-0.134773,0.244007
Weight,-0.00573,-0.170702,1.0,-0.154246,-0.095882,-0.092263,-0.243824,0.001421
Sweetness,-0.002378,-0.32468,-0.154246,1.0,-0.037552,0.095882,-0.2738,0.250998
Crunchiness,-0.013111,0.169868,-0.095882,-0.037552,1.0,-0.259607,-0.201982,-0.012376
Juiciness,0.006179,-0.018892,-0.092263,0.095882,-0.259607,1.0,-0.097144,0.260223
Ripeness,0.000742,-0.134773,-0.243824,-0.2738,-0.201982,-0.097144,1.0,-0.264315
bool_quality,0.004875,0.244007,0.001421,0.250998,-0.012376,0.260223,-0.264315,1.0


In [4]:
df.drop(['Weight', 'Crunchiness', 'Quality', 'A_id'], axis=1, inplace=True)
df.head(5)

Unnamed: 0,Size,Sweetness,Juiciness,Ripeness,Acidity,bool_quality
0,-3.970049,5.34633,1.8449,0.32984,-0.491590483,1
1,-1.195217,3.664059,0.853286,0.86753,-0.722809367,1
2,-0.292024,-1.738429,2.838636,-0.038033,2.621636473,0
3,-0.657196,1.324874,3.63797,-3.413761,0.790723217,1
4,1.364217,-0.384658,3.030874,-1.303849,0.501984036,1


In [5]:
df.drop([4000], axis=0, inplace=True)
df.tail(5)

Unnamed: 0,Size,Sweetness,Juiciness,Ripeness,Acidity,bool_quality
3995,0.059386,-3.714549,1.697986,2.244055,0.137784369,0
3996,-0.293118,-0.20402,0.024523,-1.0879,1.854235285,1
3997,-2.634515,-2.440461,2.199709,4.763859,-1.334611391,0
3998,-4.008004,2.366397,2.161435,0.214488,-2.229719806,1
3999,0.27854,0.121217,1.266677,-0.776571,1.599796456,1


In [6]:
df['Acidity'] = df['Acidity'].astype(np.float64)

In [7]:
split = int(len(df) * 0.8)

train = df[:split]
test = df[split:]

In [8]:
params = {
    'eta': 0.1,
    'max_depth': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    "device": "cuda",
    
    'gamma': 0,
    'lambda': 0,
    'alpha': 0,
    'min_child_weight': 0,
    
    'eval_metric': 'auc',
    'objective': 'binary:logistic' ,
    'booster': 'gbtree',
    'njobs': -1,
    'tree_method': 'approx',
}

In [9]:
res = xgb.cv(params, xgb.DMatrix(train.drop(['bool_quality'], axis=1), train['bool_quality']),
                  early_stopping_rounds=10, maximize=True, 
                  num_boost_round=10000, nfold=5, stratified=True)

print(res)

Parameters: { "njobs" } are not used.



     train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0          0.763380       0.007375       0.747014      0.026512
1          0.784617       0.010885       0.766716      0.027902
2          0.827370       0.007454       0.808645      0.021338
3          0.843407       0.010849       0.819813      0.021357
4          0.847481       0.011087       0.824279      0.020731
..              ...            ...            ...           ...
187        0.969942       0.001294       0.918223      0.007894
188        0.970075       0.001280       0.918117      0.007985
189        0.970321       0.001235       0.918330      0.008069
190        0.970411       0.001260       0.918389      0.008016
191        0.970566       0.001214       0.918469      0.008243

[192 rows x 4 columns]


In [10]:
most_wanted = res['test-auc-mean'].argmax()
most_wanted

191

In [11]:
model = xgb.train(params, xgb.DMatrix(train.drop(['bool_quality'], axis=1).values, train['bool_quality'], feature_names=list(train.drop(['bool_quality'], axis=1).columns)), 
                    num_boost_round=most_wanted, maximize=True)

Parameters: { "njobs" } are not used.



In [12]:
y_pred = model.predict(xgb.DMatrix(test.drop(['bool_quality'], axis=1).values, feature_names=list(train.drop(['bool_quality'], axis=1).columns)))

In [13]:
val = roc_auc_score(test['bool_quality'], y_pred)
print(val)

0.9166604160416042
