In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

X = train_df.drop(['prognosis'], axis=1)
y = train_df['prognosis']

# get the unique values in the 'prognosis' column as features
features = train_df['prognosis'].unique()

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [14]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Create the XGBoost classifier
xgb_classifier = xgb.XGBClassifier(use_label_encoder=False, objective="multi:softmax", eval_metric="mlogloss", random_state=42)

# Set up the hyperparameter grid for the Grid Search
#Best Parameters: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 200, 'subsample': 0.7}

param_grid = {
    'learning_rate': [0.01],
    'max_depth': [5,6,7],
    'n_estimators': [150, 200, 250],
    'subsample': [0.5, 0.7, 0.9],
    'colsample_bytree': [0.4,0.5,0.6],
    'gamma': [0, 0.025, 0.05]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, scoring='accuracy', n_jobs=-1, cv=5, verbose=3)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)
results = grid_search.cv_results_
pd.DataFrame(results).to_csv('grid_search_runs_2.csv')
# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Score:", best_score)

# Train the model with the best parameters
best_xgb_classifier = xgb.XGBClassifier(use_label_encoder=False, objective="multi:softmax", eval_metric="mlogloss", random_state=42, **best_params)
best_xgb_classifier.fit(X_train, y_train)

# Make predictions
y_pred = best_xgb_classifier.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", round(accuracy, 4))

Fitting 5 folds for each of 243 candidates, totalling 1215 fits




[CV 1/5] END colsample_bytree=0.4, gamma=0, learning_rate=0.01, max_depth=5, n_estimators=150, subsample=0.5;, score=0.283 total time=   1.2s
[CV 2/5] END colsample_bytree=0.4, gamma=0, learning_rate=0.01, max_depth=5, n_estimators=150, subsample=0.5;, score=0.274 total time=   1.3s
[CV 3/5] END colsample_bytree=0.4, gamma=0, learning_rate=0.01, max_depth=5, n_estimators=150, subsample=0.5;, score=0.354 total time=   1.2s
[CV 5/5] END colsample_bytree=0.4, gamma=0, learning_rate=0.01, max_depth=5, n_estimators=150, subsample=0.5;, score=0.336 total time=   1.2s
[CV 2/5] END colsample_bytree=0.4, gamma=0, learning_rate=0.01, max_depth=5, n_estimators=150, subsample=0.7;, score=0.301 total time=   1.3s
[CV 1/5] END colsample_bytree=0.4, gamma=0, learning_rate=0.01, max_depth=5, n_estimators=150, subsample=0.7;, score=0.292 total time=   1.3s
[CV 5/5] END colsample_bytree=0.4, gamma=0, learning_rate=0.01, max_depth=5, n_estimators=150, subsample=0.7;, score=0.327 total time=   1.3s
[CV 3/



Accuracy: 0.3099


In [16]:
## Train with all data, use best params (round 2)
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

#use best params
best_xgb_classifier = xgb.XGBClassifier(use_label_encoder=False, objective="multi:softmax", eval_metric="mlogloss", random_state=42, **best_params)
best_xgb_classifier.fit(X_train, y_train)

# Make predictions
y_pred = best_xgb_classifier.predict(X_test)

# create the submission DataFrame and save it to a CSV file
prognosis = pd.DataFrame(y_pred, columns=['prognosis'])
prognosis = prognosis.iloc[:len(y_pred)] 
submission = pd.concat([X_test['id'], prognosis], axis=1)
submission.to_csv('submission.csv', index=False)


