#### Importing libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import mean_absolute_error,accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from xgboost import XGBClassifier

#### Fetch and preprocess data using pipeline

In [None]:
# Read the training set and keep every column except the last one.
# NOTE(review): presumably the final column is an empty export artefact -- confirm.
data = pd.read_csv('training.csv').iloc[:, :-1]
data.head()

In [None]:
# Display summary statistics for the loaded training columns.
data.describe()

In [None]:
# Encode every distinct prognosis label as an integer code, numbered in order
# of first appearance.  `m` is the label -> code lookup table and `k` ends up
# holding the number of distinct labels.
temp = data.prognosis.unique()
m = {label: code for code, label in enumerate(temp)}
k = len(m)

# Translate the whole column in one vectorised lookup.  Assigning element by
# element through `y[i]` is label-based (it only works with the default
# 0..n-1 index), slow, and leaves an object column that then has to be
# round-tripped through `str` to become numeric.
y = data.prognosis.map(m).astype(int)

data.prognosis = y

In [None]:
# Separate the feature matrix from the integer-coded target column.
X = data.drop(['prognosis'], axis=1)
y = data['prognosis']

# 80/20 hold-out split.  The fixed seed makes the split -- and every accuracy
# figure and feature ranking derived from it -- reproducible between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42
)

# Cheap guard that the validation labels are integers for accuracy_score.
y_valid = y_valid.astype(int)

In [None]:
# Partition the training columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.  Columns of any other
# dtype land in neither list.
column_dtypes = X_train.dtypes
categorical_cols = [
    col for col, col_dtype in column_dtypes.items() if col_dtype == "object"
]
numerical_cols = [
    col for col, col_dtype in column_dtypes.items()
    if col_dtype in ['int64', 'float64']
]

In [None]:
# Numerical columns: fill gaps with a constant (SimpleImputer's default fill
# value is used, since none is given), then standardise.
numerical_steps = [
    ('impute', SimpleImputer(strategy='constant')),
    ('scale', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column list to its transformer.
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

#### Define model and fit

In [None]:
# Gradient-boosted classifier with default hyper-parameters.
# NOTE(review): `use_label_encoder` appears to be deprecated in newer xgboost
# releases -- confirm against the installed version.
model=XGBClassifier(use_label_encoder=False)

In [None]:
# Chain the column preprocessing and the classifier so that both are fitted
# and applied together.
pipeline_steps = [('preprocessor', preprocessor), ('model', model)]
pipe = Pipeline(steps=pipeline_steps)

In [None]:
# Fit the preprocessing steps and the classifier on the training split.
pipe.fit(X_train,y_train)

#### Predict and validate

In [None]:
# Predict class codes for the validation split through the full pipeline.
pred=pipe.predict(X_valid)

In [None]:
# Report the fraction of validation rows predicted correctly, as a percentage.
valid_accuracy = accuracy_score(y_valid, pred)
print("Accuracy: %.2f%%" % (valid_accuracy * 100.0))

In [None]:
# Bar chart of the fitted model's feature importances, one bar per feature
# position.
importances = model.feature_importances_
bar_positions = range(len(importances))
plt.bar(bar_positions, importances)
plt.show()

In [None]:
# Draw the importance chart for the fitted model using xgboost's own
# `plot_importance` helper.
from xgboost import plot_importance
plot_importance(model)
plt.show()

#### Select the most important features and validate

In [None]:
# Keep the features whose importance exceeds the mean importance, then let the
# user choose how many of the strongest ones to retain.
fs = model.feature_importances_

# Importances are paired with `X.columns` by position, which is only valid
# when the model saw exactly one input per original column.
# NOTE(review): this also assumes the preprocessor kept the column order
# (true when every column is numeric) -- confirm for this dataset.
if len(fs) != len(X.columns):
    raise ValueError(
        "model reports %d feature importances but X has %d columns; "
        "they cannot be matched by position" % (len(fs), len(X.columns))
    )

threshold = fs.mean()
top_fs = {
    name: score
    for name, score in zip(X.columns, fs)
    if score > threshold
}
good_features = list(top_fs)

# Rank strongest first (ties broken alphabetically).  An ascending sort here
# would hand back the weakest of the above-average features instead.
temp = sorted(top_fs.items(), key=lambda kv: (-kv[1], kv[0]))

requested = int(input('Enter number of features:'))
# Slicing clamps the request to the number of available features, so asking
# for too many returns all of them rather than raising IndexError.
custom_features = [name for name, _ in temp[:max(requested, 0)]]
custom_features

In [None]:
# From here on, `good_features` is the user-sized selection rather than the
# full above-mean list.
good_features=custom_features

In [None]:
# Reduced feature matrix containing only the selected columns.
opt_X=data[good_features]
opt_X.head()

In [None]:
# Display summary statistics for the selected feature columns.
opt_X.describe()

In [None]:
# Target for the reduced-feature model: the integer-coded prognosis column.
opt_y=data['prognosis']
opt_y

In [None]:
# 80/20 hold-out split of the reduced feature set.  The fixed seed keeps the
# split, and the accuracy reported for it, reproducible between runs.
opt_X_train, opt_X_valid, opt_y_train, opt_y_valid = train_test_split(
    opt_X, opt_y, test_size=0.2, train_size=0.8, random_state=42
)

In [None]:
# Refit the classifier directly on the selected features, without the
# preprocessing steps.  `model` is the same object that `pipe` holds as its
# 'model' step, so this replaces the fit the pipeline produced earlier.
model.fit(opt_X_train,opt_y_train)

In [None]:
# Predict class codes for the reduced-feature validation rows.
opt_pred=model.predict(opt_X_valid)

In [None]:
# Report the reduced-feature model's validation accuracy as a percentage.
opt_accuracy = accuracy_score(opt_y_valid, opt_pred)
print("Accuracy: %.2f%%" % (opt_accuracy * 100.0))

#### Final testing

In [None]:
# Load the separate testing file and display its summary statistics.
testing=pd.read_csv('testing.csv')
testing.describe()

In [None]:
# Preview the first rows of the testing data.
testing.head()

In [None]:
# Evaluate on the rows loaded from 'testing.csv' rather than on the training
# frame, which the model has already seen and would score optimistically.
# NOTE(review): assumes testing.csv has a 'prognosis' column and the same
# feature columns as training.csv -- confirm.
final_X = testing.drop(['prognosis'], axis=1)

# Translate the test labels with the same label -> code table (`m`) that was
# used to encode the training target, so both sides use identical codes.
final_y = testing['prognosis'].map(m)
if final_y.isna().any():
    # A label with no code cannot be scored; fail with the offending values
    # instead of an opaque NaN conversion error.
    unseen = sorted(set(testing.loc[final_y.isna(), 'prognosis'].astype(str)))
    raise ValueError(
        "testing.csv contains prognosis labels not present in training: %s"
        % (unseen,)
    )
final_y = final_y.astype(int)

result = model.predict(final_X[good_features])

In [None]:
# Report how often `result` matches `final_y`, as a percentage.
final_accuracy = accuracy_score(final_y, result)
print("Accuracy: %.2f%%" % (final_accuracy * 100.0))