In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.drop(['id', 'hospital_number', 'abdomen'], axis = 1, inplace = True)

In [4]:
def clean_data(clean_data):
    clean_data['temp_of_extremities'].fillna('cool', inplace = True)
    clean_data['mucous_membrane'].fillna('other', inplace = True)
    clean_data['mucous_membrane'] = clean_data['mucous_membrane'].apply(lambda x: 'other' if x in ['absent', 'increased'] else x)
    clean_data['capillary_refill_time'].fillna('less_3_sec', inplace = True)
    clean_data['capillary_refill_time'] = clean_data['capillary_refill_time'].str.replace(pat = '3', repl = 'less_3_sec')
    clean_data['pain'].fillna('pain', inplace = True)
    clean_data['pain'] = clean_data['pain'].str.replace(pat = 'slight', repl = 'severe_pain')
    clean_data['peristalsis'].fillna('absent', inplace = True)
    clean_data['abdominal_distention'].fillna('severe', inplace = True)
    clean_data['nasogastric_tube'].fillna('missing', inplace = True)
    clean_data['nasogastric_reflux'] = clean_data['nasogastric_reflux'].str.replace(pat = 'slight', repl = 'missing')
    clean_data['nasogastric_reflux'].fillna('missing', inplace = True)
    clean_data['rectal_exam_feces'] = clean_data['rectal_exam_feces'].str.replace(pat = 'serosanguious', repl = 'missing')
    clean_data['rectal_exam_feces'].fillna('missing', inplace = True)
    clean_data['abdomo_appearance'].fillna('missing', inplace = True)

    return clean_data

In [5]:
data = clean_data(data)

data.drop(columns = ['pain', 'age' , 'nasogastric_reflux', 'lesion_1', 'lesion_2', 'lesion_3', 'rectal_temp', 'mucous_membrane', 'cp_data', 'nasogastric_tube'], inplace = True)




float_columns = data.select_dtypes(include=['float64']).columns
object_columns = data.select_dtypes(include=['object']).columns

for column in float_columns:
    data[column].fillna(data[column].median(skipna = True), inplace = True)

minmax_scaler = MinMaxScaler()
minmax_scaler.fit(data[float_columns])
data[float_columns] = minmax_scaler.transform(data[float_columns])

label_encoders = {x: LabelEncoder() for x in object_columns}

for column in object_columns:
    label_encoders[column].fit(data[column])
    data[column] = label_encoders[column].transform(data[column])

y = data['outcome']
data.drop(columns = ['outcome'], inplace = True)
data = pd.get_dummies(data, columns = object_columns.drop('outcome'), dtype=np.int8)

In [6]:
data['outcome'] = y
y_to_oversample = data.query('outcome != 2')['outcome']
features_to_oversample = data.query('outcome != 2').drop('outcome', axis = 1)
ros = RandomOverSampler(random_state=42)

X_res, y_res = ros.fit_resample(features_to_oversample, y_to_oversample)

data.query('outcome == 2', inplace = True)
X_res['outcome'] = y_res
data = pd.concat([data, X_res])
data.to_csv('train_data_to_use.csv', index = False)

In [7]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_score, accuracy_score, f1_score, classification_report

In [11]:
y = data['outcome']
data.drop(columns = ['outcome'], inplace = True)

X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2)
#Because of the low number of rows, bootstrap will be set to False.

custom_scorer = make_scorer(f1_score, greater_is_better=True)
param_grid = {'n_estimators': [50, 100, 150, 200, 250, 300],  # being the number of trees in the forest.
                  'min_samples_leaf': [3, 5, 10, 20],  # number of minimum samples required at a leaf node.
                  'min_samples_split': [3, 6, 9],  # number of minimum samples required to split an internal node.
                  'criterion': ['entropy'],  # measures the quality of a split. Can use gini's impurity or entropy.
                  # 'subsample':[0.5,0.8,1]#buscar con mas detalle
                  # 'reg_lambda':[1,10,100]#buscar con mas detalle
                  }

clf = GridSearchCV(
# Evaluates the performance of different groups of parameters for a model based on cross-validation.
    RandomForestClassifier(class_weight='balanced', bootstrap=False, random_state=1234),
    param_grid,  # dict of parameters.
    cv=10,  # Specified number of folds in the Cross-Validation(K-Fold).
    scoring='f1_micro')
        # clf = GridSearchCV(DecisionTreeClassifier(max_leaf_nodes=3, random_state=0), param_grid2)
clf.fit(X_train, y_train)
        # print("Best estimator found by grid search:")
        # print(clf.best_estimator_)
model = clf.best_estimator_  # Estimator that was chosen by the search, i.e. estimator which gave highest score (or smallest loss if specified) on the left out data        model.fit(X_train, y_train)


print('RandomForest:')
y_pred_rf = model.predict(X_test)
print(classification_report(y_pred_rf, y_test))
print('-------------------------------')




RandomForest:
              precision    recall  f1-score   support

           0       0.78      0.64      0.70        99
           1       0.82      0.89      0.86        84
           2       0.70      0.78      0.74        96

    accuracy                           0.76       279
   macro avg       0.77      0.77      0.77       279
weighted avg       0.77      0.76      0.76       279

-------------------------------


### XGBRegressor

In [None]:
import xgboost as xgb
from scipy.stats import uniform, randint
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV

def report_best_scores(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

params = {
    "colsample_bytree": uniform(0.7, 0.3),
    "gamma": uniform(0, 0.5),
    "learning_rate": uniform(0.03, 0.3), # default 0.1 
    "max_depth": randint(2, 6), # default 3
    "n_estimators": randint(100, 300), # default 100
    "subsample": uniform(0.6, 0.4)
}

xgb_model = xgb.XGBRegressor()

search = RandomizedSearchCV(xgb_model, param_distributions=params, random_state=42, n_iter=200, cv=3, verbose=1, n_jobs=1, return_train_score=True)

search.fit(X_train, y_train)

In [None]:
report_best_scores(search.cv_results_, 1)

In [None]:
xgb_model = xgb.XGBClassifier(objective="multi:softprob", random_state=42)
xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)

print(classification_report(y_test, y_pred))

### Test section

In [None]:
test_data = pd.read_csv('test.csv')

In [None]:
ids = test_data['id']
test_data.drop(['id', 'hospital_number', 'abdomen'], axis = 1, inplace = True)
clean_test_data = clean_data(test_data)
clean_test_data['pain'] = clean_test_data['pain'].apply(lambda x: 'pain' if x == 'moderate' else x)

In [None]:
float_columns = clean_test_data.select_dtypes(include=['float64']).columns
object_columns = clean_test_data.select_dtypes(include=['object']).columns

In [None]:
clean_test_data[float_columns] = minmax_scaler.transform(clean_test_data[float_columns])
for column in object_columns:
    #print(column)
    clean_test_data[column] = label_encoders[column].transform(clean_test_data[column])

In [None]:
y_hat = rf.predict(clean_test_data).astype(str)

In [None]:
y_hat[y_hat == '0'] = 'died'
y_hat[y_hat == '1'] = 'euthanized'
y_hat[y_hat == '2'] = 'lived'

In [None]:
submission = pd.DataFrame({'id':ids, 'outcome':y_hat})
submission.to_csv('submission.csv', index = False)