In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import dill

In [2]:
data = pd.read_csv("data/train.csv")
data.drop(["passengerid","name","ticket","cabin"],inplace=True,axis=1)

In [3]:
data.dtypes

survived      int64
pclass        int64
sex          object
age         float64
sibsp         int64
parch         int64
fare        float64
embarked     object
dtype: object

In [4]:
sex_encoder = preprocessing.LabelEncoder()
embarked_encoder = preprocessing.LabelEncoder()

In [5]:
sex_encoder.fit(data['sex'].astype(str))
embarked_encoder.fit(data['embarked'].astype(str))

LabelEncoder()

In [6]:
data['sex'] = sex_encoder.transform(data['sex'].astype(str))
data['embarked'] = embarked_encoder.transform(data['embarked'].astype(str))

In [7]:
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [8]:
y = data['survived']
X = data[[x for x in data.columns if x not in ['survived']]]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

In [10]:
import xgboost as xgb

In [11]:
train_dmatrix = xgb.DMatrix(data=X_train,label=y_train,feature_names=X_train.columns)
test_dmatrix = xgb.DMatrix(data=X_test,label=y_test,feature_names=X_test.columns)

In [12]:
xgboost = xgb.XGBClassifier(objective ='binary:logistic', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 100)

In [13]:
xgboost.fit(X_train,y_train)

XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.3, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [14]:
preds = xgboost.predict(X_test)

  if diff:


In [15]:
from sklearn import metrics

In [16]:
fpr, tpr, thresholds = metrics.roc_curve(y_test, preds)

In [17]:
metrics.auc(fpr, tpr)

0.8216686674669867

In [20]:
### save model and everything with dill
model = {
    "encoders":[
        ("sex",sex_encoder),
        ("embarked",embarked_encoder)
    ],
    "model": xgboost,
    "col_names":X_train.columns.tolist()
}

In [21]:
with open('models/model.dill', 'wb') as file:
    dill.dump(model, file)

In [22]:
### open dill file to check if everything is there

In [23]:
with open("models/model.dill", 'rb') as file:
    loaded_model = dill.load(file)

In [24]:
loaded_model

{'encoders': [('sex', LabelEncoder()), ('embarked', LabelEncoder())],
 'model': XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=0.3, gamma=0, learning_rate=0.1, max_delta_step=0,
        max_depth=5, min_child_weight=1, missing=nan, n_estimators=100,
        n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
        silent=True, subsample=1),
 'col_names': ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']}