In [44]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read the training and test tables from the working directory
# (train.csv first, then test.csv).
df_train, df_test = map(pd.read_csv, ('train.csv', 'test.csv'))

In [34]:
# Min-max scaler shared by the train and test feature matrices; it is fitted
# on the training features only. (0, 1) is the scaler's default output range.
scaler = MinMaxScaler(feature_range=(0, 1))

In [38]:
# Columns that are one-hot encoded before scaling.
categorical_columns = [
    'Employment Type',
    'GraduateOrNot',
    'ChronicDiseases',
    'FrequentFlyer',
    'EverTravelledAbroad',
]

# One-hot encode the categorical columns, then remove the identifier and the
# target so that only model features remain.
train_features = pd.get_dummies(df_train, columns=categorical_columns)
train_features = train_features.drop(columns=['Customer Id', 'TravelInsurance'])

# Fit the scaler on the training features and scale them in one step; the
# result is a NumPy array, not a DataFrame.
X_train = scaler.fit_transform(train_features)
X_train

array([[0.9       , 0.66666667, 0.57142857, ..., 1.        , 1.        ,
        0.        ],
       [0.3       , 0.3       , 0.71428571, ..., 1.        , 1.        ,
        0.        ],
       [0.3       , 0.3       , 0.57142857, ..., 1.        , 1.        ,
        0.        ],
       ...,
       [0.9       , 0.66666667, 0.28571429, ..., 0.        , 1.        ,
        0.        ],
       [0.8       , 0.06666667, 0.42857143, ..., 0.        , 1.        ,
        0.        ],
       [0.3       , 0.3       , 0.42857143, ..., 1.        , 1.        ,
        0.        ]])

In [69]:
# Build the test feature matrix with exactly the same columns, in the same
# order, as the matrix the scaler was fitted on.
categorical_columns = [
    'Employment Type',
    'GraduateOrNot',
    'ChronicDiseases',
    'FrequentFlyer',
    'EverTravelledAbroad',
]

# Column layout of the training features (same encoding and drops as used
# when the scaler was fitted). Recomputed here so this cell does not rely on
# any intermediate variable from the training cell.
train_feature_columns = (
    pd.get_dummies(df_train, columns=categorical_columns)
    .drop(['Customer Id', 'TravelInsurance'], axis=1)
    .columns
)

# get_dummies only creates columns for categories present in the frame it is
# given, so the test frame can miss (or add) dummy columns relative to train.
# Reindexing fills missing dummies with 0 and discards columns unseen in
# training, keeping feature positions consistent for the scaler and the model.
X_test = (
    pd.get_dummies(df_test, columns=categorical_columns)
    .drop(['Customer Id'], axis=1)
    .reindex(columns=train_feature_columns, fill_value=0)
)

# Apply the scaling learned on the training data (transform only, no refit).
X_test = scaler.transform(X_test)
X_test

array([[0.9       , 0.66666667, 0.71428571, ..., 0.        , 1.        ,
        0.        ],
       [0.3       , 0.76666667, 0.28571429, ..., 1.        , 0.        ,
        1.        ],
       [0.3       , 0.4       , 1.        , ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.3       , 0.6       , 0.42857143, ..., 0.        , 1.        ,
        0.        ],
       [0.3       , 0.16666667, 0.28571429, ..., 0.        , 1.        ,
        0.        ],
       [0.3       , 0.6       , 0.42857143, ..., 0.        , 1.        ,
        0.        ]])

In [70]:
# Binary target: the 'Yes' indicator column of the one-hot encoded
# TravelInsurance label. Only the label column is encoded; the prefix keeps
# the resulting Series named 'TravelInsurance_Yes'.
insurance_dummies = pd.get_dummies(df_train['TravelInsurance'], prefix='TravelInsurance')
y_train = insurance_dummies['TravelInsurance_Yes']
y_train

0       0
1       0
2       0
3       0
4       0
       ..
1585    0
1586    0
1587    0
1588    0
1589    0
Name: TravelInsurance_Yes, Length: 1590, dtype: uint8

In [94]:
# Keep the 8 features with the highest chi-squared score against the target.
selector = SelectKBest(chi2, k=8)
selector.fit(X_train, y_train)

# Boolean mask over the feature columns; the mask learned on the training
# data is applied to both matrices so they keep the same columns.
masks = selector.get_support()
X_train_k, X_test_k = X_train[:, masks], X_test[:, masks]
X_train_k

array([[0.9       , 0.66666667, 0.        , ..., 1.        , 1.        ,
        0.        ],
       [0.3       , 0.3       , 0.        , ..., 1.        , 1.        ,
        0.        ],
       [0.3       , 0.3       , 0.        , ..., 1.        , 1.        ,
        0.        ],
       ...,
       [0.9       , 0.66666667, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.8       , 0.06666667, 1.        , ..., 0.        , 1.        ,
        0.        ],
       [0.3       , 0.3       , 0.        , ..., 1.        , 1.        ,
        0.        ]])

In [127]:
# Random forest fitted on the full scaled feature matrix X_train (not the
# chi2-selected subset X_train_k). random_state is fixed so that refitting
# gives the same model.
# Alternatives tried in this cell: LogisticRegression(random_state=0) on
# X_train_k, and MultinomialNB() on X_train.
clf = RandomForestClassifier(max_depth=16, random_state=0).fit(X_train, y_train)

In [128]:
# Score on X_train / y_train, i.e. the same data clf was fitted on, so this is
# a training score rather than an estimate of performance on unseen data.
clf.score(X_train, y_train)

0.9251572327044025

In [129]:
# Per-class probabilities for every test row, one column per class.
class_probabilities = clf.predict_proba(X_test)

# Keep column 1: the probability of the second class in clf.classes_
# (the 1 label, given that y_train holds 0/1 values).
prediction = class_probabilities[:, 1]
prediction

array([8.86666667e-01, 1.00000000e+00, 2.52000000e-01, 9.80000000e-01,
       1.02901070e-01, 1.70000000e-01, 6.23425926e-01, 3.47916667e-01,
       2.54025974e-02, 3.22857143e-02, 3.37285714e-01, 7.82224192e-02,
       1.00000000e-02, 7.37305556e-01, 3.00000000e-03, 9.33333333e-02,
       1.97921717e-01, 1.00000000e+00, 1.00000000e+00, 2.09152047e-02,
       4.52500000e-01, 4.76190476e-02, 1.46666667e-02, 1.44619048e-01,
       0.00000000e+00, 3.88756612e-01, 1.61666667e-01, 9.58333333e-02,
       1.00000000e+00, 5.44832324e-02, 1.67619048e-02, 2.32500000e-01,
       9.60000000e-01, 1.00000000e+00, 1.69666667e-01, 3.90213310e-01,
       6.82666667e-01, 1.00000000e+00, 1.00000000e+00, 4.00000000e-02,
       7.10000000e-01, 1.46666667e-02, 5.57549949e-02, 2.00000000e-02,
       1.95479798e-01, 9.90000000e-01, 6.73333333e-01, 3.46619048e-01,
       2.73333333e-01, 3.00000000e-03, 4.00000000e-02, 7.72380952e-01,
       1.59396511e-01, 2.45873016e-01, 2.96687332e-01, 1.79563492e-02,
      

In [130]:
# Pair each test customer id with its predicted probability and write the
# result to output.csv without the index column.
res = pd.DataFrame({
    'Customer Id': df_test['Customer Id'],
    'prediction': prediction,
})
res.to_csv('output.csv', index=False)