In [7]:
import sklearn
import numpy as np
import scipy
import matplotlib.pyplot as plt
import pandas as pd

### Load Data

In [24]:
train = pd.read_csv("train.csv")

In [9]:
print(len(train))
train.head(10)

8693


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
5,0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True
6,0006_01,Earth,False,F/2/S,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,Billex Jacostaffey,True
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True
8,0007_01,Earth,False,F/3/S,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,Andona Beston,True
9,0008_01,Europa,True,B/1/P,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0,Erraiam Flatic,True


In [37]:
X_t.Cabin.unique()

array(['B/0/P', 'F/0/S', 'A/0/S', ..., 'G/1499/S', 'G/1500/S', 'E/608/S'],
      dtype=object)

### Pre-Processing

In [46]:
from sklearn.preprocessing import LabelEncoder

# change data types and features to be more usable
# since it is mainly converting categorical to numerical and encoding,
# we do this before splitting to ensure features are consistent between training and validation
def data_cleaning(data):
    #fill nans
    data = data.bfill()

    # drop some features
    data = data.drop(["Name", "PassengerId"], axis=1)

    # split up the Cabin format into (deck/num/side
    # num varies from 0 to over 1000
    # side is either P or S
    cabin = data.Cabin.str.split('/')
    cabin = cabin.dropna()
    cabin = cabin.tolist()
    deck = [item[0] for item in cabin]
    num = [item[1] for item in cabin]
    side = [item[2] for item in cabin]

    data = data.drop(columns=['Cabin'], axis=1)
    data['Deck'] = deck
    data['Num'] = num
    data['Side'] = side
    
    # dummy encode over categorical features
    planet = pd.get_dummies(data['HomePlanet'])
    destination = pd.get_dummies(data['Destination'])
    deck = pd.get_dummies(data['Deck'])
    data = data.drop('HomePlanet',axis = 1)
    data = data.drop('Destination',axis = 1)
    data = data.drop('Deck',axis = 1)
    data = data.join(planet)
    data = data.join(destination)
    data = data.join(deck)
    
    #convert to numerical
    data['CryoSleep'] = data['CryoSleep'].astype(int)
    data['VIP'] = data['VIP'].astype(int)
    
    data['Side'] = LabelEncoder().fit_transform(data['Side'])
    
    return data

In [47]:
def feat_engineering(data):
    data.fillna(method = 'bfill', inplace=True)
    return data

In [75]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

scaler = StandardScaler()  

# split into X and y dataframes
y = train["Transported"]
X = train.drop(columns = ["Transported"])

# clean up feature types
X = data_cleaning(X)

# training and validation split
X_train, X_valid, y_train, y_valid = train_test_split(
     X, y, test_size=0.15, random_state = 42)

# feature engineering
X_train = feat_engineering(X_train)
X_valid = feat_engineering(X_valid)

# scaling, fit only on training data
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_valid = pd.DataFrame(scaler.transform(X_valid), columns=X_valid.columns)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


PCA(n_components=8)

In [76]:
X_valid

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Num,Side,...,PSO J318.5-22,TRAPPIST-1e,A,B,C,D,E,F,G,T
0,-0.746031,-0.677278,-0.151599,0.290415,-0.066995,0.768527,-0.273965,0.650800,-1.051917,0.993122,...,-0.320937,0.660629,-0.176406,-0.318365,-0.311616,-0.241439,-0.335859,1.425485,-0.658094,-0.028508
1,-0.746031,-0.746298,-0.151599,-0.324813,0.287204,-0.288153,-0.276571,-0.262218,-0.274972,0.993122,...,-0.320937,0.660629,-0.176406,-0.318365,-0.311616,-0.241439,-0.335859,-0.701516,1.519540,-0.028508
2,1.340427,0.841155,-0.151599,-0.330771,-0.289725,-0.288153,-0.276571,-0.263083,1.660540,0.993122,...,-0.320937,0.660629,-0.176406,-0.318365,-0.311616,-0.241439,-0.335859,-0.701516,1.519540,-0.028508
3,-0.746031,0.427036,-0.151599,-0.330771,-0.074015,0.438523,-0.242685,-0.263083,-0.605712,-1.006926,...,-0.320937,0.660629,-0.176406,-0.318365,-0.311616,-0.241439,-0.335859,-0.701516,1.519540,-0.028508
4,1.340427,0.979194,-0.151599,-0.330771,-0.289725,-0.288153,-0.276571,-0.263083,-0.611583,-1.006926,...,-0.320937,0.660629,-0.176406,-0.318365,-0.311616,4.141834,-0.335859,-0.701516,-0.658094,-0.028508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1299,1.340427,-0.332180,-0.151599,-0.330771,-0.289725,-0.288153,-0.276571,-0.263083,-0.574400,0.993122,...,-0.320937,-1.513709,-0.176406,3.141047,-0.311616,-0.241439,-0.335859,-0.701516,-0.658094,-0.028508
1300,-0.746031,-1.988652,-0.151599,-0.330771,-0.289725,-0.288153,-0.276571,-0.263083,-0.310199,0.993122,...,-0.320937,0.660629,-0.176406,-0.318365,-0.311616,-0.241439,-0.335859,-0.701516,1.519540,-0.028508
1301,1.340427,-1.643554,-0.151599,-0.330771,-0.289725,-0.288153,-0.276571,-0.263083,-1.091058,0.993122,...,-0.320937,-1.513709,5.668733,-0.318365,-0.311616,-0.241439,-0.335859,-0.701516,-0.658094,-0.028508
1302,-0.746031,1.876450,-0.151599,0.680704,0.428883,-0.288153,-0.247899,2.529578,-0.717263,0.993122,...,-0.320937,0.660629,-0.176406,3.141047,-0.311616,-0.241439,-0.335859,-0.701516,-0.658094,-0.028508


### Model

In [78]:
from sklearn.svm import SVC

svm = SVC(kernel = 'rbf', gamma = 'scale')

'''
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['rbf', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto']
}

svm_grid = GridSearchCV(svm, param_grid = param_grid, n_jobs = -1,
                                cv=5, return_train_score=True)


svm_grid.fit(X_train, y_train)

# Extract the scores
results = svm_grid.cv_results_
train_scores = results['mean_train_score']
test_scores = results['mean_test_score']
'''

svm.fit(X_train, y_train)

SVC()

In [79]:
from sklearn.metrics import accuracy_score
y_pred = svm.predict(X_valid)
score = accuracy_score(y_valid, y_pred)
score

0.7799079754601227

In [80]:
from sklearn.ensemble import RandomForestClassifier 

rfc = RandomForestClassifier()

# Fit the model to your training data.
rfc.fit(X_train, np.ravel(y_train))

RandomForestClassifier()

In [81]:
y_pred = rfc.predict(X_valid)
score = accuracy_score(y_valid, y_pred)
score

0.781441717791411

In [100]:
from xgboost import XGBClassifier

xgb = XGBClassifier()

xgb.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=16,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [101]:
y_pred = xgb.predict(X_valid)
score = accuracy_score(y_valid, y_pred)
score

0.7707055214723927

### Submission

In [95]:
test = pd.read_csv("test.csv")
#save PassengerIDs for submission
IDs = test['PassengerId']

# clean up feature types
test = data_cleaning(test)

# feature engineering
test = feat_engineering(test)

# scaling, fit only on training data
test = pd.DataFrame(scaler.transform(test), columns=test.columns)

In [97]:
y_pred = svm.predict(test)

In [98]:
# build submission file
test["Transported"] = y_pred
test["PassengerID"] = IDs
submit = test[['PassengerID','Transported']]
submit

Unnamed: 0,PassengerID,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


In [99]:
submit.to_csv('submission.csv', index=False)