In [1]:
import random
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import torch,torchvision
from torch.nn import *
from torch.optim import *
# Preproccessing
from sklearn.preprocessing import (
    StandardScaler,
    RobustScaler,
    MinMaxScaler,
    MaxAbsScaler,
    OneHotEncoder,
    Normalizer,
    Binarizer
)
# Decomposition
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
# Feature Selection
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectFromModel
# Model Eval
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error,accuracy_score,precision_score,f1_score,recall_score
# Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier,VotingClassifier,BaggingClassifier,RandomForestRegressor
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from catboost import CatBoost,CatBoostClassifier
from xgboost import XGBClassifier,XGBRFClassifier
from flaml import AutoML
# Other
import pickle
import wandb

PROJECT_NAME = 'Titanic-V6'
device = 'cuda'
np.random.seed(65)
random.seed(65)
torch.manual_seed(65)

<torch._C.Generator at 0x7fb51e859890>

In [2]:
pd.read_csv('./data/test.csv')

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [3]:
def save_model(model,name):
    pickle.dump(model,open(f'./models/{name}.pkl','wb'))
    pickle.dump(model,open(f'./models/{name}.pk','wb'))
def make_submission(model,name):
    data = pd.read_csv('./data/test.csv')
    ids = data['PassengerId']
    new_ticket = []
    tickets = data['Ticket']
    for ticket in tickets:
        ticket = ticket.split(' ')
        try:
            ticket = int(ticket[0])
        except:
            try:
                ticket = int(ticket[1])    
            except:
                try:
                    ticket = int(ticket[2])
                except:
                    ticket = 0
        new_ticket.append(ticket)
    data['Ticket'] = new_ticket
    new_names = []
    names = data['Name']
    for name in names:
        name = name.split(' ')[1].replace('.','')
        new_names.append(name)
        new_ticket.append(ticket)
    cabins = data['Cabin']
    new_cabins = []
    for cabin in cabins:
        try:
            cabin = cabin[:1]
            new_cabins.append(cabin)
        except:
            new_cabins.append(5000)
    del data['Cabin']
    data['Cabins'] = new_cabins
    data,_,new_data,idx,labels = object_to_int(data,'Cabins')
    data,_,new_data,idx,labels = object_to_int(data,'Name')
    data['Cabins'].replace({0:np.nan},inplace=True)
    data['Cabins'].fillna(data['Cabins'].median(),inplace=True)
    data,_,new_data,idx,labels = object_to_int(data,'Embarked')
    data,_,new_data,idx,labels = object_to_int(data,'Age')
    data,_,new_data,idx,labels = object_to_int(data,'Sex')
    data['Age'].fillna(data['Age'].median(),inplace=True)
    data['Fare'].fillna(data['Fare'].median(),inplace=True)
    data = data.astype(float)
    print(data.isna().sum())
    preds = model.predict(data)
    df = pd.DataFrame({'PassengerId':ids,'Survived':preds})
    df.to_csv(f'./submission/{name}.csv',index=False)
def valid(model,X,y,valid=False):
    preds = model.predict(X)
    if valid is False:
        result = {
            'Accuracy':accuracy_score(y_true=y,y_pred=preds),
            'Precision':precision_score(y_true=y,y_pred=preds),
            'F1':f1_score(y_true=y,y_pred=preds),
            'Recall':recall_score(y_true=y,y_pred=preds)
        }
    else:
        result = {
            'Val Accuracy':accuracy_score(y_true=y,y_pred=preds),
            'Val Precision':precision_score(y_true=y,y_pred=preds),
            'Val F1':f1_score(y_true=y,y_pred=preds),
            'Val Recall':recall_score(y_true=y,y_pred=preds)
        }
    return result
def train(model,X_train,X_test,y_train,y_test,name):
    wandb.init(project=PROJECT_NAME,name=name)
    model.fit(X_train,y_train)
    wandb.log(valid(model,X_test,y_test,True))
    wandb.log(valid(model,X_train,y_train,False))
    make_submission(model,name)
    save_model(model,name)
    wandb.finish()
def fe(data,col):
    max_num = data[col].quantile(0.99)
    min_num = data[col].quantile(0.05)
    data = data[data[col] > max_num]
    data = data[data[col] > min_num]
    return data

In [4]:
def object_to_int(data,col):
    old_data = data.copy()
    data = data[col].tolist()
    labels = {}
    idx = -1
    new_data = []
    for data_iter in data:
        if data_iter not in list(labels.keys()):
            idx += 1
            labels[data_iter] = idx
    for data_iter in data:
        new_data.append(labels[data_iter])
    old_data[col] = new_data
    return old_data,old_data[col],new_data,idx,labels

In [5]:
data = pd.read_csv('./data/train.csv')

In [6]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
old_data = data.copy()

In [8]:
new_ticket = []
tickets = data['Ticket']
for ticket in tickets:
    ticket = ticket.split(' ')
    try:
        ticket = int(ticket[0])
    except:
        try:
            ticket = int(ticket[1])    
        except:
            try:
                ticket = int(ticket[2])
            except:
                ticket = 0
    new_ticket.append(ticket)

In [9]:
data['Ticket'] = new_ticket

In [10]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [11]:
new_names = []
names = data['Name']
for name in names:
    name = name.split(' ')[1].replace('.','')
    new_names.append(name)

In [12]:
data['Name'] = new_names

In [13]:
data,_,new_data,idx,labels = object_to_int(data,'Name')

In [14]:
cabins = data['Cabin']

In [15]:
new_cabins = []
for cabin in cabins:
    try:
        cabin = cabin[:1]
        new_cabins.append(cabin)
    except:
        new_cabins.append(5000)

In [16]:
del data['Cabin']
data['Cabins'] = new_cabins

In [17]:
data,_,new_data,idx,labels = object_to_int(data,'Cabins')

In [18]:
labels

{5000: 0, 'C': 1, 'E': 2, 'G': 3, 'D': 4, 'A': 5, 'B': 6, 'F': 7, 'T': 8}

In [19]:
data['Cabins'].replace({0:np.nan},inplace=True)

In [20]:
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
Cabins         687
dtype: int64

In [21]:
data['Cabins'].isna().sum()

687

In [22]:
data['Cabins'].fillna(data['Cabins'].median(),inplace=True)

In [23]:
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
Cabins           0
dtype: int64

In [24]:
data,_,new_data,idx,labels = object_to_int(data,'Embarked')
data,_,new_data,idx,labels = object_to_int(data,'Age')

In [25]:
data['Cabins'].fillna(data['Cabins'].median(),inplace=True)

In [26]:
data.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
Cabins         0
dtype: int64

In [27]:
data,_,new_data,idx,labels = object_to_int(data,'Embarked')
data,_,new_data,idx,labels = object_to_int(data,'Age')
data,_,new_data,idx,labels = object_to_int(data,'Sex')
data = data.astype(float)

In [28]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Cabins
0,1.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0,21171.0,7.25,0.0,4.0
1,2.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,17599.0,71.2833,1.0,1.0
2,3.0,1.0,3.0,2.0,1.0,2.0,0.0,0.0,3101282.0,7.925,0.0,4.0
3,4.0,1.0,1.0,1.0,1.0,3.0,1.0,0.0,113803.0,53.1,0.0,1.0
4,5.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,373450.0,8.05,0.0,4.0


In [29]:
data = data.astype(float)

In [30]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Cabins
0,1.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0,21171.0,7.25,0.0,4.0
1,2.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,17599.0,71.2833,1.0,1.0
2,3.0,1.0,3.0,2.0,1.0,2.0,0.0,0.0,3101282.0,7.925,0.0,4.0
3,4.0,1.0,1.0,1.0,1.0,3.0,1.0,0.0,113803.0,53.1,0.0,1.0
4,5.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,373450.0,8.05,0.0,4.0


In [31]:
X = data.drop('Survived',axis=1)
y = data['Survived']

In [32]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.125)

In [33]:
train(GradientBoostingClassifier(),X_train,X_test,y_train,y_test,name='baseline')

[34m[1mwandb[0m: Currently logged in as: [33mranuga-d[0m (use `wandb login --relogin` to force relogin)


PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           1
Embarked       0
Cabins         0
dtype: int64


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').