In [1]:
import random
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import torch,torchvision
from torch.nn import *
from torch.optim import *

# Model Eval
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error,accuracy_score,precision_score,f1_score,recall_score
# Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier,VotingClassifier,BaggingClassifier,RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from catboost import CatBoost,CatBoostClassifier
from xgboost import XGBClassifier,XGBRFClassifier
from flaml import AutoML
# Other
import pickle
import wandb

# Experiment configuration: W&B project name, compute device and RNG seed.
PROJECT_NAME = 'Titanic-V6'
SEED = 65
# Fall back to the CPU when no CUDA device is available instead of
# hard-coding 'cuda', so the notebook also runs on machines without a GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Seed every RNG the notebook imports (numpy, stdlib random, torch).
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f7a09f63890>

In [2]:
# Preview the raw test set (the same file make_submission() reads and scores).
pd.read_csv('./data/test.csv')

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [3]:
def save_model(model,name):
    """Pickle ``model`` into the ``./models`` directory.

    Two copies are written, ``./models/{name}.pkl`` and ``./models/{name}.pk``,
    matching the files the original implementation produced.

    Args:
        model: Any picklable object (typically a fitted estimator).
        name: Base file name, without extension.
    """
    import os  # local import so this cell does not depend on the import cell

    # Create the target directory on first use instead of failing with
    # FileNotFoundError.
    os.makedirs('./models', exist_ok=True)
    for extension in ('pkl', 'pk'):
        # The context manager guarantees the handle is flushed and closed.
        with open(f'./models/{name}.{extension}', 'wb') as handle:
            pickle.dump(model, handle)
def _ticket_number(ticket):
    """Return the first of the first three space-separated tokens that parses as an int, else 0."""
    for token in ticket.split(' ')[:3]:
        try:
            return int(token)
        except ValueError:
            continue
    return 0
def make_submission(model,name):
    """Preprocess ``./data/test.csv``, predict with ``model`` and write a submission CSV.

    The preprocessing follows the steps of the (commented-out) training
    preprocessing cells in this notebook: numeric ticket extraction, title
    extraction from ``Name``, first-letter cabin codes, integer label encoding
    via ``object_to_int`` and median imputation.

    Args:
        model: Fitted estimator exposing ``predict``.
        name: Submission name; output goes to ``./submission/{name}.csv``.
    """
    import os  # local import so this cell does not depend on the import cell

    data = pd.read_csv('./data/test.csv')
    ids = data['PassengerId']

    data['Ticket'] = [_ticket_number(ticket) for ticket in data['Ticket']]

    # Second space-separated token of the name with dots removed
    # (e.g. "Kelly, Mr. James" -> "Mr"). The original computed this list but
    # never stored it, so the model was scored on the full passenger name
    # instead of the title used by the training preprocessing.
    data['Name'] = [
        full_name.split(' ')[1].replace('.','') for full_name in data['Name']
    ]

    # First letter of the cabin; missing cabins (non-string NaN) become the
    # sentinel 5000.
    new_cabins = [
        cabin[:1] if isinstance(cabin, str) else 5000
        for cabin in data['Cabin']
    ]
    del data['Cabin']
    data['Cabins'] = new_cabins

    data = object_to_int(data,'Cabins')[0]
    data = object_to_int(data,'Name')[0]
    # Label 0 is treated as "missing" and imputed with the median.
    # NOTE(review): label 0 is whatever cabin value appears first in the file;
    # this presumably relies on the first row having no cabin -- confirm.
    # Plain assignment is used because chained ``inplace=True`` on a column
    # selection does not modify the frame under pandas copy-on-write.
    data['Cabins'] = data['Cabins'].replace({0:np.nan})
    data['Cabins'] = data['Cabins'].fillna(data['Cabins'].median())
    for column in ('Embarked','Age','Sex'):
        data = object_to_int(data,column)[0]
    data['Age'] = data['Age'].fillna(data['Age'].median())
    data['Fare'] = data['Fare'].fillna(data['Fare'].median())

    data = data.astype(float)
    preds = model.predict(data)

    os.makedirs('./submission', exist_ok=True)
    submission = pd.DataFrame({'PassengerId':ids,'Survived':preds.astype(int)})
    submission.to_csv(f'./submission/{name}.csv',index=False)
def valid(model,X,y,valid=False):
    """Score ``model`` on ``(X, y)`` and return the metrics as a dict.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix passed to ``model.predict``.
        y: Ground-truth labels.
        valid: When exactly ``False`` the keys are ``'Accuracy'``,
            ``'Precision'``, ``'F1'`` and ``'Recall'``; for any other value
            each key is prefixed with ``'Val '``.

    Returns:
        dict mapping metric name to score.
    """
    predictions = model.predict(X)
    prefix = '' if valid is False else 'Val '
    scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('F1', f1_score),
        ('Recall', recall_score),
    )
    return {
        prefix + label: scorer(y_true=y,y_pred=predictions)
        for label, scorer in scorers
    }
def train(model,X_train,X_test,y_train,y_test,name):
    """Fit ``model``, log its metrics to Weights & Biases, then write a submission and pickle.

    Args:
        model: Unfitted estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Training split.
        X_test, y_test: Held-out split, logged with the ``'Val '`` key prefix.
        name: Run name; also used for the submission and model file names.
    """
    wandb.init(project=PROJECT_NAME,name=name)
    # Always close the W&B run, even when fitting, scoring or submission
    # raises; otherwise the run stays open and leaks into the next train()
    # call. A non-zero exit code marks the run as failed.
    exit_code = 1
    try:
        model.fit(X_train,y_train)
        wandb.log(valid(model,X_test,y_test,True))
        wandb.log(valid(model,X_train,y_train,False))
        make_submission(model,name)
        save_model(model,name)
        exit_code = 0
    finally:
        wandb.finish(exit_code=exit_code)
def fe(data,col):
    """Trim outlier rows of ``data`` using the quantiles of column ``col``.

    Keeps only rows whose ``col`` value lies strictly between the 5th and the
    99th percentile. The original compared with ``> max_num``, which kept only
    the rows above the 99th percentile and made the lower bound redundant.

    Args:
        data: Input DataFrame (not modified).
        col: Name of the numeric column to filter on.

    Returns:
        A filtered DataFrame.
    """
    upper = data[col].quantile(0.99)
    lower = data[col].quantile(0.05)
    in_range = (data[col] > lower) & (data[col] < upper)
    return data[in_range]

In [4]:
def object_to_int(data,col):
    """Label-encode column ``col`` with integers in order of first appearance.

    Args:
        data: Input DataFrame (left unmodified; a copy is encoded).
        col: Name of the column to encode.

    Returns:
        Tuple ``(encoded_frame, encoded_column, codes, last_code, labels)``:
        the copied frame with ``col`` replaced by integer codes, that column
        as a Series, the codes as a list, the highest code assigned (``-1``
        for an empty column), and the ``value -> code`` dict.
    """
    encoded = data.copy()
    labels = {}
    # Dict membership replaces the original ``in list(labels.keys())`` scan,
    # which was quadratic in the number of distinct values. A single pass both
    # assigns new codes and collects the encoded values.
    new_data = [
        labels.setdefault(value, len(labels)) for value in data[col].tolist()
    ]
    idx = len(labels) - 1
    encoded[col] = new_data
    return encoded,encoded[col],new_data,idx,labels

In [5]:
# data = pd.read_csv('./data/train.csv')
# data = data.sample(frac=1)

In [6]:
# data.head()

In [7]:
# old_data = data.copy()

In [8]:
# new_ticket = []
# tickets = data['Ticket']
# for ticket in tickets:
#     ticket = ticket.split(' ')
#     try:
#         ticket = int(ticket[0])
#     except:
#         try:
#             ticket = int(ticket[1])    
#         except:
#             try:
#                 ticket = int(ticket[2])
#             except:
#                 ticket = 0
#     new_ticket.append(ticket)

In [9]:
# data['Ticket'] = new_ticket

In [10]:
# data.head()

In [11]:
# new_names = []
# names = data['Name']
# for name in names:
#     name = name.split(' ')[1].replace('.','')
#     new_names.append(name)

In [12]:
# data['Name'] = new_names

In [13]:
# data,_,new_data,idx,labels = object_to_int(data,'Name')

In [14]:
# cabins = data['Cabin']

In [15]:
# new_cabins = []
# for cabin in cabins:
#     try:
#         cabin = cabin[:1]
#         new_cabins.append(cabin)
#     except:
#         new_cabins.append(5000)

In [16]:
# del data['Cabin']
# data['Cabins'] = new_cabins

In [17]:
# data,_,new_data,idx,labels = object_to_int(data,'Cabins')

In [18]:
# labels

In [19]:
# data['Cabins'].replace({0:np.nan},inplace=True)

In [20]:
# data.isna().sum()

In [21]:
# data['Cabins'].isna().sum()

In [22]:
# data['Cabins'].fillna(data['Cabins'].median(),inplace=True)

In [23]:
# data.isna().sum()

In [24]:
# data,_,new_data,idx,labels = object_to_int(data,'Embarked')
# data,_,new_data,idx,labels = object_to_int(data,'Age')

In [25]:
# data['Cabins'].fillna(data['Cabins'].median(),inplace=True)

In [26]:
# data.isna().sum()

In [27]:
# data,_,new_data,idx,labels = object_to_int(data,'Embarked')
# data,_,new_data,idx,labels = object_to_int(data,'Age')
# data,_,new_data,idx,labels = object_to_int(data,'Sex')
# data = data.astype(float)

In [28]:
# data.head()

In [29]:
# data = data.astype(float)

In [30]:
# data.head()

In [31]:
# data.to_csv('./data/cleaned-data.csv',index=False)

In [32]:
# Load the cleaned training data (the file written by the commented-out
# preprocessing cells above via data.to_csv('./data/cleaned-data.csv')).
data = pd.read_csv('./data/cleaned-data.csv')

In [33]:
# Separate the target column from the feature columns.
y = data['Survived']
X = data.drop(columns='Survived')

In [34]:
# Hold out 6.25% of the rows for validation. An explicit random_state makes
# the split identical every time this cell runs; without it the split is drawn
# from numpy's global RNG and changes whenever the cell is re-run or other
# cells consume random numbers first.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.0625,random_state=65)

In [35]:
# train(RandomForestClassifier(),X_train,X_test,y_train,y_test,name='baseline')

In [36]:
# Decomposition
# from sklearn.decomposition import PCA
# from sklearn.decomposition import KernelPCA

In [37]:
# pca = KernelPCA(11)

In [38]:
# X_train = pca.fit_transform(X_train)
# X_test = pca.transform(X_test)

In [39]:
# train(RandomForestClassifier(),X_train,X_test,y_train,y_test,name='KernelPCA-decomposition')

In [40]:
# Feature Selection
# from sklearn.feature_selection import VarianceThreshold
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import RFECV
# from sklearn.feature_selection import SelectFromModel

In [41]:
# fs = SelectFromModel(RandomForestClassifier(),norm_order=11)

In [42]:
# X_train = fs.fit_transform(X_train,y_train)

In [43]:
# train(RandomForestClassifier(),X_train,X_test,y_train,y_test,name='SelectFromModel-decomposition')

In [44]:
# Preprocessing
from sklearn.preprocessing import (
    StandardScaler,
    RobustScaler,
    MinMaxScaler,
    MaxAbsScaler,
    OneHotEncoder,
    Normalizer,
    Binarizer
)

In [45]:
# Transformer classes (not instances) to try in the preprocessing experiment;
# the scalers listed in the trailing comment are currently excluded.
preprocessings = [Normalizer,Binarizer] # StandardScaler,RobustScaler,MinMaxScaler,MaxAbsScaler,

In [46]:
# Snapshot the untransformed split so experiments that overwrite
# X_train / X_test can restore the original features.
X_train_old = X_train.copy()
X_test_old = X_test.copy()

In [47]:
# for preprocessing in preprocessings:
#     X_train = X_train_old.copy()
#     X_test = X_test_old.copy()
#     preprocessing = preprocessing()
#     X_train = preprocessing.fit_transform(X_train)
#     X_test = preprocessing.transform(X_test)
#     train(RandomForestClassifier(),X_train,X_test,y_train,y_test,name=f'preprocessing-{preprocessing}')

In [48]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier,VotingClassifier,BaggingClassifier,RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from catboost import CatBoost,CatBoostClassifier
from xgboost import XGBClassifier,XGBRFClassifier

In [49]:
# Candidate estimators as [run-name, class] pairs; each class is instantiated
# with its default arguments by the model-comparison loop.
# The original list contained 'BaggingClassifier' twice, which would train and
# log the same model twice under the same run name; the duplicate is removed.
# NOTE(review): VotingClassifier cannot be built without an ``estimators``
# argument, so ``VotingClassifier()`` raises -- confirm whether it should stay.
models = [
    ['KNeighborsClassifier',KNeighborsClassifier],
    ['DecisionTreeClassifier',DecisionTreeClassifier],
    ['GradientBoostingClassifier',GradientBoostingClassifier],
    ['AdaBoostClassifier',AdaBoostClassifier],
    ['VotingClassifier',VotingClassifier],
    ['BaggingClassifier',BaggingClassifier],
    ['RandomForestClassifier',RandomForestClassifier],
    ['SVC',SVC],
    ['ExtraTreesClassifier',ExtraTreesClassifier],
    ['CatBoost',CatBoost],
    ['CatBoostClassifier',CatBoostClassifier],
    ['XGBClassifier',XGBClassifier],
    ['XGBRFClassifier',XGBRFClassifier],
]

In [52]:
# for model in models:
#     try:
#         train(model[1](),X_train,X_test,y_train,y_test,name=f'model-{model[0]}')
#     except:
#         pass

In [54]:
# train(XGBClassifier(),X_train,X_test,y_train,y_test,name=f'XGBClassifier')

In [None]:
# Hyper-parameter grid for ExtraTreesClassifier, restricted to valid values:
#   * min_samples_split: floats must be fractions in (0, 1], so the original
#     2.5 / 1.25 / 5.0 were invalid; the integer candidates 2 and 5 are kept.
#   * max_features: 'auto' was an alias of 'sqrt' for this estimator and has
#     been removed from recent scikit-learn releases, so it is dropped.
#   * warm_start: dropped, because GridSearchCV fits a fresh clone for every
#     candidate, so it only doubled the number of fits.
_shared_grid = {
    'n_estimators':[25,50,75,100,125,250,375,500,625,750,1000],
    'criterion':['gini','entropy'],
    'max_depth':[1,2,3,4,5,None],
    'min_samples_split':[2,5],
    'min_samples_leaf':[1,2,5,7,10],
    'max_features':['sqrt','log2'],
    'class_weight':['balanced','balanced_subsample']
}
# oob_score=True is only valid together with bootstrap=True, so the two
# parameters are searched as coupled sub-grids instead of a full product.
param_grid = [
    {**_shared_grid,'bootstrap':[False],'oob_score':[False]},
    {**_shared_grid,'bootstrap':[True],'oob_score':[False,True]},
]
# NOTE: this is still a large exhaustive search (every combination x 5 folds).
model = ExtraTreesClassifier()
model = GridSearchCV(model,cv=5,verbose=5,param_grid=param_grid).fit(X,y)