**Importing Libraries :**

In [1]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
import matplotlib as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from scipy.ndimage import shift
from IPython.display import clear_output
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
import seaborn
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
#import joypy
#import pywaffle

**Listing files under the input directory :**

In [2]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


**Reading CSV files:**

In [3]:
train_data = pd.read_csv(Path("/kaggle/input/titanic/train.csv"))
test_data = pd.read_csv(Path("/kaggle/input/titanic/test.csv"))
gender_submission = pd.read_csv(Path("/kaggle/input/titanic/gender_submission.csv"))

**Preparing the data:**

In [4]:
def opaw_tabular_histogram(dataframe,title="Histogram",maxcolors=32):
    #import matplotlib as plt
    canvas , pic = plt.pyplot.subplots(1,1,figsize=(12,6))
    comap = plt.cm.get_cmap("rainbow",256)
    newcolors = comap(np.linspace(0, 1, maxcolors))
    dataframe.plot.hist(ax=pic,color=newcolors)
    pic.set_title(title,fontsize= 20.0)
    pic.legend(ncol=2,loc ="upper right")
    canvas.tight_layout()
    canvas.show()
    return

In [5]:
def _fetch_token_index(value,arr):
    for i , x in enumerate(arr) :
        if (value==x):
            return i
    return


def add_token_index(dataframe,df_colname):
    for col in df_colname:
        tname = col + "_tokenize"
        arrname = dataframe[col].unique()
        dataframe[tname] = dataframe[col].apply(_fetch_token_index,args=(arrname,))
    return

In [6]:
def _fetch_checksum(dataframe):
    dataframe["checksum"] = dataframe.apply(lambda x:np.mean(tuple(x)),axis=1)
    return

In [7]:
def draw_tabular_correlogram(dataframe,title="",figsize=(6,5)):
    canvas = plt.pyplot.figure(figsize=figsize)
    seaborn.heatmap(dataframe.corr(),xticklabels=dataframe.corr().columns,yticklabels=dataframe.corr().columns,cmap="viridis_r",center=0,annot=True)
    plt.pyplot.title(title,fontsize=20.0)
    canvas.show()

In [8]:
def draw_tanular_joyplot(dataframe,x=[],y='network',t='',legloc='upper left'):
    canvas , pic = joypy.joyplot(dataframe,column=x,by=y,ylim='own',figsize=(12,6),overlap=1)
    plt.pyplot.title(t,fontsize=22)
    pics[0].legend(nloc=2,loc=legloc)
    canvas.tight_layout()
    canvas.show()
    return

In [9]:
def draw_tabular_waffle(dataframe_org,col,title='',legloc='lower center', anchor=(0.5, -0.5)):
    dataframe = dataframe_org.groupby(col).size().reset_index(name='counts')
    cat = dataframe.shape[0]
    colors = [pt.pyplot.cm.nipy_spectral(i/float(cat)) for i in range(cat)]
    canvas = matplotlib.pyplot.figure( FigureClass=pywaffle.Waffle,rows=4,values=df['counts'],colors=colors,figsize=(10, 8))
    canvas.tight_layout()
    canvas.show()
    return

In [10]:
train_data = train_data.drop(["Ticket",'Name'],axis=1)
X_train_data = train_data.drop("Survived",axis=1)
y_train_data = train_data["Survived"]
test_data = test_data.drop(["Ticket",'Name'],axis=1)

**Imputing:**

In [11]:
imputer = SimpleImputer(strategy="most_frequent")

In [12]:
imputer.fit(pd.concat([X_train_data,test_data]))

In [13]:
c = X_train_data.columns
i_train = X_train_data.index
i_test = test_data.index

In [14]:
X_train_data = pd.DataFrame(imputer.transform(X_train_data),columns = c,index = i_train)

In [15]:
test_data = pd.DataFrame(imputer.transform(test_data),columns = c,index=i_test)

**One-hot encoding:**

In [16]:
def OneHotE(dataframe,cat_col,cat_encoder):
    dataframe_num = dataframe.drop(cat_col,axis=1)
    dataframe_cat = dataframe[cat_col]
    dataframe_cat = cat_encoder.transform(dataframe_cat)
    dataframe_cat = pd.DataFrame(dataframe_cat.toarray(),columns=cat_encoder.get_feature_names_out(),index=dataframe.index)
    dataframe = pd.concat([dataframe_num,dataframe_cat],axis=1)    
    return dataframe

In [17]:
str_cat = ['Sex','Cabin','Embarked']

In [18]:
cat_encoder = OneHotEncoder()

In [19]:
cat_encoder.fit(pd.concat([X_train_data[str_cat],test_data[str_cat]]))

In [20]:
X_train_data = OneHotE(X_train_data,str_cat,cat_encoder)
test_data = OneHotE(test_data,str_cat,cat_encoder)

**Training a model:**

In [21]:
pipeline = Pipeline([("classifier",RandomForestClassifier(random_state=42))])

In [22]:
param_grid = [{"classifier__n_estimators":[80,90,100,120],"classifier__criterion":["gini","entropy","log_loss"]}] 

In [23]:
grid_search = GridSearchCV(pipeline,param_grid,cv=3,scoring="accuracy")

In [24]:
grid_search.fit(X_train_data,y_train_data.astype('int'))

In [25]:
grid_search.best_params_


{'classifier__criterion': 'entropy', 'classifier__n_estimators': 80}

In [26]:
scaler = StandardScaler()

In [27]:
for column in X_train_data.columns:
    print(column)
    X_train_data[column] = scaler.fit_transform(np.array(X_train_data[column]).reshape(-1,1))

PassengerId
Pclass
Age
SibSp
Parch
Fare
Sex_female
Sex_male
Cabin_A10
Cabin_A11
Cabin_A14
Cabin_A16
Cabin_A18
Cabin_A19
Cabin_A20
Cabin_A21
Cabin_A23
Cabin_A24
Cabin_A26
Cabin_A29
Cabin_A31
Cabin_A32
Cabin_A34
Cabin_A36
Cabin_A5
Cabin_A6
Cabin_A7
Cabin_A9
Cabin_B10
Cabin_B101
Cabin_B102
Cabin_B11
Cabin_B18
Cabin_B19
Cabin_B20
Cabin_B22
Cabin_B24
Cabin_B26
Cabin_B28
Cabin_B3
Cabin_B30
Cabin_B35
Cabin_B36
Cabin_B37
Cabin_B38
Cabin_B39
Cabin_B4
Cabin_B41
Cabin_B42
Cabin_B45
Cabin_B49
Cabin_B5
Cabin_B50
Cabin_B51 B53 B55
Cabin_B52 B54 B56
Cabin_B57 B59 B63 B66
Cabin_B58 B60
Cabin_B61
Cabin_B69
Cabin_B71
Cabin_B73
Cabin_B77
Cabin_B78
Cabin_B79
Cabin_B80
Cabin_B82 B84
Cabin_B86
Cabin_B94
Cabin_B96 B98
Cabin_C101
Cabin_C103
Cabin_C104
Cabin_C105
Cabin_C106
Cabin_C110
Cabin_C111
Cabin_C116
Cabin_C118
Cabin_C123
Cabin_C124
Cabin_C125
Cabin_C126
Cabin_C128
Cabin_C130
Cabin_C132
Cabin_C148
Cabin_C2
Cabin_C22 C26
Cabin_C23 C25 C27
Cabin_C28
Cabin_C30
Cabin_C31
Cabin_C32
Cabin_C39
Cabin_C45
Cabin_C

In [28]:
pipeline = Pipeline([("classifier",SVC(kernel='poly',random_state=42))])

In [29]:
param_grid = [{"classifier__C":[0.5,1,2,6],
               "classifier__coef0":[0.5,1,2],
              "classifier__degree":[2,3]}] 

In [30]:
grid_search = GridSearchCV(pipeline,param_grid,cv=3,scoring="accuracy")

In [31]:
%%time
grid_search.fit(X_train_data,y_train_data.astype('int'))

CPU times: user 4.79 s, sys: 81.5 ms, total: 4.87 s
Wall time: 4.87 s


In [32]:
cv_res = pd.DataFrame(grid_search.cv_results_)
cv_res.sort_values(by="mean_test_score", ascending=False, inplace=True)

In [33]:
cv_res.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__C,param_classifier__coef0,param_classifier__degree,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
23,0.046394,0.002842,0.017352,0.000643,6.0,2,3,"{'classifier__C': 6, 'classifier__coef0': 2, '...",0.804714,0.821549,0.808081,0.811448,0.007274,1
21,0.043934,0.001275,0.017935,0.000881,6.0,1,3,"{'classifier__C': 6, 'classifier__coef0': 1, '...",0.794613,0.814815,0.787879,0.799102,0.011446,2
22,0.044091,0.000882,0.048183,0.043772,6.0,2,2,"{'classifier__C': 6, 'classifier__coef0': 2, '...",0.79798,0.808081,0.781145,0.795735,0.011111,3
20,0.043634,0.001373,0.01882,0.000398,6.0,1,2,"{'classifier__C': 6, 'classifier__coef0': 1, '...",0.79798,0.808081,0.781145,0.795735,0.011111,3
5,0.043007,0.002106,0.017442,0.000368,0.5,2,3,"{'classifier__C': 0.5, 'classifier__coef0': 2,...",0.79798,0.808081,0.781145,0.795735,0.011111,3


**Final Evaluation:**

In [34]:
final_evaluation = grid_search.predict(test_data)

In [35]:

d = {'PassengerId':np.arange(len(final_evaluation)) + 1+891,'Survived':final_evaluation}
df = pd.DataFrame(data=d)
df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,0
415,1307,0
416,1308,0


In [36]:
df.to_csv('out.csv',index=False)