In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Read the three CSV files in one pass (same order as before) and preview the
# first rows of the training data.
gender_dataset, training_set, test_set = (
    pd.read_csv(csv_name)
    for csv_name in ('gender_submission.csv', 'train.csv', 'test.csv')
)
training_set.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# x_train = training_set.iloc[:, [0,2,4,5,6,7,8,9,11]]
# y_train = training_set.iloc[:,1:2]

In [None]:
# x_test = test_set.iloc[:, [0,1,3,4,5,6,7,8,9,10]]
# y_test = gender_dataset.iloc[:,1]

In [None]:
# List the column labels; later cells address columns by integer position in this order.
training_set.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

# Data Preprocessing

## Imputing Missing Data

In [None]:
from sklearn.impute import SimpleImputer

# One imputer per column so that each keeps its own fill value:
#   imputer_age      -> median of the observed values
#   imputer_embarked -> most frequent value
#   imputer_fare     -> most frequent value
imputer_age = SimpleImputer(strategy='median', missing_values=np.nan)
imputer_embarked = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
imputer_fare = SimpleImputer(strategy='most_frequent', missing_values=np.nan)

### Training set

In [None]:
# Age (column 5): learn the median from the training data and fill the gaps.
training_set.iloc[:, 5:6] = imputer_age.fit_transform(training_set.iloc[:, 5:6])
# Embarked (column 11): learn the most frequent value and fill the gaps.
training_set.iloc[:, 11:12] = imputer_embarked.fit_transform(training_set.iloc[:, 11:12])

### Test Set

In [None]:
# Fill the test set using statistics learned from the TRAINING data only, so that
# nothing about the test set leaks into preprocessing and both sets receive the
# same fill values.
# imputer_age and imputer_embarked were already fitted on the training set in the
# training-set imputation cell; they are reused here without refitting.
test_set.iloc[:, 4:5] = imputer_age.transform(test_set.iloc[:, 4:5])            # Age
test_set.iloc[:, 10:11] = imputer_embarked.transform(test_set.iloc[:, 10:11])    # Embarked
# Fare is only imputed for the test set, but its fill value is still taken from the
# training set's Fare column (position 9 in training_set, position 8 in test_set).
imputer_fare.fit(training_set.iloc[:, 9:10])
test_set.iloc[:, 8:9] = imputer_fare.transform(test_set.iloc[:, 8:9])            # Fare

## Encoding Columns

### Training Set

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode Pclass (2), Sex (4) and Embarked (11). Every other column is
# passed through unchanged and placed after the encoded columns.
categorical_columns = [2, 4, 11]
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), categorical_columns)],
    remainder='passthrough',
)
training_set = np.array(ct.fit_transform(training_set))

In [None]:
# training_set = training_set[:, 1:19]

In [None]:
# Preview the encoded array: the one-hot columns come first, the passthrough columns follow.
print(training_set)

[[0.0 0.0 1.0 ... 'A/5 21171' 7.25 nan]
 [1.0 0.0 0.0 ... 'PC 17599' 71.2833 'C85']
 [0.0 0.0 1.0 ... 'STON/O2. 3101282' 7.925 nan]
 ...
 [0.0 0.0 1.0 ... 'W./C. 6607' 23.45 nan]
 [1.0 0.0 0.0 ... '111369' 30.0 'C148']
 [0.0 0.0 1.0 ... '370376' 7.75 nan]]


In [None]:
# Column count after encoding. The 12 original columns minus the 3 encoded ones leave
# 9 passthrough columns, so the remaining columns are the one-hot dummies.
print(training_set.shape[1])

17


### Test Set

In [None]:
# Encode the test set with the SAME category lists that `ct` learned from the training
# set. This guarantees identical one-hot columns in identical order even if a category
# is missing from the test data; the positional column selection in later cells relies
# on that layout. A category that appears only in the test data raises an error
# instead of silently shifting columns.
# The categorical columns sit at [1, 3, 10] here (vs. [2, 4, 11] in the training frame),
# which is why a second transformer is built rather than calling ct.transform directly.
train_categories = ct.named_transformers_['encoder'].categories_
ct1 = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(categories=train_categories), [1, 3, 10])],
    remainder='passthrough',
)
test_set = np.array(ct1.fit_transform(test_set))

In [None]:
# Column count after encoding the test set (one fewer than the training array's 17).
print(test_set.shape[1])

16


### Removing Redundant Dummy Variables and Unused Columns

In [None]:
# Drop the first dummy of each encoded feature plus PassengerId, Name and Ticket.
# Resulting layout (assuming the encoder emits Pclass, Sex, Embarked dummies in that order):
#   0-1 Pclass dummies, 2 Sex dummy, 3-4 Embarked dummies, 5 Survived,
#   6 Age, 7 SibSp, 8 Parch, 9 Fare, 10 Cabin
kept_train_columns = [1, 2, 4, 6, 7, 9, 11, 12, 13, 15, 16]
training_set = training_set[:, kept_train_columns]

In [None]:
# Same selection for the test array, which has no Survived column.
# Cabin is left out; add index 15 to the list to keep it.
kept_test_columns = [1, 2, 4, 6, 7, 10, 11, 12, 14]
test_set = test_set[:, kept_test_columns]

## Splitting the dataset

### Training Set

In [None]:
# Features: every kept column except the label (index 5) and Cabin (index 10).
feature_columns = [0, 1, 2, 3, 4, 6, 7, 8, 9]
x_train = training_set[:, feature_columns]
# Label: column 5, kept as an (n, 1) integer column.
y_train = training_set[:, 5:6].astype('int')

### Test Set

In [None]:
# Copy instead of aliasing: x_test is scaled in place by a later cell, and without the
# copy that would also overwrite test_set (and scale it twice if this section is re-run).
x_test = test_set.copy()

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# A single scaler instance: it is fitted on the training features and then reused
# to transform the test features.
sc = StandardScaler()

### Training Set

In [None]:
# Standardise the last four feature columns; the dummy columns 0-4 are left as they are.
# Fit the scaler here so that the test set can reuse the training statistics.
scaled_columns = [5, 6, 7, 8]
x_train[:, scaled_columns] = sc.fit_transform(x_train[:, scaled_columns])

### Test Set

In [None]:
# Apply the already-fitted scaler (no refit) to the same four columns of the test features.
scaled_columns = [5, 6, 7, 8]
x_test[:, scaled_columns] = sc.transform(x_test[:, scaled_columns])

In [None]:
# Show features and label side by side (label is the last column).
print(np.concatenate((x_train, y_train), axis=1))

[[0.0 1.0 1.0 ... -0.4736736092984604 -0.5024451714361923 0]
 [0.0 0.0 0.0 ... -0.4736736092984604 0.7868452935884461 1]
 [0.0 1.0 0.0 ... -0.4736736092984604 -0.4888542575852486 1]
 ...
 [0.0 1.0 0.0 ... 2.0089333664952354 -0.17626323901354432 0]
 [0.0 0.0 1.0 ... -0.4736736092984604 -0.04438103794142432 1]
 [0.0 1.0 1.0 ... -0.4736736092984604 -0.49237782784290063 0]]


# Model Selection

In [None]:
!pip install catboost



In [None]:
from catboost import CatBoostClassifier

# Report progress every 100 iterations instead of one log line per boosting round,
# which otherwise floods the notebook with output. All other settings stay at their defaults.
classifier = CatBoostClassifier(verbose=100)
classifier.fit(x_train, y_train)

Learning rate set to 0.009807
0:	learn: 0.6869072	total: 49.6ms	remaining: 49.5s
1:	learn: 0.6800464	total: 50.7ms	remaining: 25.3s
2:	learn: 0.6737653	total: 51.8ms	remaining: 17.2s
3:	learn: 0.6672710	total: 52.9ms	remaining: 13.2s
4:	learn: 0.6613521	total: 53.9ms	remaining: 10.7s
5:	learn: 0.6550461	total: 55ms	remaining: 9.11s
6:	learn: 0.6502814	total: 55.6ms	remaining: 7.89s
7:	learn: 0.6442738	total: 56.7ms	remaining: 7.03s
8:	learn: 0.6386244	total: 57.8ms	remaining: 6.36s
9:	learn: 0.6348023	total: 58.5ms	remaining: 5.79s
10:	learn: 0.6298206	total: 59.6ms	remaining: 5.36s
11:	learn: 0.6254502	total: 60.6ms	remaining: 4.99s
12:	learn: 0.6225412	total: 61.3ms	remaining: 4.65s
13:	learn: 0.6195898	total: 62ms	remaining: 4.36s
14:	learn: 0.6150049	total: 63.2ms	remaining: 4.15s
15:	learn: 0.6103579	total: 64.3ms	remaining: 3.95s
16:	learn: 0.6053258	total: 65.8ms	remaining: 3.8s
17:	learn: 0.6008433	total: 67ms	remaining: 3.65s
18:	learn: 0.5962075	total: 68.2ms	remaining: 3.52s

<catboost.core.CatBoostClassifier at 0x7f05971e4710>

# Predicting Results

In [None]:
# Quick sanity check: a NaN anywhere in x_test would make the total NaN as well.
array_sum = x_test.sum()
np.isnan(array_sum)

False

In [None]:
# y_train is stored as an (n, 1) column. Flatten it so the accuracy is computed
# element-wise against the 1-D predictions; comparing a column vector with a flat
# vector broadcasts to an (n, n) matrix and yields a meaningless average.
print(classifier.score(x_train, np.ravel(y_train)))

0.5387205387205387


In [None]:
# Predict a class label for every row of the preprocessed test features.
y_pred = classifier.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Reference labels: the second column of gender_submission.csv, kept as a one-column frame.
# NOTE(review): presumably this file is a sample submission rather than ground truth, in
# which case the numbers below measure agreement with that baseline — confirm.
y_test = gender_dataset.iloc[:,1:2]
# Rows are reference labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Fraction of test rows where the prediction equals the reference label.
accuracy_score(y_test,y_pred)

[[246  20]
 [ 30 122]]


0.8803827751196173