In [26]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [27]:
# Load the three input CSV files from the current working directory.
# training_set / test_set feed the preprocessing cells below;
# gender_dataset is read later for the PassengerId column and as y_test.
training_set = pd.read_csv('train.csv')
test_set = pd.read_csv('test.csv')
gender_dataset = pd.read_csv('gender_submission.csv')

# Data Preprocessing

## Imputing Missing Data

In [28]:
from sklearn.impute import SimpleImputer

# One imputer per column so that each keeps its own fitted fill value.
# Age is filled with the column median; embarked and fare are filled with
# the most frequent value of the column they are fitted on.
imputer_age = SimpleImputer(strategy='median', missing_values=np.nan)
imputer_embarked = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
imputer_fare = SimpleImputer(strategy='most_frequent', missing_values=np.nan)

### Training set

In [29]:
# Fit each imputer on its training column and write the filled values back
# in place. The 5:6 / 11:12 slices keep the selection two-dimensional, which
# is the input shape SimpleImputer expects.
# NOTE(review): positions 5 and 11 are presumably the age and embarked
# columns (going by the imputer names) - confirm against train.csv.
training_set.iloc[:, 5:6] = imputer_age.fit_transform(training_set.iloc[:, 5:6])
training_set.iloc[:, 11:12] = imputer_embarked.fit_transform(training_set.iloc[:, 11:12])

### Test Set

In [30]:
# Fill missing values in the test set using statistics learned from the
# TRAINING data only. Re-fitting the imputers on the test set would fill the
# two sets with different values and let test-set statistics leak into the
# preprocessing.

# imputer_age and imputer_embarked were already fitted on the training set in
# the "Training set" cell, so here they are only applied.
# NOTE(review): positions 4 and 10 are presumably the age and embarked
# columns of test.csv (going by the imputer names) - confirm.
test_set.iloc[:, 4:5] = imputer_age.transform(test_set.iloc[:, 4:5])
test_set.iloc[:, 10:11] = imputer_embarked.transform(test_set.iloc[:, 10:11])

# imputer_fare has not been fitted before this cell: fit it on the training
# column that carries the same name as test column 8, then use that fitted
# value to fill the gaps in the test column.
fare_column = test_set.columns[8]
imputer_fare.fit(training_set[[fare_column]])
test_set.iloc[:, 8:9] = imputer_fare.transform(test_set.iloc[:, 8:9])

## Encoding Columns

### Training Set

In [31]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the columns at positions 2, 4 and 11; all remaining columns
# are passed through untouched. The result replaces training_set as a plain
# NumPy array, so column names are no longer available after this cell.
# NOTE(review): 11 is the column imputed as "embarked" above; 2 and 4 are
# presumably the other categorical columns - confirm against train.csv.
ct = ColumnTransformer(
    remainder='passthrough',
    transformers=[('encoder', OneHotEncoder(), [2, 4, 11])],
)
training_set = np.array(ct.fit_transform(training_set))

### Test Set

In [32]:
# Same encoding as for the training set, but with its own transformer because
# the categorical columns sit at different positions (1, 3, 10) in test_set.
# NOTE(review): this encoder is fitted on the test set itself, so its output
# layout only matches the training one if both sets contain the same
# categories - confirm.
ct1 = ColumnTransformer(
    remainder='passthrough',
    transformers=[('encoder', OneHotEncoder(), [1, 3, 10])],
)
test_set = np.array(ct1.fit_transform(test_set))

### Removing Dummy Variables

In [33]:
# Keep only the listed column positions of the encoded training array; every
# other column is dropped.
# NOTE(review): presumably this removes one dummy column per encoded feature
# plus passthrough columns that are not used as features - confirm against
# the layout produced by ct.
train_keep_idx = [1, 2, 4, 6, 7, 9, 11, 12, 13, 15, 16]
training_set = training_set[:, train_keep_idx]

In [34]:
# Keep only the listed column positions of the encoded test array.
# To keep the cabin column as well, add index 15 to the list.
test_keep_idx = [1, 2, 4, 6, 7, 10, 11, 12, 14]
test_set = test_set[:, test_keep_idx]

## Splitting the dataset

### Training Set

In [35]:
# Feature matrix: every remaining column except position 5 (used as the
# target below) and position 10 (cabin, deliberately left out).
feature_idx = [0, 1, 2, 3, 4, 6, 7, 8, 9]
x_train = training_set[:, feature_idx]

# Target: column 5, kept two-dimensional (n_rows, 1) by the 5:6 slice and
# converted to integers.
y_train = training_set[:, 5:6].astype('int')

### Test Set

In [36]:
# x_test is a second name for the same array object as test_set (no copy is
# made), so the in-place scaling applied to x_test later also changes test_set.
x_test = test_set

## Feature Scaling

In [37]:
from sklearn.preprocessing import StandardScaler
# Single scaler instance shared by the two cells below: it is fitted on the
# training features and then re-used, already fitted, on the test features.
sc = StandardScaler()

### Training Set

In [38]:
# Fit the scaler on feature columns 5-8 of the training matrix and overwrite
# those columns with their standardised values; columns 0-4 are left as is.
numeric_cols = [5, 6, 7, 8]
x_train[:, numeric_cols] = sc.fit_transform(x_train[:, numeric_cols])

### Test Set

In [39]:
# Apply the scaler fitted on the training data (transform only, no re-fit)
# to the same four feature columns of the test matrix, in place.
numeric_cols = [5, 6, 7, 8]
x_test[:, numeric_cols] = sc.transform(x_test[:, numeric_cols])

# Model Training

In [64]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier; gamma='auto' is set explicitly.
classifier = SVC(kernel='rbf', gamma='auto')

# y_train is stored as an (n_rows, 1) column vector. Estimators expect a 1-D
# label array, and passing the column vector triggers scikit-learn's
# column_or_1d DataConversionWarning, so flatten it at the call site.
# y_train itself is left unchanged for any other cell that uses it.
classifier.fit(x_train, np.ravel(y_train))

  y = column_or_1d(y, warn=True)


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

# Predicting Results

In [65]:
# Mean accuracy of the fitted classifier on the data it was trained on
# (an optimistic figure - not an estimate of accuracy on unseen data).
train_accuracy = classifier.score(x_train, y_train)
print(train_accuracy)

0.8327721661054994


In [66]:
# Predicted class label for every row of the preprocessed test matrix.
y_pred = classifier.predict(X=x_test)

## Confusion Matrix and Accuracy Score

In [67]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Reference labels: second column of gender_submission.csv.
# NOTE(review): that file looks like a sample submission rather than real
# ground truth - confirm. If so, the numbers below measure agreement with
# that baseline, not the true accuracy of the model.
y_test = gender_dataset.iloc[:, 1:2]

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Last expression of the cell, so the notebook displays the value.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[253  13]
 [  9 143]]


0.9473684210526315

# Converting to .csv

## Creating a pandas DataFrame

In [88]:
# Predictions as a single-column frame; its only column is labelled 0.
y_pred_1 = pd.DataFrame(y_pred)
# First column of gender_submission.csv (PassengerId), kept as a frame.
y_pred_2 = pd.DataFrame(gender_dataset.iloc[:, 0:1])

# Put ids and predictions side by side (aligned on the row index) and give
# the prediction column its final name.
y_final = pd.concat([y_pred_2, y_pred_1], axis=1).rename(columns={0: 'Survived'})
y_final.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


## Exporting it to .csv format

In [90]:
# Write the submission to the working directory; index=False keeps the
# DataFrame's row index out of the file.
y_final.to_csv(path_or_buf='my_submission.csv', index=False)

Your submission is saved
     PassengerId  Survived
0            892         0
1            893         1
2            894         0
3            895         0
4            896         1
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]
