In [37]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily flatten the directory walk into one stream of full file paths,
# then print them one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [38]:
from sklearn.metrics import mean_absolute_error, precision_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load the Titanic training set.
data = pd.read_csv('../input/titanic/train.csv')

# Model inputs and the reasoning behind each choice:
#   Pclass - the ticket class can be an important factor
#   Sex    - might also be a factor
#   Fare   - hypothesis: the lower the fare, the lower the social class,
#            the lower the protection
features = ['Pclass', 'Sex', 'Fare']

# Encode Sex numerically (male -> 0, female -> 1) because scikit-learn
# needs numeric input. Any value outside the mapping becomes NaN here and is
# handled by the imputation below.
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})

# Impute missing feature values instead of dropping rows.
# The Fare mean is computed on the Fare column alone: DataFrame.mean() on the
# whole frame also tries to average every other column, which raises TypeError
# on pandas >= 2.0 if the frame has any text columns. Series.mean() skips NaN
# by default, so the result matches the previous skipna=True intent.
# Missing Sex is assumed to be 1 (female) and missing Pclass to be 2.
data = data.fillna(value={'Fare': data['Fare'].mean(), 'Sex': 1, 'Pclass': 2})


In [39]:
# Feature matrix (the columns named in `features`) and the target column.
X = data[features]
y = data.Survived

# Hold out part of the data for validation; the fixed seed keeps the split
# reproducible between runs.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Build the random forest and train it in one step (fit() returns the estimator).
model = RandomForestClassifier(random_state=1).fit(train_X, train_y)

# Predicted labels for the held-out validation rows.
predictions = model.predict(val_X)

# Evaluate the binary classifier.
# Precision: of the rows predicted positive, the fraction that really are positive
# (i.e. how well the model avoids false positives).
# Recall: the fraction of all real positives that the model finds.
# Only precision is computed and printed here.
precision = precision_score(val_y, predictions)
print(precision)

0.9166666666666666


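As the result shows, the model's precision on the validation split is about 91.7% (0.9167), which is high. Precision alone does not account for survivors the model misses, so recall and accuracy should also be checked before concluding that the model is close to optimal.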

In [40]:
# Load the Kaggle test set.
test_file = pd.read_csv('../input/titanic/test.csv')

# Encode Sex exactly as in training (male -> 0, female -> 1).
# This must happen BEFORE imputation: filling a missing Sex with a number first
# would produce a value the mapping does not know, which map() turns back into NaN.
test_data = test_file.assign(Sex=test_file['Sex'].map({'male': 0, 'female': 1}))

# Impute the model inputs the same way the training data was imputed, so the
# model sees values from the same distribution it was trained on:
#   Fare   -> mean Fare of the training data (`data`), not 0
#   Sex    -> 1
#   Pclass -> 2
# No rows are removed, so every test passenger gets a prediction.
test_data = test_data.fillna(value={'Fare': data['Fare'].mean(), 'Sex': 1, 'Pclass': 2})

# Columns that are not model inputs keep the previous blanket 0 fill.
test_data = test_data.fillna(0)

# Select the same feature columns the model was trained on.
test_X = test_data[features]

# Predict survival (0/1) for every test passenger.
test_predictions = model.predict(test_X)

# Passenger ids as a numpy array, aligned row-for-row with the predictions.
PID = np.array(test_data.PassengerId)

# Assemble the submission table and write it without the index column.
output = pd.DataFrame({'PassengerId': PID, 'Survived': test_predictions})
output.to_csv('final.csv', index=False)