# Titanic Prediction Problem Example
______________________________________________________________________________________________________

In [22]:
import matplotlib
import matplotlib.pyplot as plt 
import numpy as np 
import pandas as pd 
import sklearn.linear_model


trainSet = pd.read_csv(r"Titanic\Data\train.csv")

**The above code imports necessary libraries and reads in values from a .csv file and loads them into the _DataFrame type_ defined within the pandas library.**

Entries are accessed via an indexing based on the columns of the csv file. For example, the TrainSet['Age'] access the column of the csv containing all of the ages.

In [23]:
# Drop vals
# trainSet.dropna(inplace=True)

# Replace vals with mean
trainSet.fillna(trainSet.mean(), inplace=True)

**The entries without a value are replaced with a _NaN_ value which will cause problems in training if not handled correctly.** 

There are two main options here, using the **dropna** function, which removes rows that have _NaN_ values, or the **fillna** function which replaces these values with a preset value (i.e. the average).

In [24]:
# Map version
#trainSet['Sex'] = map(lambda x: 1 if x == 'male' else 0, trainSet['Sex'])

# List comprehension version
trainSet['Sex'] = [1 if i == 'male' else '0' for i in trainSet['Sex'] ]

**This code prepares the data in the column which lists passengers in terms of gender. In order to train the logisitic regression model, this should be converted into numerical data**

The two methods shown here are equally valid ways in completing that task; however, the second option, which uses a list comprehension, is much more readable that the first.

_Note: A for loop can also be used, but it is slower than a list comprehension and does not provide a significant increase in readability in this particular instance_

In [32]:
X = np.array(trainSet[['Age','Sex','Pclass']]).reshape(-1,3)
y = trainSet['Survived']

for item in X:
        print(item)

model = sklearn.linear_model.LogisticRegression()
model.fit(X,y)

[22.0 1 3L]
[38.0 '0' 1L]
[26.0 '0' 3L]
[35.0 '0' 1L]
[35.0 1 3L]
[29.69911764705882 1 3L]
[54.0 1 1L]
[2.0 1 3L]
[27.0 '0' 3L]
[14.0 '0' 2L]
[4.0 '0' 3L]
[58.0 '0' 1L]
[20.0 1 3L]
[39.0 1 3L]
[14.0 '0' 3L]
[55.0 '0' 2L]
[2.0 1 3L]
[29.69911764705882 1 2L]
[31.0 '0' 3L]
[29.69911764705882 '0' 3L]
[35.0 1 2L]
[34.0 1 2L]
[15.0 '0' 3L]
[28.0 1 1L]
[8.0 '0' 3L]
[38.0 '0' 3L]
[29.69911764705882 1 3L]
[19.0 1 1L]
[29.69911764705882 '0' 3L]
[29.69911764705882 1 3L]
[40.0 1 1L]
[29.69911764705882 '0' 1L]
[29.69911764705882 '0' 3L]
[66.0 1 2L]
[28.0 1 1L]
[42.0 1 1L]
[29.69911764705882 1 3L]
[21.0 1 3L]
[18.0 '0' 3L]
[14.0 '0' 3L]
[40.0 '0' 3L]
[27.0 '0' 2L]
[29.69911764705882 1 3L]
[3.0 '0' 2L]
[19.0 '0' 3L]
[29.69911764705882 1 3L]
[29.69911764705882 1 3L]
[29.69911764705882 '0' 3L]
[29.69911764705882 1 3L]
[18.0 '0' 3L]
[7.0 1 3L]
[21.0 1 3L]
[49.0 '0' 1L]
[29.0 '0' 2L]
[65.0 1 1L]
[29.69911764705882 1 1L]
[21.0 '0' 2L]
[28.5 1 3L]
[5.0 '0' 2L]
[11.0 1 3L]
[22.0 1 3L]
[38.0 '0' 1L]
[45.0 1 

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

**The data needs to be reshaped so that it can be passed into the learning function.**

The learning function in this model is **Logistic Regression**.


In [26]:
testSet = pd.read_csv(r"Titanic\Data\test.csv")
testSet.fillna(trainSet.mean(), inplace=True)
testSet['Sex'] = map(lambda x: 1 if x == 'male' else 0, testSet['Sex'])
X_test = np.array(testSet[['Age','Sex','Pclass']]).reshape(-1,3)


**This is the code to prepare the testing data**

In [27]:
dataList = []
for item in X_test:
        res = model.predict(item.reshape(1,-1))
        dataList.append(*res) 

**This code iterates over each sample and and uses the previously trained model to make a list of predictions**

The ***** in front of res strips the values of extra characters (i.e. [1] becomes 1)

In [30]:
dataDict = {'PassengerId': testSet['PassengerId'], 'Survived': dataList}

dataDF = pd.DataFrame.from_dict(dataDict)

dataDF.to_csv(path_or_buf=r"Titanic\Data\predictions.csv",mode='w',index=False)

print(dataList)

[0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 

**This takes the data that was created and writes it to a csv file**