In [2]:
%matplotlib inline

In [None]:
path = "data/"

In [1]:
import numpy as np

np.set_printoptions(precision=4, linewidth=100)
import pandas as pd
from scipy.stats import mode
from theano.sandbox import cuda

SkipTest: You are importing theano.sandbox.cuda. This is the old GPU back-end and is removed from Theano. Use Theano 0.9 to use it. Even better, transition to the new GPU back-end! See https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

In [5]:
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if big_string.find(substring) != -1:
            return substring
    print(big_string)
    return np.nan

In [6]:
def phase1clean(df):
    # setting silly values to nan
    df.Fare = df.Fare.map(lambda x: np.nan if x == 0 else x)

    # Special case for cabins as nan may be signal
    df.Cabin = df.Cabin.fillna('Unknown')

    # Creating a title column from name
    title_list = ['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                  'Dr', 'Ms', 'Mlle', 'Col', 'Capt', 'Mme', 'Countess',
                  'Don', 'Jonkheer']

    df['Title'] = df['Name'].map(lambda x: substrings_in_string(x, title_list))

    # replacing all titles with mr, mrs, miss, master
    def replace_titles(x):
        title = x['Title']
        if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
            return 'Mr'
        elif title in ['Countess', 'Mme']:
            return 'Mrs'
        elif title in ['Mlle', 'Ms']:
            return 'Miss'
        elif title == 'Dr':
            if x['Sex'] == 'Male':
                return 'Mr'
            else:
                return 'Mrs'
        else:
            return title
    df['Title'] = df.apply(replace_titles, axis=1)

    # Turning cabin number into Deck
    cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
    df['Deck'] = df['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))

    # Creating new family size column
    df['Family_Size'] = df['SibSp'] + df['Parch']

    return df


In [7]:
def phase2clean(train, test):
    # data type dictionary
    data_type_dict = {'Pclass': 'ordinal', 'Sex': 'nominal', 'Age': 'numeric',
                      'Fare': 'numeric', 'Embarked': 'nominal', 'Title': 'nominal',
                      'Deck': 'nominal', 'Family_Size': 'ordinal'}

    # imputing nan values
    for df in [train, test]:
        classmeans = pd.pivot_table(df, values='Fare', index='Pclass', aggfunc='mean')
        #df.Fare = df.apply(lambda x: classmeans[x['Pclass']] if pd.isnull(x['Fare']) else x['Fare'], axis=1)
        meanAge = np.mean(df.Age)
        df.Age = df.Age.fillna(meanAge)
        #modeEmbarked = mode(df.Embarked)[0][0]
        df.Embarked = df.Embarked.fillna('C')

    # Fare per person    
    for df in [train, test]:
        df['Fare_Per_Person'] = df['Fare'] / (df['Family_Size'] + 1)

    # Age times class
    for df in [train, test]:
        df['Age*Class'] = df['Age'] * df['Pclass']

    data_type_dict['Fare_Per_Person'] = 'numeric'
    data_type_dict['Age*Class'] = 'numeric'

    return [train, test, data_type_dict]


In [8]:
def discretise_numeric(train, test, data_type_dict, no_bins=10):
   N=len(train)
   M=len(test)
   test=test.rename(lambda x: x+N)
   joint_df=train.append(test)
   for column in data_type_dict:
       if data_type_dict[column]=='numeric':
           joint_df[column]=pd.qcut(joint_df[column], 10, duplicates='drop')
           data_type_dict[column]='ordinal'
   train=joint_df.ix[range(N)]
   test=joint_df.ix[range(N,N+M)]
   return train, test, data_type_dict

In [9]:
def clean(no_bins=0):
    trainpath = 'data/train.csv'
    testpath = 'data/test.csv'
    traindf = pd.read_csv(trainpath)
    testdf = pd.read_csv(testpath)

    traindf=phase1clean(traindf)
    testdf=phase1clean(testdf)

    traindf, testdf, data_type_dict=phase2clean(traindf, testdf)

    traindf, testdf, data_type_dict=discretise_numeric(traindf, testdf, data_type_dict)

    # create a submission file for kaggle
    predictiondf = pd.DataFrame(testdf['PassengerId'])
    predictiondf['Survived']=[0 for x in range(len(testdf))]
    predictiondf.to_csv('data/predictions.csv', index=False)
    
    return [traindf, testdf, data_type_dict]

In [33]:
def features_from_dataframe(df):
    features = df.drop({'Name', 'Ticket', 'Cabin', 'Embarked', 'Deck', 'Survived', 'PassengerId'}, axis=1)
    return features

def labels_from_dataframe(df):
    labels = df[['PassengerId', 'Survived']]
    return labels

def get_bn_layers(p):
    return [
        MaxPooling2D(input_shape=conv_layers[-1].output_shape[1:]),
        Flatten()
    ]

traindf, testdf, data_type_dict = clean()
train_features = features_from_dataframe(traindf)
train_labels = labels_from_dataframe(traindf).values
test_features = features_from_dataframe(testdf)
test_labels = pd.read_csv('data/gender_submission.csv')
print(train_labels[:10])


[[  1.   0.]
 [  2.   1.]
 [  3.   1.]
 [  4.   1.]
 [  5.   0.]
 [  6.   0.]
 [  7.   0.]
 [  8.   0.]
 [  9.   1.]
 [ 10.   1.]]


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  # Remove the CWD from sys.path while we load stuff.
