In [1]:
import numpy as np
import pandas as pd
%matplotlib inline

# Fix the random seed before anything else so every later stochastic step
# (train/validation split, model fitting) is reproducible.
seed = 7
np.random.seed(seed)

train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Save this for when we need to submit data back to Kaggle.
test_df_ids = test_df.loc[0:, "PassengerId"]

# Drop the columns that are not used as model features. The list is defined
# once so train and test are guaranteed to lose exactly the same columns.
dropped_columns = ["PassengerId", "Name", "Ticket", "Cabin"]
train_df = train_df.drop(dropped_columns, axis=1)
test_df = test_df.drop(dropped_columns, axis=1)

# Keep the preview as the LAST expression of the cell: Jupyter only renders
# the final expression, so a head() call in the middle is silently discarded.
train_df.head(n=5)

In [2]:
# Numeric columns whose gaps are filled with the TRAINING-set mean; the same
# training statistics are applied to the test set.
missing_values = ["Age", "Fare"]
train_means = train_df[missing_values].mean()

# Assign the filled frames back instead of calling
# `df.loc[:, col].fillna(..., inplace=True)`: that form mutates a temporary
# slice, which pandas warns about and which leaves the original frame
# untouched under Copy-on-Write. Passing a Series fills only the columns
# named in its index (Age and Fare here).
train_df = train_df.fillna(train_means)
test_df = test_df.fillna(train_means)

# Non-numeric case for Embarked: only 2 values are missing here, so we drop
# those rows from the training data.
train_df = train_df.dropna(axis="index")
    
#train_df.loc[:, "Age"].fillna(train_df.loc[:,"Age"].mean(), inplace =True)


#Scikit method. Will not implement atm.
#ageimp = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 1)
#x_train[:, [3,7]] = imp.transform(x_train[:,[3,7]])
#x_train[:, [3,7]] = imp.transform(x_train[:,[3,7]])
#Only works on separate columns
#imp = imp.fit(x_train[:,[3,7]])
#x_train[:, [3,7]] = imp.transform(x_train[:,[3,7]])
#x_train[:,3] = np.asarray(imp.fit_transform(np.asmatrix(x_train[:,7])))

In [3]:
# One-hot encode the categorical features in both frames.
categorical_columns = ["Sex", "Pclass", "Embarked"]
train_df = pd.get_dummies(train_df, columns=categorical_columns)
test_df = pd.get_dummies(test_df, columns=categorical_columns)

# The two frames are encoded independently, so a category present in only one
# of them would yield different columns. Align the test frame to the training
# feature columns (everything except the label): missing dummies become 0,
# extra ones are dropped, and the column order matches what the scaler and
# the model are fitted on.
feature_columns = train_df.columns.drop("Survived")
test_df = test_df.reindex(columns=feature_columns, fill_value=0)

train_df.head(n=5)

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,1,0,7.25,0,1,0,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,0,1,0,0,1,0,0
2,1,26.0,0,0,7.925,1,0,0,0,1,0,0,1
3,1,35.0,1,0,53.1,1,0,1,0,0,0,0,1
4,0,35.0,0,0,8.05,0,1,0,0,1,0,0,1


In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Hold out a quarter of the labelled rows for validation.
train, validate = train_test_split(train_df, random_state=seed, test_size=0.25)

# Positional split: column 0 is taken as the label, the remaining columns as
# features (the preview of train_df above shows Survived in first position).
y_train, x_train = train.iloc[:, 0], train.iloc[:, 1:]
y_val, x_val = validate.iloc[:, 0], validate.iloc[:, 1:]

# Learn the scaling statistics from the training features only, then apply
# that same transformation to every feature set.
scaler = StandardScaler().fit(x_train)
x_train, x_val, x_test = [
    scaler.transform(features) for features in (x_train, x_val, test_df)
]

In [7]:
from sklearn.linear_model import LogisticRegression
# Fit the classifier on the scaled training features (fit() returns the
# estimator itself), then predict labels for the validation features.
clf = LogisticRegression(random_state=seed).fit(x_train, y_train)
y_pred = clf.predict(x_val)

In [8]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
# Score the validation predictions against the held-out labels.
accuracy = accuracy_score(y_val, y_pred)
cm = confusion_matrix(y_val, y_pred)

In [9]:
# Display the validation confusion matrix. scikit-learn documents rows as the
# true labels and columns as the predicted labels.
cm

array([[108,  26],
       [ 24,  65]])

In [10]:
# Display the accuracy of the validation predictions.
accuracy

0.77578475336322872

In [11]:
# Predict labels for the scaled test features (x_test was produced by the
# scaler from test_df).
predictions = clf.predict(x_test)

In [12]:
# Display the predicted labels for the test set.
predictions

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0,

In [13]:
# Re-read the raw test file to recover the identifier column (its first
# column), which was dropped from test_df before modelling.
inputdf = pd.read_csv("test.csv")
passenger_ids = inputdf.iloc[:, 0]

# Pair each identifier with its predicted label and write the submission
# file without the DataFrame index.
data_to_submit = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': predictions})
data_to_submit.to_csv("log_reg_results.csv", index=False)

The resulting score on Kaggle is 75.6%.

In [16]:
# Measure mean training time
import time  # imported once here rather than on every loop iteration

measures = 100  # number of repeated fits to average over
Sum = 0.0       # accumulated fitting time in seconds
for _ in range(measures):
    # perf_counter() is a monotonic, high-resolution clock intended for
    # timing short intervals; time.time() is wall-clock time and can jump.
    start = time.perf_counter()
    clf.fit(x_train, y_train)
    Sum += time.perf_counter() - start

mean = Sum / measures
print('Total time elapsed: {time}'.format(time=Sum))
print('Average time elapsed per measure: {time}'.format(time=mean))

Total time elapsed: 0.20447230339050293
Average time elapsed per measure: 0.0020447230339050294


Let's see if it can still be improved.

In [None]:
from sklearn.model_selection import RandomizedSearchCV
