In [None]:
import pandas as pd 
import numpy as np 
from pandas import Series,DataFrame
from sklearn.linear_model import LogisticRegression



#---------DATA PREP---------#

#Load data.
# Each CSV is parsed into a DataFrame once and bound to two names, so
# `data`/`trainingSet` refer to the same object, as do `test`/`testSet`.
trainingSet = data = pd.read_csv("./train.csv")
testSet = test = pd.read_csv("./test.csv")

#testSet.info()
#trainingSet.info()

#---------DATA PREPROCESSING-------#

#Fill in null age entries.
# Each missing Age is replaced by a random integer drawn from
# [mean - std, mean + std) of the non-null ages of the same data set.

# Dedicated, seeded generator so that Restart & Run All reproduces the same
# imputed ages (and therefore the same model and predictions).
age_rng = np.random.RandomState(42)

# Statistics are taken before any value is filled in; mean()/std() skip NaN.
mean_age_training = trainingSet["Age"].mean()
std_age_training = trainingSet["Age"].std()
training_age_null = trainingSet["Age"].isnull().sum()

mean_age_test = testSet["Age"].mean()
std_age_test = testSet["Age"].std()
test_age_null = testSet["Age"].isnull().sum()

# randint works on integers, so the float bounds are converted explicitly.
rand_training = age_rng.randint(
    int(mean_age_training - std_age_training),
    int(mean_age_training + std_age_training),
    size=training_age_null,
)
rand_test = age_rng.randint(
    int(mean_age_test - std_age_test),
    int(mean_age_test + std_age_test),
    size=test_age_null,
)

# Assign through .loc on the frame itself. The chained form
# frame["Age"][mask] = values may write into a temporary object
# (SettingWithCopyWarning) and has no effect under pandas Copy-on-Write,
# which would leave NaN behind and make the int conversion below fail.
trainingSet.loc[trainingSet["Age"].isnull(), "Age"] = rand_training
testSet.loc[testSet["Age"].isnull(), "Age"] = rand_test

#Convert age to int.
trainingSet["Age"] = trainingSet["Age"].astype(int)
testSet["Age"] = testSet["Age"].astype(int)

#Fill in null fare entries.
# Only the test set is handled here. Each missing Fare is replaced by a random
# integer drawn around the mean of the non-null test fares.

# Dedicated, seeded generator so the imputed fares are reproducible.
fare_rng = np.random.RandomState(42)

mean_fare_test = testSet["Fare"].mean()
std_fare_test = testSet["Fare"].std()
test_fare_null = testSet["Fare"].isnull().sum()

# mean - std is negative whenever the spread exceeds the mean; a fare cannot
# be negative, so the lower bound is clamped at 0. Bounds are converted to int
# explicitly because randint works on integers.
fare_low = max(0, int(mean_fare_test - std_fare_test))
fare_high = int(mean_fare_test + std_fare_test)
rand_test_fare = fare_rng.randint(fare_low, fare_high, size=test_fare_null)

# .loc writes into testSet itself; the chained form testSet["Fare"][mask] = ...
# may write into a temporary and does nothing under pandas Copy-on-Write.
testSet.loc[testSet["Fare"].isnull(), "Fare"] = rand_test_fare

#testSet.info()
#trainingSet.info()

#Convert gender label into indicator columns.
# get_dummies creates one column per distinct value of Sex, named with the
# "Sex_" prefix; those columns are appended to the right of each frame.
dummies_Sex_training, dummies_Sex_test = (
    pd.get_dummies(frame['Sex'], prefix='Sex') for frame in (trainingSet, testSet)
)

trainingSet = pd.concat([trainingSet, dummies_Sex_training], axis=1)
testSet = pd.concat([testSet, dummies_Sex_test], axis=1)




#trainingSet.info()

#------------Logistic Regression---------#

# Columns removed from both frames before modelling. Listing them once keeps
# the training and test feature matrices built from the same rule.
non_feature_columns = ["PassengerId", "Name", "Sex", "Sex_male", "Ticket", "Cabin", "Embarked"]

# The training frame additionally carries the target column, which becomes
# Y_train and must not remain among the features.
X_train = trainingSet.drop(["Survived"] + non_feature_columns, axis=1)
Y_train = trainingSet["Survived"]
X_test = testSet.drop(non_feature_columns, axis=1).copy()

logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)

# score() on the training data is accuracy on data the model has already seen.
# It is printed because a bare expression in the middle of a cell is never
# displayed, so the value was previously computed and thrown away.
train_accuracy = logreg.score(X_train, Y_train)
print("Training accuracy:", train_accuracy)


#------------RESULT SUBMISSION---------#
# Two-column table: the PassengerId of every test row next to its predicted
# Survived label, written to pred.csv without the index column.
submission = testSet[["PassengerId"]].assign(Survived=Y_pred)

submission.to_csv('pred.csv', index=False)

In [None]:
#---------Data Analysis----------#

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from pandas import Series,DataFrame
from sklearn.linear_model import LogisticRegression

%pylab inline

#Load data.
data = pd.read_csv("./train.csv")

#data.info()

#-------Examine the relationship between Pclass and Survival----#

# Fraction of passengers with Survived == 1 within each Pclass value.
# Grouping the flag directly (instead of dividing two value_counts results)
# gives 0.0 rather than NaN for a class in which nobody survived.
survivalRate = data.Survived.eq(1).groupby(data.Pclass).mean()

# Dedicated figure/axes: the chart is drawn explicitly on `ax`, so the saved
# PNG contains the bars and is not affected by other pyplot calls.
fig, ax = plt.subplots()
fig.set(alpha=0.2)
survivalRate.plot(kind='bar', ax=ax)
ax.set_title(u"Survival rate of passengers from different classes")
ax.set_xlabel(u"Pclass")
ax.set_ylabel(u"Survival rate")
fig.savefig("Pclass.png")

# Display the chart, then make sure it is closed so that later pyplot calls
# start from a fresh figure instead of drawing on this one.
plt.show()
plt.close(fig)

#-------Examine the relationship between SibSp and Survival----#

# Fraction of passengers with Survived == 1 for each SibSp value.
# A SibSp value with no survivors is missing from the survivor counts, so
# dividing two value_counts results yields NaN there; the grouped mean
# reports 0.0 instead.
survivalRate = data.Survived.eq(1).groupby(data.SibSp).mean()

# Own figure/axes so the saved PNG holds exactly this chart.
fig_sibsp, ax_sibsp = plt.subplots()
survivalRate.plot(kind='bar', ax=ax_sibsp)
ax_sibsp.set_title(u"Survival rate V.S Number of Siblings")
ax_sibsp.set_xlabel(u"Number of Siblings")
ax_sibsp.set_ylabel(u"Survival rate")
fig_sibsp.savefig("SibSp.png")

# Display, then close, so later pyplot calls do not draw on this chart.
plt.show()
plt.close(fig_sibsp)

#-------Examine the relationship between Parch and Survival----#

# Fraction of passengers with Survived == 1 for each Parch value.
# A Parch value with no survivors is missing from the survivor counts, so
# dividing two value_counts results yields NaN there; the grouped mean
# reports 0.0 instead.
survivalRate = data.Survived.eq(1).groupby(data.Parch).mean()

# Own figure/axes so the saved PNG holds exactly this chart.
fig_parch, ax_parch = plt.subplots()
survivalRate.plot(kind='bar', ax=ax_parch)
ax_parch.set_title(u"Survival rate V.S Number of Parents/Kids")
ax_parch.set_xlabel(u"Number of Parents/Kids")
ax_parch.set_ylabel(u"Survival rate")
fig_parch.savefig("Parch.png")

# Display, then close, so later pyplot calls do not draw on this chart.
plt.show()
plt.close(fig_parch)

#-------Examine the relationship between Fare and Survival----#

fareMax = data.Fare.max()
q1 = data.Fare.quantile(0.25)
q2 = data.Fare.quantile(0.5)
q3 = data.Fare.quantile(0.75)

# Four fare bins as (lower, upper] intervals:
#   Fare <= q1, q1 < Fare <= q2, q2 < Fare <= q3, q3 < Fare <= fareMax.
# -inf as the first lower edge makes the first bin simply "Fare <= q1".
fare_bin_edges = [-np.inf, q1, q2, q3, fareMax]

# Survival rate per bin = share of rows in the bin with Survived == 1
# (Survived is treated as a 0/1 flag, as in the sections above).
# Taking the mean of the flag avoids label lookups on value_counts results,
# which raise KeyError when a bin has no survivors or no victims.
s_rate = []
for lower, upper in zip(fare_bin_edges[:-1], fare_bin_edges[1:]):
    in_bin = (data.Fare > lower) & (data.Fare <= upper)
    s_rate.append(data.Survived[in_bin].eq(1).mean())

# Left edge of each quartile on a 0..1 quantile axis; with width 0.25 and
# edge alignment the four bars tile the range [0, 1].
xs = [0,0.25,0.5,0.75]
ys = s_rate

# Own figure/axes so the saved PNG holds exactly this chart.
fig_fare, ax_fare = plt.subplots()
ax_fare.bar(xs, ys, width=0.25, align='edge')
ax_fare.set_title(u"Survival rate V.S Fare Quantile")
ax_fare.set_xlabel(u"Fare Quantile")
ax_fare.set_ylabel(u"Survival rate")
fig_fare.savefig("Fare.png")

# Display, then close, so later pyplot calls do not draw on this chart.
plt.show()
plt.close(fig_fare)




#-------Examine the relationship between Sex and Survival----#

# Integer indicators (0/1) rather than booleans, so the bar labels agree with
# the "1-Female,0-Male" wording of the x-axis label below.
dummies_Sex = pd.get_dummies(data['Sex'], prefix= 'Sex', dtype=int)

# Drop indicator columns left over from a previous run before appending, so
# re-running this section does not create duplicate Sex_* columns.
data = pd.concat(
    [data.drop(columns=dummies_Sex.columns, errors='ignore'), dummies_Sex],
    axis=1,
)

victim = data.Sex_female[data.Survived == 0].value_counts()
survivor = data.Sex_female[data.Survived == 1].value_counts()

# Fraction of passengers with Survived == 1 for Sex_female == 0 and == 1.
survivalRate = data.Survived.eq(1).groupby(data.Sex_female).mean()

# Own figure/axes so the saved PNG holds exactly this chart.
fig_gender, ax_gender = plt.subplots()
survivalRate.plot(kind='bar', ax=ax_gender)
ax_gender.set_title(u"Survival rate V.S. Gender")
ax_gender.set_xlabel(u"Sex(1-Female,0-Male)")
ax_gender.set_ylabel(u"Survival rate")
fig_gender.savefig("Gender.png")

# Display, then close, so later pyplot calls do not draw on this chart.
plt.show()
plt.close(fig_gender)


