In [None]:
# Exploratory analysis of the passengers on board the Titanic.
# The data was downloaded from the Kaggle website: https://www.kaggle.com/c/titanic/data, file train.csv.
# The definition of every variable can be found on the same Kaggle page, in the Data Dictionary section.
# The data is read from the file into a pandas DataFrame, then analyzed, cleaned and transformed to get some answers.

In [None]:
# First, do some exploratory analysis of the given dataset. For this I am using the pandas and numpy libraries.

# Import the Python libraries and load the dataset with the read_csv() function.
import pandas as pd
import numpy as np

# Read the Kaggle training file into a DataFrame and preview its first rows.
titanic = pd.read_csv('train.csv')
titanic.head()

In [None]:
# Use info() to list every column together with its non-null count and dtype.
titanic.info()


In [None]:
# The columns listed by info() above have the following meaning.

# PassengerId: Passenger ID number
# Survived: Survival info - 0 = No; 1 = Yes
# Pclass: Passenger class - 1 = 1st class; 2 = 2nd class; 3 = 3rd class
# Name: Name of passenger
# Sex: Sex of passenger
# Age: Age of passenger
# SibSp: Number of siblings/spouses aboard
# Parch: Number of parents/children aboard
# Ticket: Passenger ticket number
# Fare: Passenger fare
# Cabin: Cabin number
# Embarked: Initial of the port of embarkation

# The Age and Fare attributes have a float datatype, while PassengerId, Survived, Pclass, SibSp and Parch
# hold integer data. Name, Sex, Ticket, Cabin and Embarked have the object datatype.

# There are 891 entries in the dataset. The columns 'Age', 'Cabin' and 'Embarked'
# have missing values.


In [None]:
# Count the missing values per column: isna() flags every missing cell and
# sum() adds the flags up column by column.

titanic.isna().sum()

In [None]:
# From the output above, the Age column has 177 missing values, Cabin has 687 and Embarked has 2.
# We need to replace these missing values where possible, or exclude the columns, depending on
# the requirements.

In [None]:
# nunique() shows how many distinct values each column contains.

titanic.nunique()

In [None]:
# From the output above, Survived has 2 distinct values, Pclass has 3,
# Sex has 2, Age has 88, SibSp has 7, Parch has 7 and Embarked has 3. PassengerId and Name
# have 891, Ticket has 681, Fare has 248 and Cabin has 147.

In [None]:
# Use the describe() method to see a statistical summary of the imported data.
titanic.describe()

In [None]:
# describe() summarizes only the columns with an int or float datatype. It returns the
# count, mean, std, min, max and percentiles of each column. Object datatype columns are not included.
# Note that the Age mean of about 29 is computed only from the rows where Age is present, because 177 rows have no age.
# We need to clean the data: replace the missing values or drop some of the rows. We will work on it.

In [None]:
# Now checking all columns(attributes) for their counts by their unique records

In [None]:
# Count how often each value occurs using the value_counts() method.

# Here, 0 means the passenger did not survive and 1 means the passenger survived.

titanic['Survived'].value_counts()

In [None]:
# Number of passengers in each of the passenger classes.
titanic.Pclass.value_counts()

In [None]:
# There are 3 passenger class. 1st class have 216, 2nd class have 184 and 3rd class have 491 entry

In [None]:
# Number of male and female passengers.
titanic.Sex.value_counts()

In [None]:
# Check the age range: describe() reports the minimum and maximum age together with
# the count of known ages, the mean and the quartiles. (value_counts() only lists how
# often each individual age occurs, which does not show the range.)

titanic['Age'].describe()

In [None]:
# Check whether passengers travel with siblings or a spouse, and with how many.
titanic.SibSp.value_counts()

In [None]:
# There are 7 different SibSp values among the 891 passengers.
# 608 passengers travel without a sibling or spouse. Passengers travelling with siblings
# or a spouse are: 209 passengers with 1, 28 passengers with 2, 18 passengers with 4, 16
# passengers with 3, 7 passengers with 8 and 5 passengers with 5.

In [None]:
# Check whether passengers travel with parents or children (the Parch column), and with how many.

titanic.Parch.value_counts()

In [None]:
# There are 7 different Parch values.
# The counts of passengers travelling with parents/children are as follows:
# 678 passengers have no parent/child aboard, 118 are with 1 parent/child, 80 are with 2, 5 are with 5, 5 are with 3,
# 4 are with 4 and 1 is with 6 parents/children.

In [None]:
# Number of passengers per port of embarkation.

titanic.Embarked.value_counts()

In [None]:
# Filling the Missing Values:

#  filling the missing values for Column Embarked with common category as it have only 2 values 
# are missing.

In [None]:
# info() showed that the Embarked column has the object dtype, and nunique()
# showed that it holds only 3 distinct values. Such a column is categorical
# data, so its dtype is changed to 'category'.

titanic['Embarked']= titanic['Embarked'].astype('category')
titanic['Embarked'].dtype

In [None]:
# Replace the missing 'Embarked' values with the most frequent port.
# The most frequent category is taken from the data (mode() ignores missing values)
# instead of being hard-coded; for this file it is 'S'.

c_category = titanic['Embarked'].mode().iloc[0]
titanic['Embarked'] = titanic['Embarked'].fillna(c_category)

In [None]:
# Check that the missing values have been filled.

titanic['Embarked'].value_counts()

# The two rows that had no port are now counted under 'S'.

In [None]:
# Now fill the missing values in the 'Age' column with random values.

# The Age column has 177 missing values. Each one is replaced by a random whole number
# drawn between (mean - std) and (mean + std) of the known ages.

In [None]:
# Fill the missing 'Age' values with random whole-number ages drawn from the
# interval [mean - std, mean + std) of the known ages, then store the column as int.

AGE_SEED = 42  # fixed seed so that Restart & Run All always imputes the same ages
age_rng = np.random.RandomState(AGE_SEED)

# mean() and std() skip NaN, so both statistics describe the known ages only.
age_mean = titanic['Age'].mean()
age_std = titanic['Age'].std()
age_missing = titanic['Age'].isnull()

# randint() expects integer bounds (the upper bound is exclusive), so the float
# statistics are truncated explicitly rather than relying on an implicit cast.
rand_age = age_rng.randint(int(age_mean - age_std),
                           int(age_mean + age_std),
                           size=int(age_missing.sum()))

# Write the random ages only into the rows that had no age, then convert the
# whole column to int (possible now that no NaN is left).
titanic.loc[age_missing, 'Age'] = rand_age
titanic['Age'] = titanic['Age'].astype(int)

In [None]:
# Check that the missing values in the column have been filled.
titanic['Age'].isnull().sum()

# The number of null values in 'Age' is 0 now: every missing age was replaced by a
# random value between (mean - std) and (mean + std).

In [None]:
# Working with the 'Name' column: look at the first few names.

titanic['Name'].head()

In [None]:
# ...and at the last few names.
titanic['Name'].tail()

In [None]:
# The 'Name' column contains a title for every passenger, such as Mr., Mrs., Miss.,
# Master. or Rev. To find out which titles occur in the data, extract them from the names.

# A regular expression (RegEx) describes a search pattern for strings.
# 'Name' is a Series, so Series.str.extract() can pull the first capture group of the
# pattern out of every name.

# Pattern pieces:
#   [a-zA-Z]  - matches any single letter between a and z (lower or upper case)
#   +         - repeats the previous item one or more times
#   \.        - matches a literal dot, so the captured word is the one directly followed by '.'
# The pattern is written as a raw string (r'...') so that the backslash reaches the regex
# engine unchanged instead of being treated as an invalid Python string escape.
# value_counts() then counts how many passengers carry each title.

titanic['Title'] = titanic.Name.str.extract(r'([a-zA-Z]+)\.', expand=False)
titanic.Title.value_counts()

In [None]:
# The common title for passengers are Mr., Mrs., Miss, Master. while other titles like Dr, Rev, Mlle
# Col, Major, Jonkheer, Don, Mme, Capt, Ms, Countess, Lady and Sir have very less numbers.
#differentiate titles by 5 categories. Mr, Mrs, Miss, Master and Other

In [None]:
# Create a new attribute 'TitleCategory' from 'Title', using the title associated with each name.
# With TitleCategory, we can count the surviving passengers according to their title.
# Each title is encoded as a number: 1 = Mr, 2 = Miss, 3 = Mrs, 4 = Master, 0 = any other title.

In [None]:
# Encode every title as a numeric category:
#   1 = Mr, 2 = Miss, 3 = Mrs, 4 = Master, 0 = every other title.
# map() builds a new column in a single assignment; this avoids calling
# replace(..., inplace=True) on a column accessor, which is chained assignment
# and does not reliably write back to the DataFrame.
# Titles that are not in the dictionary (including a missing title) become NaN
# after map() and are then placed in category 0.
title_codes = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4}
titanic['TitleCategory'] = titanic['Title'].map(title_codes).fillna(0).astype(int)

# Title categories of the surviving passengers only.
SurvivedByTitle = titanic.loc[titanic['Survived'] == 1, 'TitleCategory']

# Count once, then report each category; get() returns 0 instead of raising a
# KeyError when a category has no survivors.
survivor_counts = SurvivedByTitle.value_counts()
for code, label in [(1, 'Mr'), (2, 'Miss'), (3, 'Mrs'), (4, 'Master'), (0, 'other')]:
    print ('Category {} with title {} are: '.format(code, label), survivor_counts.get(code, 0))

In [None]:
# We can compare above survival data by category with total passengers by titles.
# Female passengers with titles Mrs. and Miss have more survival rate than male passengers.
# From total of 517 passengers with title Mr only 81 survived while title Mrs out of 125 survived 99.
# 

# Also, we can observe that female passengers with title Mrs has better number of survival
# as compared to female passengers with title Miss. This can be because maybe married women with 
# small children were given higher priority. Also the small boys also were given priority as can 
# be observed from the survived number are 23 out of 40 of title Master.

In [None]:
# To check whether passengers travel together with relatives, create a new 'Family' column.
# A passenger counts as travelling with family when they have at least one sibling/spouse
# (SibSp > 0) or at least one parent/child (Parch > 0) aboard; the two conditions are
# combined with a logical OR.

# A passenger with family gets True and a passenger without family gets False.

titanic['Family'] = (titanic['SibSp'] > 0) | (titanic['Parch'] > 0)
# Show how many passengers fall in each group instead of printing all the individual flags.
titanic['Family'].value_counts()

In [None]:
# Check the survival of passengers with and without family, split by sex.
# count() alone only counts the passengers in each group; since 'Survived' is a 0/1
# flag, sum() gives the number of survivors and mean() gives the survival rate.

titanic.groupby(['Sex','Family'])['Survived'].agg(['count', 'sum', 'mean'])


In [None]:
# Let's see if the departure place has any effect on the survival of the passengers.
# Here, C = Cherbourg, Q = Queenstown and S = Southampton.
# count() alone only counts the passengers in each group; since 'Survived' is a 0/1
# flag, sum() gives the number of survivors and mean() gives the survival rate.
# 'Embarked' is a categorical column, so observed=True is passed explicitly to keep
# only the group combinations that actually occur in the data.

titanic.groupby(['Embarked','Sex', 'Pclass'], observed=True)['Survived'].agg(['count', 'sum', 'mean'])

In [None]:
# More passengers boarded at the Southampton port than at the other two ports.

In [None]:
# Count how many passengers share each cabin value.
titanic['Cabin'].value_counts()

In [None]:
# Count the passengers per deck, taking the deck to be the first character of the
# cabin value (for example, cabin 'G6' becomes 'G').
# .str[0] leaves a missing cabin as NaN, and dropna=False reports those rows on a line
# of their own. Casting with str() would instead turn NaN into the text 'nan' and list
# the missing cabins as a bogus deck 'n'.
titanic['Cabin'].str[0].value_counts(dropna=False)

In [None]:
# Remove the 'Ticket', 'Fare' and 'Cabin' columns, because the rest of the analysis
# focuses on the Pclass, Sex, Age and Name categories, and index the rows by 'PassengerId'.
t_dropped = (
    titanic
    .drop(columns=['Ticket', 'Fare', 'Cabin'])
    .set_index('PassengerId')
)
t_dropped

In [None]:
# describe() calculates the count, the mean, the standard deviation, the minimum value,
# the maximum value and the 25th, 50th and 75th percentiles of the columns with numeric values.

t_dropped.describe()

In [None]:
# Group the rows by 'Survived', 'Pclass' and 'Sex'; count() then gives, for every
# remaining column, the number of non-null values in each group.
t_grouped = t_dropped.groupby(['Survived','Pclass','Sex']).count()
t_grouped

In [None]:
# Use the .loc indexer to select the groups whose first index level ('Survived') is 1,
# i.e. the rows for the surviving passengers.

t_grouped.loc[1]

In [None]:
# The following information was found from the data above.

# Out of 891 passengers, 342 survived and 549 did not survive.

# From the data above, we can tell that Pclass affects the survival of passengers. Pclass 1
# has more surviving passengers than the other two classes.

# Out of the 342 passengers who survived: Pclass 1 - 136, Pclass 2 - 87 and Pclass 3 - 119.
# Out of the 549 passengers who did not survive: Pclass 1 - 80, Pclass 2 - 97 and Pclass 3 - 372.

# Likewise, females have a higher survival rate than males.
# Out of 314 females in total, 233 survived and 81 did not survive.
# Out of 577 males in total, 109 survived and 468 did not survive.

# We can also differentiate the surviving female and male passengers according to their class:
# Pclass 1 -- female - 91  male - 45
# Pclass 2 -- female - 70  male - 17
# Pclass 3 -- female - 72  male - 47

# The male and female passengers who did not survive are:
# Pclass 1 -- female - 3   male - 77
# Pclass 2 -- female - 6   male - 91
# Pclass 3 -- female - 72  male - 300

In [None]:
# Conclusion:

# From our exploratory analysis of the Titanic dataset, we can say that some attribute values
# go together with a higher survival rate than others, for example within the Pclass and Sex categories.
# Females have a higher survival rate than males.
# We also see that the class (socio-economic status) of the passengers played a role in their survival.

# More information can still be obtained by exploring further attributes to answer other types of questions.