In [1]:
import pandas as pd
df = pd.read_csv('train.csv')

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [3]:
# The summary above shows 891 rows in total. Age has only 714 non-null values (177
# missing), Embarked has 889 (2 missing), and Cabin has just 204, so most of it is missing.
# Columns of dtype object are non-numeric, so we have to find a way to encode them as
# numerical values.

In [4]:
# Let's drop some columns that don't contribute much

In [5]:
# Name and Ticket are object-typed columns and Cabin is mostly empty
# (204 of 891 non-null), so all three are removed from the frame.
cols = ['Name', 'Ticket', 'Cabin']
df = df.drop(columns=cols)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(2)
memory usage: 62.8+ KB


In [7]:
# Dropping rows that have missing values

In [8]:
# Remove every row that has a missing value in any remaining column
# (only Age and Embarked contain missing values at this point).
df = df.dropna(axis=0, how='any')

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Pclass       712 non-null    int64  
 3   Sex          712 non-null    object 
 4   Age          712 non-null    float64
 5   SibSp        712 non-null    int64  
 6   Parch        712 non-null    int64  
 7   Fare         712 non-null    float64
 8   Embarked     712 non-null    object 
dtypes: float64(2), int64(5), object(2)
memory usage: 55.6+ KB


In [10]:
# After dropping rows with missing values, the dataset shrinks from 891 rows to 712,
# which means we are throwing data away. Machine learning models need data for
# training to perform well, so we should preserve as much of the data as we can.
# We will come back to this later.

In [11]:
# Creating Dummy variables

In [12]:
# Now we convert Pclass, Sex and Embarked into dummy (indicator) columns with pandas and drop the originals after conversion.

In [13]:
# One indicator (dummy) frame per categorical column; get_dummies names each
# new column after the raw category value it encodes.
cols = ['Pclass', 'Sex', 'Embarked']
dummies = [pd.get_dummies(df[name]) for name in cols]

In [14]:
# Place the per-column dummy frames side by side, aligned on the row index.
titanic_dummies = pd.concat(dummies, axis='columns')

In [15]:
# We now have 8 dummy columns: 1, 2, 3 represent the passenger class, female/male come from Sex,
# and C/Q/S come from Embarked. Finally, we concatenate them to the original dataframe column-wise.

In [16]:
# Append the dummy columns to the right of the existing columns.
df = pd.concat([df, titanic_dummies], axis='columns')

In [17]:
# Now that the Pclass, Sex and Embarked values have been converted into columns, we drop the now-redundant original columns from the dataframe

In [18]:
# The categorical originals are now encoded as dummy columns, so remove them.
df = df.drop(columns=['Pclass', 'Sex', 'Embarked'])

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Age          712 non-null    float64
 3   SibSp        712 non-null    int64  
 4   Parch        712 non-null    int64  
 5   Fare         712 non-null    float64
 6   1            712 non-null    uint8  
 7   2            712 non-null    uint8  
 8   3            712 non-null    uint8  
 9   female       712 non-null    uint8  
 10  male         712 non-null    uint8  
 11  C            712 non-null    uint8  
 12  Q            712 non-null    uint8  
 13  S            712 non-null    uint8  
dtypes: float64(2), int64(4), uint8(8)
memory usage: 44.5 KB


In [20]:
# Taking care of missing data

In [21]:
# Fill gaps in Age by interpolating between neighbouring rows.
# NOTE(review): df already went through dropna() in an earlier cell, and the
# df.info() output just before this cell shows Age as 712 non-null out of 712
# rows, so there is nothing left to fill and this line leaves Age unchanged.
# To actually keep the rows with a missing Age, the earlier dropna() would
# have to be restricted (e.g. to the Embarked column only).
df['Age'] = df['Age'].interpolate()

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Age          712 non-null    float64
 3   SibSp        712 non-null    int64  
 4   Parch        712 non-null    int64  
 5   Fare         712 non-null    float64
 6   1            712 non-null    uint8  
 7   2            712 non-null    uint8  
 8   3            712 non-null    uint8  
 9   female       712 non-null    uint8  
 10  male         712 non-null    uint8  
 11  C            712 non-null    uint8  
 12  Q            712 non-null    uint8  
 13  S            712 non-null    uint8  
dtypes: float64(2), int64(4), uint8(8)
memory usage: 44.5 KB


In [23]:
# Converting the Dataframe to Numpy

In [24]:
# Now that we have converted all the data to numeric, it's time to prepare the data for machine learning models. This is where
# scikit-learn and numpy come into play:
# X = input set, built from all 14 columns of df (the 'Survived' column is removed from it further below, leaving 13 attributes)
# Y = output (target), in this case 'Survived'

In [25]:
# Y is the target vector. X is the whole frame as a 2-D array, so at this point
# it still contains the Survived column (and PassengerId) alongside the features.
Y = df['Survived'].to_numpy()
X = df.to_numpy()

In [26]:
# NOTE(review): on a fresh kernel this cell raises NameError because numpy is
# only imported in the following cell; X is therefore left unchanged here and
# the same statement is repeated after the import.
X = np.delete(X, 1, axis=1)

NameError: name 'np' is not defined

In [27]:
import numpy as np

In [28]:
# Remove the target column from the feature matrix.
# X is rebuilt from df and the column is located by name rather than by a
# hard-coded position: the previous form, X = np.delete(X, 1, axis=1), removed
# one more column (Age next) every time the cell was re-run.
survived_pos = df.columns.get_loc('Survived')
X = np.delete(df.values, survived_pos, axis=1)

In [29]:
# X still had the Survived values in it, which should not be there, so we delete that column with numpy; it is the column at index 1 (i.e. the second column).

In [30]:
# Now that we are ready with X and Y, let's split the dataset into a 70% training set and a 30% test set using scikit-learn's model_selection

In [31]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# shuffle, and therefore the split, the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)