In [1]:
#import libraries
import pandas as pd #data analysis and manipulation
import numpy as np #math manipulation if required
from scipy import stats #stat tests
from statsmodels.graphics.gofplots import qqplot # for qq plots

#for visuals
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
#load data and make copy for all analysis
data = pd.read_csv('train.csv')
copy = data.copy()


# Data Overview

A bird's eye view.

In [4]:
#get dimensions
copy.shape

(891, 12)

In [12]:
#get a view of the ddf
copy.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
#get feature name, data, types and non null values
copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
#check number of nulls
copy.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
#get percentage of nulls per features
(copy.isna().sum() / len(copy) * 100).round(2)

PassengerId     0.00
Survived        0.00
Pclass          0.00
Name            0.00
Sex             0.00
Age            19.87
SibSp           0.00
Parch           0.00
Ticket          0.00
Fare            0.00
Cabin          77.10
Embarked        0.22
dtype: float64

In [6]:
#check for duplicates
copy.duplicated().sum()

#note if duplicates present use the following:
#your_df = your_df.drop_duplicates()

0

In [10]:
#summary statisitics for numerical data
copy.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [11]:
#summary stats for categorical data
copy.describe(include='O')

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


The purpose of this step is to get eyes on the dataset and forumlate a plan to eventually develop insights

In sum , you should be able to answer the following:
- What is the shape of the dataframe,
- What are the data types of the features,
- Are there duplicates
- Are there nulls? How many? Which feature?
- Identify areas for data cleaning
- Identify or think about some possible feature engineering possibilities if any.

# Data Cleaning

change feature names, change passenger id to str, drop nulls

Some of the feature names don't make the most sense like SibSp, but luckily the data comes with documentation and we can change the names. [Link](https://www.kaggle.com/competitions/titanic/data?select=train.csv) to documentation. 

We can also change the data type of PassengerId to str, we could probably even drop it as we already checked for duplicates.

In [3]:
# change feature to string not number
copy['PassengerId'] = copy['PassengerId'].astype(str)

#change column names to full names as per documentation
copy = copy.rename(columns={'SibSp': '# Sibiling/Spouses Aboard',
                            'Parch': '# Parents/Children Aboard',
                            'Pclass': 'Passenger class'})

#dictionary for changing embarked values
embarked_dict = {'C': 'Cherbourg',
                 'Q': 'Queenstown',
                 'S': 'Southhampton'}

#change the Embarked values to full names
copy['Embarked'] = copy['Embarked'].replace(embarked_dict)

#check work
copy.head()

Unnamed: 0,PassengerId,Survived,Passenger class,Name,Sex,Age,# Sibiling/Spouses Aboard,# Parents/Children Aboard,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,Southhampton
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,Cherbourg
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,Southhampton
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,Southhampton
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,Southhampton


In [4]:
#Round Fare to two decimal places
copy['Fare'] = copy['Fare'].round(2)
copy.head()

Unnamed: 0,PassengerId,Survived,Passenger class,Name,Sex,Age,# Sibiling/Spouses Aboard,# Parents/Children Aboard,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,Southhampton
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,Cherbourg
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,Southhampton
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,Southhampton
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,Southhampton


Ideally, you have a research question in mind and can identify relevent columns. Or if you have domain knowledge you can drop irrelevent columns. In this case, we can drop PassengerId, Name, and Ticket.

In [6]:
#drop columns
copy = copy.drop(['PassengerId', 'Name', 'Ticket'], axis=1)
#check work
copy.head()

Unnamed: 0,Survived,Passenger class,Sex,Age,# Sibiling/Spouses Aboard,# Parents/Children Aboard,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,Southhampton
1,1,1,female,38.0,1,0,71.28,C85,Cherbourg
2,1,3,female,26.0,0,0,7.92,,Southhampton
3,1,1,female,35.0,1,0,53.1,C123,Southhampton
4,0,3,male,35.0,0,0,8.05,,Southhampton


## Nulls

Nulls are fun. You can drop them or impute them in a variety of ways. The trick is to really understand your problem and data. As long as you can justify your approach.

## Dropping nulls

## Imputation