# Titanic Dataset

> This module loads, checks, and cleans the dataset from the Kaggle Titanic ML competition, then logs it to Weights & Biases.

In [None]:
#| default_exp titanic_checkData

In [None]:
#| export
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline

import os # interact with system directories and files
import wandb # log data and models with Weights and Biases
# import weave # interactive analytics

# %load_ext autoreload
# %autoreload 2

In [None]:
#| export
# Directory holding the Kaggle Titanic CSV files. The TITANIC_DATA_DIR
# environment variable overrides the default so the notebook is not tied to
# one machine's absolute path.
dataPath = os.environ.get('TITANIC_DATA_DIR', '/Users/danc/Data/titanic')

# List every file found under dataPath (sub-directories included) as a quick
# check of what is available on disk.
for dirname, _, filenames in os.walk(dataPath):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# `dirname` leaks out of the loop and can be used afterwards as the data
# directory. Pin it to the data root: otherwise it would be the last
# directory os.walk visited (wrong when sub-directories exist) or undefined
# (when the walk yields nothing, e.g. the directory is missing).
dirname = dataPath

/Users/danc/Data/titanic/test.csv
/Users/danc/Data/titanic/train_cleaned.csv
/Users/danc/Data/titanic/train.csv
/Users/danc/Data/titanic/gender_submission.csv


In [None]:
#| export
# Load the Kaggle training split. The path is built from dataPath (the data
# root) rather than the loop variable left over from os.walk, which is
# undefined when the walk yields nothing and points at the last directory
# visited when the data folder contains sub-directories.
train_data = pd.read_csv(os.path.join(dataPath, 'train.csv'))
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
#| hide
# weave.show(train_data)

In [None]:
#| export
# Load the Kaggle test split. The path is built from dataPath (the data root)
# rather than the loop variable left over from os.walk, which is undefined
# when the walk yields nothing and points at the last directory visited when
# the data folder contains sub-directories.
test_data = pd.read_csv(os.path.join(dataPath, 'test.csv'))
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [None]:
#| hide
# Exploratory Data Analysis
# Heatmap of the null mask of train_data (True wherever a value is missing).
sns.heatmap(
    data=train_data.isna(),
    cmap='viridis',
    cbar=False,
    yticklabels=False,
)

In [None]:
#| hide
#from sklearn.impute import SimpleImputer
#imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
#imputer.fit([train_data['Age']])
#age_imputed = imputer.transform([train_data['Age']])
#train_data['Age'] = [age_imputed]

#from sklearn.experimental import enable_iterative_imputer
#from sklearn.impute import IterativeImputer
#imp = IterativeImputer(max_iter=10, random_state=0)
#imp.fit([train_data['Age'],train_data['Pclass']])
#t_transfm = imp.transform([train_data['Age'],train_data['Pclass']])
#train_data['Age'] = t_transfm[:,0]
#train_data['Pclass'] = t_transfm[:,1]

In [None]:
#| export
# Mean of the Age column within each Pclass group; impute_age reads this
# Series to fill in missing ages.
meanAgeByClass = train_data["Age"].groupby(train_data["Pclass"]).mean()

In [None]:
#| export
def impute_age(cols, class_means=None):
    """Return a passenger's age, filling a missing value from the class mean.

    Parameters
    ----------
    cols : sequence or pandas.Series
        Two values read by position: the age first, then the passenger class.
    class_means : pandas.Series, optional
        Mean age indexed by passenger class. When omitted, the module-level
        ``meanAgeByClass`` is used.

    Returns
    -------
    The age unchanged when it is not null; otherwise the mean age of the
    passenger's class truncated to an ``int``.

    Raises
    ------
    KeyError
        If the passenger class has no entry in ``class_means``.
    ValueError
        If the mean for that class is NaN (``int(nan)`` is not defined).
    """
    # Integer keys on a label-indexed Series (cols[0]) are a deprecated
    # positional fallback in pandas, so use .iloc when a Series row is given.
    if isinstance(cols, pd.Series):
        age, pclass = cols.iloc[0], cols.iloc[1]
    else:
        age, pclass = cols[0], cols[1]

    if not pd.isnull(age):
        return age

    if class_means is None:
        class_means = meanAgeByClass

    # Look the class up by its label instead of by position (class - 1); the
    # positional form is only right while the classes are exactly 1..N with
    # none missing from the means.
    return int(class_means.loc[int(pclass)])

In [None]:
#| export
# Row by row, keep a known Age and let impute_age fill a missing one from the
# row's Pclass.
train_data['Age'] = train_data.loc[:, ['Age', 'Pclass']].apply(impute_age, axis='columns')

In [None]:
#| hide
# Exploratory Data Analysis
# Null-mask heatmap of train_data, drawn again now that Age has been imputed.
sns.heatmap(
    data=train_data.isna(),
    cmap='viridis',
    cbar=False,
    yticklabels=False,
)

In [None]:
#| export
# Remove the Cabin column. Reassigning the result (rather than inplace=True)
# together with errors='ignore' makes this cell safe to re-run; a second
# in-place drop raised KeyError because Cabin was already gone.
train_data = train_data.drop(columns='Cabin', errors='ignore')

In [None]:
#| hide
# Passenger counts per Survived value, with one bar per Sex.
sns.set_style('whitegrid')
sns.countplot(
    data=train_data,
    x='Survived',
    hue='Sex',
    palette='RdBu_r',
)

In [None]:
#| hide
# Passenger counts per Survived value, with one bar per Pclass.
sns.set_style('whitegrid')
sns.countplot(
    data=train_data,
    x='Survived',
    hue='Pclass',
    palette='rainbow',
)

In [None]:
#| hide
# Print the column dtypes and non-null counts of train_data at this stage.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB


In [None]:
#| hide
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder
# ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), ["Embarked"])], remainder='passthrough')
# train_data = ct.fit_transform(train_data)

In [None]:
#| export
# One-hot encode Sex and Embarked, dropping the first level of each so that
# only the 'male' and 'Q' / 'S' indicator columns are produced.
# NOTE(review): get_dummies ignores NaN by default, so a row with a missing
# Embarked ends up False in both Q and S — confirm that is intended.
sex = pd.get_dummies(train_data['Sex'], drop_first=True)
embark = pd.get_dummies(train_data['Embarked'], drop_first=True)

# Replace the text columns with the indicator columns; Name and Ticket are
# removed at the same time.
train_data = pd.concat(
    [train_data.drop(columns=['Sex', 'Embarked', 'Name', 'Ticket']), sex, embark],
    axis=1,
)

In [None]:
#| hide
# Preview the first five rows of the encoded training frame.
train_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S
0,1,0,3,22.0,1,0,7.25,True,False,True
1,2,1,1,38.0,1,0,71.2833,False,False,False
2,3,1,3,26.0,0,0,7.925,False,False,True
3,4,1,1,35.0,1,0,53.1,False,False,True
4,5,0,3,35.0,0,0,8.05,True,False,True


In [None]:
#| export
# Write the cleaned training frame to train_cleaned.csv in the data root. The
# path is built from dataPath rather than the loop variable left over from
# os.walk, which may be undefined or point at a sub-directory.
train_data.to_csv(os.path.join(dataPath, 'train_cleaned.csv'), index=False)

In [None]:
#| hide
# Read the cleaned CSV back from the data root to check what was written. The
# path is built from dataPath rather than the loop variable left over from
# os.walk, which may be undefined or point at a sub-directory.
cleantrain_data = pd.read_csv(os.path.join(dataPath, 'train_cleaned.csv'))
cleantrain_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S
0,1,0,3,22.0,1,0,7.25,True,False,True
1,2,1,1,38.0,1,0,71.2833,False,False,False
2,3,1,3,26.0,0,0,7.925,False,False,True
3,4,1,1,35.0,1,0,53.1,False,False,True
4,5,0,3,35.0,0,0,8.05,True,False,True


In [None]:
#| export
# Authenticate with Weights & Biases before any data is uploaded or logged.
# NOTE(review): this cell is marked `#| export`, so presumably the login also
# runs whenever the exported module is imported — confirm that is intended.
wandb.login() # log in Weights and Biases to upload and log data

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mp051tr0n00000[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
#| export
# Upload the whole data directory to Weights & Biases as a dataset artifact.
# The run is used as a context manager so it is always closed, even when
# adding or logging the artifact raises; with a trailing run.finish() an
# exception left the run open.
with wandb.init(project="Kaggle_Titanic", job_type="add-dataset") as run:
    artifact = wandb.Artifact(name="Titanic_data", type="dataset")
    artifact.add_dir(local_path=dataPath)  # Add dataset directory to artifact
    run.log_artifact(artifact)  # Log the "Titanic_data" artifact to this run

[34m[1mwandb[0m: Adding directory to artifact (/Users/danc/Data/titanic)... Done. 0.1s


VBox(children=(Label(value='0.123 MB of 0.123 MB uploaded (0.089 MB deduped)\r'), FloatProgress(value=1.0, max…

In [None]:
#| hide
# Run nbdev's notebook export.
import nbdev

nbdev.nbdev_export()