# Data Wrangling

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [14]:
boat = sns.load_dataset('titanic')
bt1 = boat
bt2 = boat

In [3]:
boat.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [4]:
# simple operation (math operation )
(boat["age"]+1).head(10)

0    23.0
1    39.0
2    27.0
3    36.0
4    36.0
5     NaN
6    55.0
7     3.0
8    28.0
9    15.0
Name: age, dtype: float64

## Dealing with missing values
- in a data set missing values are either? or N/A or NaN , 0 or a blank cell

In [5]:
# where exectly missing values are?
boat.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [6]:
boat.dropna(subset=['deck'], axis=0 ,inplace=True)

In [7]:
boat.isnull().sum()

survived        0
pclass          0
sex             0
age            19
sibsp           0
parch           0
fare            0
embarked        2
class           0
who             0
adult_male      0
deck            0
embark_town     2
alive           0
alone           0
dtype: int64

In [15]:
boat = boat.dropna()

In [16]:
boat.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

In [17]:
boat.shape

(182, 15)

## Replacing missing values with the average of that column

In [18]:
# finding an average (mean)
mean =bt1['age'].mean()
mean

29.69911764705882

In [19]:
bt1['age'] = bt1['age'].replace(np.nan , mean)

In [20]:
bt1.isnull().sum()

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

## Data Formatting

In [21]:
boat.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [22]:
boat['survived'] = boat['survived'].astype('float')
boat.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  boat['survived'] = boat['survived'].astype('float')


survived        float64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [25]:
bt1['age']=bt1['age']*365
bt1.head(10)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,8030.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,13870.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,9490.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,12775.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,12775.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,10840.177941,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,19710.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,730.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
8,1,3,female,9855.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,5110.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False


In [29]:
bt1.rename(columns={"age":"age in days"}, inplace=True)
bt1.head()

Unnamed: 0,survived,pclass,sex,age in days,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,8030.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,13870.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,9490.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,12775.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,12775.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## Data Normalization

In [30]:
bt1.head()

Unnamed: 0,survived,pclass,sex,age in days,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,8030.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,13870.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,9490.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,12775.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,12775.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [31]:
bt3 = bt1[['age in days','fare']]
bt3.head()

Unnamed: 0,age in days,fare
0,8030.0,7.25
1,13870.0,71.2833
2,9490.0,7.925
3,12775.0,53.1
4,12775.0,8.05


In [33]:
# simple feature scalling
bt3['fare'] = bt3 ['fare']/bt3['fare'].max()
bt3['age in days'] = bt3 ['age in days']/bt3['age in days'].max()
bt3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bt3['fare'] = bt3 ['fare']/bt3['fare'].max()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bt3['age in days'] = bt3 ['age in days']/bt3['age in days'].max()


Unnamed: 0,age in days,fare
0,0.275,0.014151
1,0.475,0.139136
2,0.325,0.015469
3,0.4375,0.103644
4,0.4375,0.015713


In [34]:
# min - max method 
bt3['fare'] = (bt3["fare"]-bt3['fare'].min()) / (bt3['fare'].max()-bt3['fare'].min())
bt3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bt3['fare'] = (bt3["fare"]-bt3['fare'].min()) / (bt3['fare'].max()-bt3['fare'].min())


Unnamed: 0,age in days,fare
0,0.275,0.014151
1,0.475,0.139136
2,0.325,0.015469
3,0.4375,0.103644
4,0.4375,0.015713


In [35]:
# z score
bt3['fare'] = (bt3['fare']-bt3['fare'].mean()) / bt3['fare'].std()
bt3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bt3['fare'] = (bt3['fare']-bt3['fare'].mean()) / bt3['fare'].std()


Unnamed: 0,age in days,fare
0,0.275,-0.502163
1,0.475,0.786404
2,0.325,-0.48858
3,0.4375,0.420494
4,0.4375,-0.486064


In [37]:
bt2.head()

Unnamed: 0,survived,pclass,sex,age in days,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,8030.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,13870.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,9490.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,12775.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,12775.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [38]:
#log transformation 
bt2['fare'] = np.log(bt2["fare"])
bt2.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,survived,pclass,sex,age in days,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,8030.0,1,0,1.981001,S,Third,man,True,,Southampton,no,False
1,1,1,female,13870.0,1,0,4.266662,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,9490.0,0,0,2.070022,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,12775.0,1,0,3.972177,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,12775.0,0,0,2.085672,S,Third,man,True,,Southampton,no,True
