## Data Wrangling on Titanic DataSet

In [3]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns

In [4]:
# loading the titanic dataset
kashti = sns.load_dataset('titanic')
ks = sns.load_dataset('titanic')
# showing that titanic dataset
kashti

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [6]:
# checking the first five rows of titanic dataset
kashti.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [7]:
# simple operations (Math Operators)
(kashti['age']+12).head(10)

0    34.0
1    50.0
2    38.0
3    47.0
4    47.0
5     NaN
6    66.0
7    14.0
8    39.0
9    26.0
Name: age, dtype: float64

### Dealing with Missing Values

* in a data set missing values are either? or N/A or NAN, or 0 or blank cell.
* jb kbi data na ho kisi aik row mein kisi b aik parametter ka 

> Steps:
  
  1. koshish kren dobara data collect kar lein ya dekh lein agr khein glti hai.
  2. Missing values wala variable (column) hi nikal dein agr data pr effect ni hota ya simple row or data entry remove kr dein.
  3. Replace the missing value:
     > 1.how?
        * Average value of entire variable or similar data point
        * frequency or MODE replacement
        * Replace based on other function (Data sampler knows that)
        * ML Algorithems can also be used 
        * leave it like that
        
     > 2.Why?
        * its better because no data is lost
        * less accurate

In [8]:
# Where the exactly missing values are 
kashti.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [9]:
# use drop.na method
print(kashti.shape)
kashti.dropna(subset=['deck'], axis=0, inplace=True) # this will remove specifically 
# inplace = True modifies the data frame (inplace makes the changes in actual data set as well)

(891, 15)


In [10]:
# after doing dropna now we checking/finding the null value again
kashti.isnull().sum()

survived        0
pclass          0
sex             0
age            19
sibsp           0
parch           0
fare            0
embarked        2
class           0
who             0
adult_male      0
deck            0
embark_town     2
alive           0
alone           0
dtype: int64

In [11]:
# remove na from whole dataFrame
# to update the main dataFrame
kashti=kashti.dropna()
kashti.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

In [12]:
# checking the shape of dataset
kashti.shape

(182, 15)

### Replacing missing values with the average of that column 

In [13]:
# Finding an average(Mean)
mean = kashti['age'].mean()
mean

35.62318681318681

In [14]:
# replacing nan with mean of the data (updating as well)

kashti['age'] = kashti['age'].replace(np.nan, mean)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kashti['age'] = kashti['age'].replace(np.nan, mean)


In [15]:
# chicking the null value again

kashti.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

### Data Formatting

* Data ko aik comman standard pr ly kr ana
* Ensure data is consistant and understandable
   >* easy to gather
   >* easy to workwith
      >* Faislabad / (FSD) dono mn se koi aik hi use krna hai sary dataset mn agr khein fsd use kia hai to jahan jahan faislabad
         hai usy b fsd kren or agr khein faislabad use kia hai to jahan jahan fsd hai odr faislabad kren
      >* convert g to kg or similar unit for all
      >* one standard unit for all
      >* ft! = cm

In [16]:
# know the data type and convert it into the known one
kashti.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [17]:
# use this method to convert dtypes from one to another format
kashti['survived'] = kashti['survived'].astype('int64')
kashti.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kashti['survived'] = kashti['survived'].astype('int64')


survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [18]:
# here we will convert the age into the days instead of years 
kashti['age'] = kashti['age']*365
kashti.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kashti['age'] = kashti['age']*365


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,13870.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,12775.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,19710.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,1460.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,21170.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True
21,1,2,male,12410.0,0,0,13.0,S,Second,man,True,D,Southampton,yes,True
23,1,1,male,10220.0,0,0,35.5,S,First,man,True,A,Southampton,yes,True
27,0,1,male,6935.0,3,2,263.0,S,First,man,True,C,Southampton,no,False
52,1,1,female,17885.0,1,0,76.7292,C,First,woman,False,D,Cherbourg,yes,False
54,0,1,male,23725.0,0,1,61.9792,C,First,man,True,B,Cherbourg,no,False


In [19]:
# always rename afterwords 
kashti.rename(columns={"age":"age in days"}, inplace=True)
kashti.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kashti.rename(columns={"age":"age in days"}, inplace=True)


Unnamed: 0,survived,pclass,sex,age in days,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,13870.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,12775.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,19710.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,1460.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,21170.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True


### Data Normalization

1.Uniform the data

2.They have same impact

3.Aik machli samandr mn r aik jar mn

4.Also for computational reason
5.We normaliaze the data in a range to make the better comparison of that so that we can apply the plot betterly

In [20]:
# showing firt five rows of data set
kashti.head()

Unnamed: 0,survived,pclass,sex,age in days,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,13870.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,12775.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,19710.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,1460.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,21170.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True


In [21]:
kashti = kashti[['age in days','fare']]
kashti.head()

Unnamed: 0,age in days,fare
1,13870.0,71.2833
3,12775.0,53.1
6,19710.0,51.8625
10,1460.0,16.7
11,21170.0,26.55


* the above data is really in wide range and we need to normalize and hard to compare 
* Normalization change the values to the range of 0 to 1 (now both variable has similar influance on our nodels)

### Method of Normalization

1.Simple feature Scaling

 * x(new)=x(old)/x(max)

2.Min-Max method

3.Z-Score(Standard score) -3 to +3

4.Log Transformation

In [22]:
# Simple feature Scaling
kashti['fare'] = kashti['fare'] / kashti['fare'].max()
kashti['age in days'] = kashti['age in days'] / kashti['age in days'].max()
kashti.head()

Unnamed: 0,age in days,fare
1,0.475,0.139136
3,0.4375,0.103644
6,0.675,0.101229
10,0.05,0.032596
11,0.725,0.051822


In [23]:
# Min-Max Mathod

kashti['fare'] = (kashti['fare']-kashti['fare'].min()) / (kashti['fare'].max()-kashti['fare'].min())
kashti.head()

Unnamed: 0,age in days,fare
1,0.475,0.139136
3,0.4375,0.103644
6,0.675,0.101229
10,0.05,0.032596
11,0.725,0.051822


In [24]:
# Z-score Method (standard score) (rang -3 to 3 center mn 0 hota hai)
kashti['fare'] = (kashti['fare']-kashti['fare'].mean()) / kashti['fare'].std()
kashti.head()

Unnamed: 0,age in days,fare
1,0.475,-0.099835
3,0.4375,-0.337554
6,0.675,-0.353732
10,0.05,-0.813428
11,0.725,-0.684654


In [25]:
# Log transformation Method

ks['fare'] = np.log(ks['fare'])
ks.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,1.981001,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,4.266662,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,2.070022,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,3.972177,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,2.085672,S,Third,man,True,,Southampton,no,True


### Binning

* grouping of values into smaller number of values (bins)

* Convert numerics into categories (Jwan,bchy,boorhy) or 1-16, 17-30 etc

* to have better understanding of groups 

   1.low vs mid vs heigh price
    

In [32]:
bins = np.linspace(min(ks['age']), max(ks['age']), 100)
age_groups = ["bchy","jwan","boorhy"]
ks['age'] = pd.cut(ks['age'], bins, labels=age_groups, include_lowest=True)
ks['age']

ValueError: Bin labels must be one fewer than the number of bin edges

###### converting categories into dummy values 
- easy to use for computation
- Male Female (0,1)

In [28]:
pd.get_dummies(ks['sex'])

Unnamed: 0,female,male
0,False,True
1,True,False
2,True,False
3,True,False
4,False,True
...,...,...
886,False,True
887,True,False
888,True,False
889,False,True
