# Module 2 - Data Wrangling

### Dealing with missing values

In [19]:
# Missing values are '?' 'NA', etc.
# Solutions are: Droping missing values or to replace them.

import pandas as pd
import numpy as np
data={'Name':['Karan','Rohit','Juan','Sahil','Aryan'],'Age':[23,22,np.nan,21,24]}
df=pd.DataFrame(data)
df

# # droping missing values
df.dropna(inplace = True)
df

# replacing missing values
mean = df['Age'].mean()
df['Age'].replace(np.nan, mean)

(22.5,
     Name   Age
 0  Karan  23.0
 1  Rohit  22.0
 2   Juan   NaN
 3  Sahil  21.0
 4  Aryan  24.0)

### Data Formatting in Python

In [33]:
# Applying calculations to an entire column
data={'Name':['Karan','Rohit','Juan','Sahil','Aryan'],'Age':[23,22,26,21,24]}
df=pd.DataFrame(data)
df

df['Age'] = df['Age']/10
df.rename(columns = {'Age': 'Decades'}, inplace = True)
df['Decades'] = df['Decades'].astype('int')
df


Unnamed: 0,Name,Decades
0,Karan,2
1,Rohit,2
2,Juan,2
3,Sahil,2
4,Aryan,2


### Data Normalization in Python 

In [39]:
data={'Name':['Karan','Rohit','Juan','Sahil','Aryan'],'Age':[23,22,np.nan,21,24]}
df=pd.DataFrame(data)
df

# droping missing values
df.dropna(inplace = True)
df

#Normalizing with max value
df['Age'] = df['Age']/df['Age'].max()
df

# Normalizing with min-max
df['Age'] = ((df['Age']-df['Age'].min()) / (df['Age'].max()-df['Age'].min())).round(3)
df

# Normalizing with z-score
df['Age'] = (df['Age']-df['Age'].mean()) / (df['Age'].std())
df


Unnamed: 0,Name,Age
0,Karan,0.387995
1,Rohit,-0.387995
3,Sahil,-1.161662
4,Aryan,1.161662


### Binning in Python

In [42]:
data={'Name':['Karan','Rohit','Juan','Sahil','Aryan'],'Age':[23,22,np.nan,21,24]}
df=pd.DataFrame(data)
df

# droping missing values
df.dropna(inplace = True)
df

# creating bins
bins = np.linspace(min(df['Age']), max(df['Age']), 3)
group_names = ['Low','High']
df['Age'] = pd.cut(df['Age'], bins, labels=group_names, include_lowest=True)
df

Unnamed: 0,Name,Age
0,Karan,High
1,Rohit,Low
3,Sahil,Low
4,Aryan,High


### Turning categorical variables into quantitative variables in Python

In [44]:
data={'Name':['Karan','Rohit','Juan','Sahil','Aryan'],'Gender':['Female','Female','Male','Female','Male']}
df=pd.DataFrame(data)
df

pd.get_dummies(df['Gender'])

Unnamed: 0,Female,Male
0,1,0
1,1,0
2,0,1
3,1,0
4,0,1


### Lab 2