# pandas pre-processing

# What is pandas pre-processing?


Preprocessing is the process of doing a preliminary analysis of the data
in order to transform it into a standard, normalized format.

Preprocessing involves the following aspects:

missing values

data standardization

data normalization

data binning

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build a small example DataFrame indexed by country code.
# Every value in the last row ('PRT') is NaN, i.e. missing.
data = {
    'country': ['Italy', 'Spain', 'Greece', 'France', np.nan],
    'popu': [61, 46, 11, 65, np.nan],
    'percent': [0.83, 0.63, 0.15, 0.88, np.nan],
}

df = pd.DataFrame(
    data=data,
    index=['ITA', 'ESP', 'GRC', 'FRA', 'PRT'],
)
df

Unnamed: 0,country,popu,percent
ITA,Italy,61.0,0.83
ESP,Spain,46.0,0.63
GRC,Greece,11.0,0.15
FRA,France,65.0,0.88
PRT,,,


# 1. Rename columns
Use the rename() method of the DataFrame to change the name of a column. By default it returns a new DataFrame and leaves the original unchanged.


In [3]:
# Rename the 'popu' column to 'population'.
# rename() returns a new DataFrame, which is bound to dfnew; df keeps its column name.
dfnew = df.rename({'popu': 'population'}, axis='columns')
dfnew

Unnamed: 0,country,population,percent
ITA,Italy,61.0,0.83
ESP,Spain,46.0,0.63
GRC,Greece,11.0,0.15
FRA,France,65.0,0.88
PRT,,,


# 2. Add columns
You can add a column to a DataFrame object by assigning an array-like object (list, ndarray, Series) to a new column using the [ ] operator.

This will modify the DataFrame 'in place'

In [4]:
# Add a plain Python list as a new column: one entry per row, in row order.
dfnew['capital city'] = [
    'Rome',
    'Madrid',
    'Athens',
    'Paris',
    'Lisbon',
]
dfnew

Unnamed: 0,country,population,percent,capital city
ITA,Italy,61.0,0.83,Rome
ESP,Spain,46.0,0.63,Madrid
GRC,Greece,11.0,0.15,Athens
FRA,France,65.0,0.88,Paris
PRT,,,,Lisbon


In [5]:
# Add a NumPy array as a new column: values are assigned by position, one per row.
# (The stray bare `ar` expression was removed: only the last expression of a
# cell is displayed, so it had no effect.)
ar = np.array([39, 34, 30, 33, 351])
dfnew['Calling code'] = ar
dfnew

Unnamed: 0,country,population,percent,capital city,Calling code
ITA,Italy,61.0,0.83,Rome,39
ESP,Spain,46.0,0.63,Madrid,34
GRC,Greece,11.0,0.15,Athens,30
FRA,France,65.0,0.88,Paris,33
PRT,,,,Lisbon,351


In [6]:
# Add a Series as a new column.
# A Series is aligned to the DataFrame on index labels rather than on position,
# so the order of the entries below does not have to match the row order.
ser = pd.Series({
    'ESP': 'es',
    'ITA': 'it',
    'FRA': 'fr',
    'PRT': 'pt',
    'GRC': 'gr',
})
dfnew['Internet domain'] = ser
dfnew

Unnamed: 0,country,population,percent,capital city,Calling code,Internet domain
ITA,Italy,61.0,0.83,Rome,39,it
ESP,Spain,46.0,0.63,Madrid,34,es
GRC,Greece,11.0,0.15,Athens,30,gr
FRA,France,65.0,0.88,Paris,33,fr
PRT,,,,Lisbon,351,pt


# 3. Deleting columns 
# Task: delete or drop a column from the original DataFrame.

In [7]:
# Delete using del: removes the 'Internet domain' column from dfnew in place
del dfnew['Internet domain']
dfnew

Unnamed: 0,country,population,percent,capital city,Calling code
ITA,Italy,61.0,0.83,Rome,39
ESP,Spain,46.0,0.63,Madrid,34
GRC,Greece,11.0,0.15,Athens,30
FRA,France,65.0,0.88,Paris,33
PRT,,,,Lisbon,351


In [8]:
# Delete using drop(): the result without 'Calling code' is bound to dfdrop;
# dfnew itself is not reassigned here.
dfdrop = dfnew.drop(labels=['Calling code'], axis='columns')
dfdrop

Unnamed: 0,country,population,percent,capital city
ITA,Italy,61.0,0.83,Rome
ESP,Spain,46.0,0.63,Madrid
GRC,Greece,11.0,0.15,Athens
FRA,France,65.0,0.88,Paris
PRT,,,,Lisbon


In [None]:
# A ten-element Series with one missing value (NaN) at position 6.
# Note: this rebinds the name `data`.
data = pd.Series(
    data=[0, 1, 2, 3, 4, 5, np.nan, 6, 7, 8],
)
data

In [12]:
# Load heart.csv into the DataFrame df1.
# NOTE(review): "C:heart.csv" has a drive letter but no path separator; on Windows
# this presumably resolves relative to the current directory of drive C: rather
# than to C:\heart.csv — confirm the intended location of the file.
df1 = pd.read_csv("C:heart.csv")

In [14]:
# Display df1; the recorded output elides the middle rows with "..."
df1

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [15]:
# Preview the first five rows (five is head()'s default row count).
df1.head(n=5)

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [16]:
# Preview the first ten rows.
df1.head(n=10)

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2,1
