### Data Transformation in Pandas

In [2]:
#import libraries
import pandas as pd
import numpy as np

#### Finding Duplicate Values

In [4]:
# Small demo frame: rows 4 and 5 repeat rows 2 and 3, so there are duplicates to find.
data = pd.DataFrame(
    {
        "a": [1, 5] * 3,
        "b": [1, 1, 2, 3, 2, 3],
    }
)
data

Unnamed: 0,a,b
0,1,1
1,5,1
2,1,2
3,5,3
4,1,2
5,5,3


In [5]:
# Flag every row that repeats an earlier row. With keep="first" (the default)
# the first occurrence stays False and only the later copies are marked True.
data.duplicated(keep="first")

0    False
1    False
2    False
3    False
4     True
5     True
dtype: bool

In [6]:
# Drop the repeated rows, keeping the first occurrence of each. The result is a
# new DataFrame; `data` itself is left unchanged.
data.drop_duplicates(keep="first")

Unnamed: 0,a,b
0,1,1
1,5,1
2,1,2
3,5,3


#### Mapping

In [8]:
# Example frame for the mapping demo: one row per person, labelled value1..value4.
# pandas is already imported in the first cell, so it is not re-imported here.
# The source dict is passed inline so that `data` (the DataFrame used in the
# duplicate-value section) is not rebound to a plain dict.
df = pd.DataFrame(
    {"Name": ["A", "B", "C", "D"], "Age": [42, 54, 20, 63]},
    index=["value1", "value2", "value3", "value4"],
)
df

Unnamed: 0,Name,Age
value1,A,42
value2,B,54
value3,C,20
value4,D,63


Now, let's add a new column to show the score.

In [10]:
# Scores to attach to the frame, keyed by name.
score = {
    "A": 24,
    "B": 30,
    "C": 36,
    "D": 42,
}
# str.capitalize() upper-cases the first character and lower-cases the rest,
# normalising the names so they line up with the keys of `score`.
n = df["Name"].str.capitalize()
n

value1    A
value2    B
value3    C
value4    D
Name: Name, dtype: object

In [11]:
# Look up each normalised name in `score` and store the result as a new "Score" column.
df["Score"] = n.map(score)
df

Unnamed: 0,Name,Age,Score
value1,A,42,24
value2,B,54,30
value3,C,20,36
value4,D,63,42


#### Replacing

Replacing is a method used to replace values in a Series or DataFrame.

In [14]:
# Integer Series used by the replace() examples below.
s = pd.Series(data=[14, 55, 87, 966, 78, 44])
s

0     14
1     55
2     87
3    966
4     78
5     44
dtype: int64

In [15]:
# Replace a single value: every 87 becomes NaN. A new Series is returned.
s.replace(to_replace=87, value=np.nan)

0     14.0
1     55.0
2      NaN
3    966.0
4     78.0
5     44.0
dtype: float64

In [16]:
# Replace several values at once; the two lists pair up positionally
# (78 -> NaN, 14 -> 566).
s.replace(to_replace=[78, 14], value=[np.nan, 566])

0    566.0
1     55.0
2     87.0
3    966.0
4      NaN
5     44.0
dtype: float64

#### Renaming

In [18]:
# 3x4 frame holding 0..11 row by row, with integer row labels 0-2 and columns a-d.
# pandas and numpy are already imported in the first cell, so they are not
# re-imported here.
df = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=[0, 1, 2],
    columns=["a", "b", "c", "d"],
)
df

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [19]:
# Relabel the integer row index 0/1/2 as "one"/"two"/"three".
# `s` maps each old label (its index) to the new label (its value).
s = pd.Series(["one", "two", "three"])
# rename() leaves labels that are not keys of the mapping untouched, so running
# this cell a second time is a no-op. `df.index.map(s)` would instead turn the
# already-renamed labels into NaN on a re-run.
df = df.rename(index=s.to_dict())
df

Unnamed: 0,a,b,c,d
one,0,1,2,3
two,4,5,6,7
three,8,9,10,11


In [20]:
# Transform labels with functions: title-case the row labels and upper-case the
# column names. The renamed frame is only displayed; `df` is not modified.
df.rename(index=str.title, columns=str.upper)

Unnamed: 0,A,B,C,D
One,0,1,2,3
Two,4,5,6,7
Three,8,9,10,11


We can also change individual row or column names by passing a dictionary to the rename method.

In [22]:
# Rename one row label ("one" -> "10") and one column ("d" -> "f") via dict mappings;
# labels that are not keys of a mapping are left as they are.
# The result is rebound to `df` instead of mutating the frame with inplace=True,
# so the cell is a plain "input -> new value" step.
df = df.rename(index={"one": "10"}, columns={"d": "f"})
df

Unnamed: 0,a,b,c,f
10,0,1,2,3
two,4,5,6,7
three,8,9,10,11


#### Finding Specific Values in a Dataset

In [24]:
# 1000x4 frame of standard-normal draws.
# Seed NumPy's global random state first so that every re-run produces the same
# numbers; without a seed the values change on every run.
np.random.seed(42)
data = pd.DataFrame(np.random.randn(1000, 4))
data

Unnamed: 0,0,1,2,3
0,-1.067569,0.579523,0.472205,0.648776
1,-1.406084,0.001911,-0.355387,0.313135
2,0.720195,1.172741,0.017576,1.776848
3,-0.243014,-1.321209,-1.052499,1.307300
4,1.539789,1.503651,-0.789117,1.834564
...,...,...,...,...
995,-0.535885,-0.018117,0.281553,1.515131
996,-1.036594,0.435353,1.090888,-0.073749
997,-0.023713,-0.901607,-1.279934,-0.719906
998,1.485677,0.485427,-0.106982,-1.347156


In [25]:
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,0,1,2,3
0,-1.067569,0.579523,0.472205,0.648776
1,-1.406084,0.001911,-0.355387,0.313135
2,0.720195,1.172741,0.017576,1.776848
3,-0.243014,-1.321209,-1.052499,1.3073
4,1.539789,1.503651,-0.789117,1.834564


In [26]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.024922,-0.029414,-0.052484,0.057147
std,1.035269,1.022326,0.967774,1.046243
min,-3.117065,-3.092384,-2.937112,-3.069391
25%,-0.700705,-0.699404,-0.720745,-0.668194
50%,0.008356,-0.061537,-0.071768,0.082919
75%,0.73439,0.67861,0.609636,0.762951
max,2.911298,3.592588,3.573737,3.579397


In [27]:
# Pull out the column labelled 1 (the second column) as a Series.
col = data.loc[:, 1]

In [28]:
# Keep only the entries whose absolute value exceeds 3.
col[col.abs() > 3]

190   -3.092384
308    3.075052
725    3.192177
790    3.592588
836   -3.029862
Name: 1, dtype: float64