# Hands on

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Small survey table seeded with dirty values on purpose: two unexpected
# Sex codes ('D' and '?') and one implausible Age (290).
df = pd.DataFrame(
    [('M', 29), ('F', 30), ('F', 24), ('D', 290), ('?', 25)],
    columns=['Sex', 'Age'],
)

In [3]:
# Show the raw frame: note the odd Sex codes ('D', '?') and the Age of 290.
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


In [5]:
# List the distinct Sex codes to spot unexpected entries such as 'D' and '?'.
df['Sex'].unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [6]:
# Count how often each code occurs; the stray 'D' and '?' each appear once.
df['Sex'].value_counts()

Sex
F    2
M    1
D    1
?    1
Name: count, dtype: int64

In [7]:
# Map the stray 'D' code to 'F'. replace() returns a new Series; the result is
# only displayed here and is not assigned back, so df itself stays unchanged.
df['Sex'].replace('D','F')

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [10]:
# A dict replaces several codes in one call ('D' -> 'F', '?' -> 'M').
# As above, the result is only displayed and is not stored in df.
df['Sex'].replace({'D':'F','?':'M'})

0    M
1    F
2    F
3    F
4    M
Name: Sex, dtype: object

In [18]:
# Preview the correction for ages above 100: true division turns 290 into 29.0
# (a float). Nothing is written back to df in this cell.
df.loc[df['Age']>100,'Age']/10

3    29.0
Name: Age, dtype: float64

In [23]:
# Ages above 100 are presumably a stray extra digit; scale only those entries
# down by a factor of ten and write them back into df in place.
implausible_age = df['Age'] > 100
df.loc[implausible_age, 'Age'] = df.loc[implausible_age, 'Age'] / 10

In [24]:
# Check the result: row 3 now shows Age 29; the Sex column is still uncleaned.
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,29
4,?,25


### Duplicates

In [25]:
# Ambassador name -> country represented. Names are unique (they become the
# index); several countries occur more than once, which the duplicate checks
# in the following cells rely on.
ambassadors = pd.Series({
    'Gerard Araud': 'France',
    'Kim Darroch': 'United Kingdom',
    'Keter Westmacott': 'United Kingdom',
    'Armando Varri': 'Italy',
    'Keter Wittig': 'Germany',
    'Keter Ammon': 'Germany',
    'Klaus Schar': 'Germany',
})

In [26]:
# Show the Series: the index holds ambassador names, the values their countries.
ambassadors

Gerard Araud                France
Kim Darroch         United Kingdom
Keter Westmacott    United Kingdom
Armando Varri                Italy
Keter Wittig               Germany
Keter Ammon                Germany
Klaus Schar                Germany
dtype: object

In [28]:
# Flag repeated countries. With the default setting the first occurrence of a
# value is left False and every later occurrence is marked True.
ambassadors.duplicated()

Gerard Araud        False
Kim Darroch         False
Keter Westmacott     True
Armando Varri       False
Keter Wittig        False
Keter Ammon          True
Klaus Schar          True
dtype: bool

In [29]:
# keep='last' reverses the direction: the final occurrence of each country is
# left False and the earlier ones are marked True.
ambassadors.duplicated(keep='last')

Gerard Araud        False
Kim Darroch          True
Keter Westmacott    False
Armando Varri       False
Keter Wittig         True
Keter Ammon          True
Klaus Schar         False
dtype: bool

In [30]:
# keep=False marks every member of a repeated group as True, so only countries
# that occur exactly once stay False.
ambassadors.duplicated(keep=False)

Gerard Araud        False
Kim Darroch          True
Keter Westmacott     True
Armando Varri       False
Keter Wittig         True
Keter Ammon          True
Klaus Schar          True
dtype: bool

In [31]:
# Return a new Series that keeps only the first ambassador listed per country.
ambassadors.drop_duplicates()

Gerard Araud             France
Kim Darroch      United Kingdom
Armando Varri             Italy
Keter Wittig            Germany
dtype: object

In [32]:
# Same de-duplication, but keep the last ambassador listed per country instead.
ambassadors.drop_duplicates(keep='last')

Gerard Araud                France
Keter Westmacott    United Kingdom
Armando Varri                Italy
Klaus Schar                Germany
dtype: object

In [33]:
# keep=False discards every country that appears more than once, leaving only
# the countries with a single ambassador.
ambassadors.drop_duplicates(keep=False)

Gerard Araud     France
Armando Varri     Italy
dtype: object

### Duplicates in DataFrames

In [37]:
# One row per (Name, pos) record. 'Kobe' appears three times: twice with
# position 'SG' and once with 'SF', which lets the following cells contrast
# whole-row duplicates with duplicates on the Name column only.
players = pd.DataFrame(
    [
        ('Kobe', 'SG'),
        ('Lebron', 'SF'),
        ('Kobe', 'SG'),
        ('Carmelo', 'SF'),
        ('Kobe', 'SF'),
    ],
    columns=['Name', 'pos'],
)

In [38]:
# Show the frame: rows 0 and 2 are identical, row 4 shares only the Name.
players

Unnamed: 0,Name,pos
0,Kobe,SG
1,Lebron,SF
2,Kobe,SG
3,Carmelo,SF
4,Kobe,SF


In [41]:
# Without a subset, a row counts as a duplicate only when all of its columns
# match an earlier row: row 2 repeats row 0, but row 4 (Kobe, SF) does not.
players.duplicated()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [42]:
# Compare on the Name column only: every 'Kobe' after the first is flagged.
players.duplicated(subset='Name')

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [43]:
# Still comparing on Name, but the last 'Kobe' is the one left unflagged.
players.duplicated(subset='Name',keep='last')

0     True
1    False
2     True
3    False
4    False
dtype: bool

In [44]:
# Keep the first row for each Name; the original index labels are preserved.
players.drop_duplicates(subset='Name')

Unnamed: 0,Name,pos
0,Kobe,SG
1,Lebron,SF
3,Carmelo,SF


In [45]:
# Keep the last row for each Name instead, so 'Kobe' survives as row 4 (SF).
players.drop_duplicates(subset='Name',keep='last')

Unnamed: 0,Name,pos
1,Lebron,SF
3,Carmelo,SF
4,Kobe,SF


In [46]:
# keep=False removes every row whose Name occurs more than once, so all the
# 'Kobe' rows are dropped.
players.drop_duplicates(subset='Name',keep=False)

Unnamed: 0,Name,pos
1,Lebron,SF
3,Carmelo,SF


# Text Handling

## Splitting Columns

In [47]:
# Each record packs four '_'-separated fields into a single string. Some
# entries carry a trailing '?' on the first field or stray spaces inside the
# third field; those blemishes are intentional input for the cells below.
packed_records = pd.Series(
    [
        '1987_M_US _1',
        '1990?_M_UK_1',
        '1992_F_US_2',
        '1970?_M_   IT_1',
        '1985_F_I  T_2',
    ],
    name='Data',
)
df = packed_records.to_frame()

In [48]:
# Show the packed strings. Note that the rendered table collapses runs of
# spaces, so the stored values contain more whitespace than is visible here.
df

Unnamed: 0,Data
0,1987_M_US _1
1,1990?_M_UK_1
2,1992_F_US_2
3,1970?_M_ IT_1
4,1985_F_I T_2


In [50]:
# Split every string at '_'. Each row becomes a Python list of its pieces, and
# the surrounding whitespace inside a piece is kept as-is.
df['Data'].str.split('_')

0       [1987, M, US , 1]
1       [1990?, M, UK, 1]
2        [1992, F, US, 2]
3    [1970?, M,    IT, 1]
4      [1985, F, I  T, 2]
Name: Data, dtype: object

In [51]:
# expand=True spreads the pieces across separate columns (labelled 0-3 here)
# instead of returning one list per row.
df['Data'].str.split('_',expand=True)

Unnamed: 0,0,1,2,3
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [52]:
# Break each packed record apart at the underscores and keep the resulting
# four-column frame (labelled 0-3) under the same name. Because `df` is
# rebound, the 'Data' column no longer exists afterwards, so this cell cannot
# be re-run without first recreating the original frame.
df = df['Data'].str.split(pat='_', expand=True)

In [53]:
# Replace the positional column labels 0-3 with descriptive names.
df = df.set_axis(['Year', 'Sex', 'Country', 'Children'], axis='columns')

In [55]:
# Show the split, labelled frame. Year still carries '?' marks and Country
# still contains stray spaces (e.g. 'I T'), so the values are not yet clean.
df

Unnamed: 0,Year,Sex,Country,Children
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2
