In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Small demo frame containing the entries the following cells clean up:
# the codes 'D' and '?' in Sex and the value 290 in Age.
df = pd.DataFrame(
    dict(
        Sex=['M', 'F', 'F', 'D', '?'],
        Age=[29, 30, 24, 290, 25],
    )
)
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


In [3]:
# List the distinct codes that occur in the Sex column.
df['Sex'].unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [4]:
# Count how many rows carry each Sex code.
df['Sex'].value_counts()

Sex
F    2
M    1
D    1
?    1
Name: count, dtype: int64

In [5]:
# Replace one value with another ('D' -> 'F'). The result is only displayed,
# not assigned, so df itself is left unchanged.
df['Sex'].replace('D','F')

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [6]:
# A dict maps several old values to new values in one call.
# 'N' does not occur in this column, so only the 'D' -> 'F' entry has an effect.
# The result is only displayed, not assigned.
df['Sex'].replace({'D':'F','N':'M'})

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [7]:
# Nested dict form: outer keys are column names, inner dicts map
# old value -> new value within that column only.
# The result is only displayed, not assigned, so df is unchanged.
df.replace({
    'Sex':{'D':'F','N':'M'},
    'Age':{290:29}
})

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,F,29
4,?,25


In [8]:
# Boolean-mask selection: show the rows whose Age is above 100.
df[df['Age']>100]

Unnamed: 0,Sex,Age
3,D,290


In [9]:
# Scale ages above 100 down by a factor of ten (290 -> 29), modifying df in place.
# The mask is computed once and reused on both sides of the assignment.
# Integer floor division keeps the values integral, so the Age column stays an
# integer column instead of relying on float results being cast back.
implausible_age = df['Age'] > 100
df.loc[implausible_age, 'Age'] = df.loc[implausible_age, 'Age'] // 10

In [10]:
# Display the frame after the Age assignment. Sex still contains 'D' and '?'
# because the earlier replace() results were never assigned back.
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,29
4,?,25


## Dealing with Duplicates

In [11]:
# Series keyed by ambassador name with the country as value.
# Several countries occur more than once, which the duplicate-handling
# cells below rely on.
ambassadors = pd.Series({
    'Gerard Araud': 'France',
    'Kim Darroch': 'United Kingdom',
    'Peter Westmacott': 'United Kingdom',
    'Armando Vrricchio': 'Italy',
    'Peter Witting': 'Germany',
    'Peter Ammon': 'Germany',
    'Klaus Scharioth': 'Germany',
})

In [12]:
# Display the Series: the index holds the ambassador names, the values the countries.
ambassadors

Gerard Araud                 France
Kim Darroch          United Kingdom
Peter Westmacott     United Kingdom
Armando Vrricchio             Italy
Peter Witting               Germany
Peter Ammon                 Germany
Klaus Scharioth             Germany
dtype: object

In [13]:
# duplicated() compares values (not index labels): an entry is True when the same
# value already appeared earlier, so the first occurrence of each value is False.
ambassadors.duplicated()

Gerard Araud         False
Kim Darroch          False
Peter Westmacott      True
Armando Vrricchio    False
Peter Witting        False
Peter Ammon           True
Klaus Scharioth       True
dtype: bool

In [14]:
# keep='last': the final occurrence of each value is False and every
# earlier occurrence of that value is True.
ambassadors.duplicated(keep='last')

Gerard Araud         False
Kim Darroch           True
Peter Westmacott     False
Armando Vrricchio    False
Peter Witting         True
Peter Ammon           True
Klaus Scharioth      False
dtype: bool

In [15]:
# keep=False: every entry whose value occurs more than once is flagged True.
ambassadors.duplicated(keep=False)

Gerard Araud         False
Kim Darroch           True
Peter Westmacott      True
Armando Vrricchio    False
Peter Witting         True
Peter Ammon           True
Klaus Scharioth       True
dtype: bool

In [16]:
# Keep only the first entry for each value; later repeats are dropped.
# Returns a new Series; `ambassadors` itself is not modified.
ambassadors.drop_duplicates()

Gerard Araud                 France
Kim Darroch          United Kingdom
Armando Vrricchio             Italy
Peter Witting               Germany
dtype: object

In [17]:
# keep='last': keep the final entry for each value and drop the earlier ones.
ambassadors.drop_duplicates(keep='last')

Gerard Araud                 France
Peter Westmacott     United Kingdom
Armando Vrricchio             Italy
Klaus Scharioth             Germany
dtype: object

In [18]:
# keep=False: drop every entry whose value occurs more than once,
# leaving only values that appear exactly once.
ambassadors.drop_duplicates(keep=False)

Gerard Araud         France
Armando Vrricchio     Italy
dtype: object

In [19]:
# Demo frame for row-level duplicates: rows 0 and 2 are identical,
# while row 4 repeats the Name with a different Pos.
players = pd.DataFrame(
    {
        'Name': [
            'Kobe Bryant',
            'Lebron James',
            'Kobe Bryant',
            'Carmelo Anthony',
            'Kobe Bryant',
        ],
        'Pos': ['SG', 'SF', 'SG', 'SF', 'SF'],
    }
)

In [20]:
# Display the frame. Rows 0 and 2 match in every column; row 4 shares only the Name.
players

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,Lebron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [21]:
# On a DataFrame, duplicated() compares entire rows: only row 2, which repeats
# row 0 in every column, is flagged.
players.duplicated()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [22]:
# subset limits the comparison to the listed columns. With keep='last', the last
# row for each Name is False and earlier rows with the same Name are True.
players.duplicated(subset=['Name'],keep='last')

0     True
1    False
2     True
3    False
4    False
dtype: bool

In [23]:
# Drop rows that repeat an earlier row in every column (row 2 here).
# Returns a new frame; `players` itself is not modified.
players.drop_duplicates()

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,Lebron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [24]:
# De-duplicate on Name only, keeping the first row seen for each Name.
players.drop_duplicates(subset=['Name'])

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,Lebron James,SF
3,Carmelo Anthony,SF


In [25]:
# De-duplicate on Name only, keeping the last row seen for each Name.
players.drop_duplicates(subset=['Name'],keep='last')

Unnamed: 0,Name,Pos
1,Lebron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


### Splitting Columns

In [26]:
# One underscore-delimited string per row. The entries are deliberately messy:
# stray spaces, '?' after some years, mixed-case sex codes and, in the fourth
# entry, a missing underscore.
df = pd.Series(
    [
        '1987_M_US _1',
        '1990?_M_UK_1',
        '1992_f_UK_2',
        '1970?_M  IT_1',
        '1985_F_I   T_2',
    ],
    name='Data',
).to_frame()

In [27]:
# Display the raw underscore-delimited strings.
df

Unnamed: 0,Data
0,1987_M_US _1
1,1990?_M_UK_1
2,1992_f_UK_2
3,1970?_M IT_1
4,1985_F_I T_2


In [28]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    5 non-null      object
dtypes: object(1)
memory usage: 172.0+ bytes


In [29]:
# Split each string on '_'; every element of the result is a list of pieces.
# Row 3 yields only three pieces because 'M  IT' has spaces where an
# underscore would be expected.
df['Data'].str.split('_')

0      [1987, M, US , 1]
1      [1990?, M, UK, 1]
2       [1992, f, UK, 2]
3      [1970?, M  IT, 1]
4    [1985, F, I   T, 2]
Name: Data, dtype: object

### `expand=True` makes `str.split` return a DataFrame with one column per split piece

In [30]:
# expand=True spreads the pieces over separate columns and returns a DataFrame.
# Rows that produce fewer pieces (row 3) are padded with missing values on the right.
df['Data'].str.split('_',expand=True)

Unnamed: 0,0,1,2,3
0,1987,M,US,1.0
1,1990?,M,UK,1.0
2,1992,f,UK,2.0
3,1970?,M IT,1,
4,1985,F,I T,2.0


In [31]:
# Overwrite df with the expanded split result (columns 0..3).
# NOTE: this cell is not idempotent. After it runs, df no longer has a 'Data'
# column, so running it again requires re-running the cell that builds df first.
df = df['Data'].str.split('_', expand=True)

In [32]:
# Name the four positional columns produced by the split.
# Row 3 split into only three pieces, so its values are shifted: Sex holds 'M  IT',
# Country holds '1' and 'No Children' is missing.
df.columns = ['Year','Sex','Country','No Children']

In [33]:
# Display the frame with its new column names.
df

Unnamed: 0,Year,Sex,Country,No Children
0,1987,M,US,1.0
1,1990?,M,UK,1.0
2,1992,f,UK,2.0
3,1970?,M IT,1,
4,1985,F,I T,2.0


In [34]:
# Flag the Year entries that contain a '?'. The pattern is a regular expression,
# where '?' is a metacharacter and must be escaped. It is written as a raw string
# so the backslash reaches the regex engine as-is; in a normal string literal
# '\?' is an invalid Python escape sequence and triggers a warning.
df['Year'].str.contains(r'\?')

0    False
1     True
2    False
3     True
4    False
Name: Year, dtype: bool

In [35]:
# True for rows whose Country contains the letter 'U'.
df['Country'].str.contains('U')

0     True
1     True
2     True
3    False
4    False
Name: Country, dtype: bool

In [36]:
# strip() removes only leading and trailing whitespace ('US ' -> 'US');
# interior spaces, as in 'I   T', remain. The result is displayed, not assigned.
df['Country'].str.strip()

0       US
1       UK
2       UK
3        1
4    I   T
Name: Country, dtype: object

In [37]:
# Remove every space, including interior ones ('I   T' -> 'IT').
# The result is displayed, not assigned, so df['Country'] is unchanged.
df['Country'].str.replace(' ','')

0    US
1    UK
2    UK
3     1
4    IT
Name: Country, dtype: object

In [38]:
# Rewrite each "four digits, optionally followed by '?'" match with just the
# four digits captured in the named group 'year', and store the result in df.
year_pattern = r'(?P<year>\d{4})\??'
df['Year'] = df['Year'].str.replace(
    year_pattern,
    lambda found: found.group('year'),
    regex=True,
)

In [39]:
# Display the cleaned Year column; the values are still strings (dtype object),
# not integers.
df['Year']

0    1987
1    1990
2    1992
3    1970
4    1985
Name: Year, dtype: object