In [2]:
import numpy as np
import pandas as pd

In [None]:
# Small demo frame seeded with dirty entries: 'D' and '?' in Sex, 290 in Age.
# Each tuple below is one row, in (Sex, Age) order.
df = pd.DataFrame(
    data=[
        ('M', 29),
        ('F', 30),
        ('F', 24),
        ('D', 290),
        ('?', 25),
    ],
    columns=['Sex', 'Age'],
)
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


In [None]:
# Summarise the frame: index range, per-column non-null counts, dtypes and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Sex     5 non-null      object
 1   Age     5 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 208.0+ bytes


Unique Values

In [None]:
# List the distinct values found in the Sex column to spot unexpected categories.
df['Sex'].unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [None]:
# Count how often each distinct Sex value occurs.
df['Sex'].value_counts()

F    2
D    1
M    1
?    1
Name: Sex, dtype: int64

In [None]:
# Swap one bad value for a valid one. replace() returns a new Series;
# the result is only displayed here, so df itself is left unchanged.
df['Sex'].replace(to_replace='D', value='F')

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [None]:
# Several substitutions at once: each dict key is replaced by its value.
# The result is only displayed; df is not modified.
df['Sex'].replace(
    to_replace={
        'D': 'F',
        '?': 'M',
    }
)

0    M
1    F
2    F
3    F
4    M
Name: Sex, dtype: object

In [None]:
# Column-scoped replacement: the outer keys name columns and the inner dicts
# map old value -> new value within that column only.
# The result is only displayed; df is not modified.
df.replace(to_replace={
    'Sex': {'D': 'F', '?': 'M'},
    'Age': {290: 29},
})

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,F,29
4,M,25


In [None]:
# Show the rows whose Age is implausibly large (> 100).
df.loc[df['Age'] > 100]

Unnamed: 0,Sex,Age
3,D,290


In [None]:
# Scale down ages above 100 by a factor of ten, modifying df in place.
# NOTE(review): true division produces floats, so this can turn the whole
# Age column into float; use `// 10` if Age should stay integer.
age_too_high = df['Age'] > 100
df.loc[age_too_high, 'Age'] = df.loc[age_too_high, 'Age'] / 10

In [None]:
# Display the frame after the in-place Age correction.
df

Unnamed: 0,Sex,Age
0,M,29.0
1,F,30.0
2,F,24.0
3,D,29.0
4,?,25.0


# Dealing with Duplicate Values

In [None]:
# Ambassadors (index) mapped to the country they represent (values).
# Countries repeat on purpose so the duplicate-handling methods below have
# something to detect.
# The index labels must not carry stray whitespace: a trailing space would
# make label lookups such as amb['Klaus Scharioth'] raise KeyError.
amb = pd.Series(
    data=[
        'France',
        'United Kingdom',
        'United Kingdom',
        'Italy',
        'Germany',
        'Germany',
        'Germany',
    ],
    index=[
        'Gérard Araud',
        'Kim Darroch',
        'Peter Westmacott',
        'Armando Varricchio',
        'Peter Wittig',
        'Peter Ammon',
        'Klaus Scharioth',
    ],
)

In [None]:
# Display the Series: ambassador names form the index, countries are the values.
amb

Gérard Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
Peter Ammon                  Germany
Klaus Scharioth              Germany
dtype: object

In [None]:
# keep='first' (the default): scanning top to bottom, the first occurrence of
# each value stays False and every later repeat is flagged True.
amb.duplicated(keep='first')

Gérard Araud          False
Kim Darroch           False
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig          False
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [None]:
# Use the duplicate mask as a row selector to show only the repeated entries.
amb.loc[amb.duplicated()]

Peter Westmacott    United Kingdom
Peter Ammon                Germany
Klaus Scharioth            Germany
dtype: object

In [None]:
# keep='last': the final occurrence of each value stays False and every
# earlier occurrence is flagged True (equivalent to scanning bottom to top).
amb.duplicated(keep='last')

Gérard Araud          False
Kim Darroch            True
Peter Westmacott      False
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth       False
dtype: bool

In [None]:
# keep=False: every member of a duplicated group is flagged True, including the first and last.
amb.duplicated(keep=False)

Gérard Araud          False
Kim Darroch            True
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [None]:
# Keep only the first occurrence of each country (keep='first' is the default).
# Returns a new Series; amb is not modified.
amb.drop_duplicates(keep='first')

Gérard Araud                  France
Kim Darroch           United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
dtype: object

In [None]:
# Keep only the last occurrence of each country; earlier repeats are dropped.
amb.drop_duplicates(keep='last')

Gérard Araud                  France
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Klaus Scharioth              Germany
dtype: object

In [None]:
# Drop every value that occurs more than once, leaving only countries that appear exactly once.
amb.drop_duplicates(keep=False)

Gérard Araud          France
Armando Varricchio     Italy
dtype: object

# Duplicates in DataFrames

In [None]:
# Player/position pairs, one tuple per row in (Name, Pos) order.
# 'Kobe Bryant' appears three times: twice as SG and once as SF, which lets
# the cells below contrast whole-row duplicates with per-column duplicates.
pl = pd.DataFrame(
    data=[
        ('Kobe Bryant', 'SG'),
        ('LeBron James', 'SF'),
        ('Kobe Bryant', 'SG'),
        ('Carmelo Anthony', 'SF'),
        ('Kobe Bryant', 'SF'),
    ],
    columns=['Name', 'Pos'],
)

In [None]:
# Display the players frame.
pl

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [None]:
# Whole rows are compared: 'Kobe Bryant' appears three times, but the last
# row is not flagged because there the name is paired with a different Pos.
pl.duplicated()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [None]:
# keep='last': among identical rows the final one stays False and the earlier copies are flagged.
pl.duplicated(keep='last')

0     True
1    False
2    False
3    False
4    False
dtype: bool

In [None]:
# Compare on the Name column only, so every repeat of a name after its first appearance is flagged.
pl.duplicated(subset=['Name'])

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [None]:
# Compare on Name only and treat the last appearance of each name as the one to keep.
pl.duplicated(subset=['Name'],keep='last')

0     True
1    False
2     True
3    False
4    False
dtype: bool

# Text Handling

In [3]:
# Raw records packed as 'year_sex_country_children' strings. They are
# deliberately messy: stray '?' after some years and extra spaces in and
# around the country code.
# NOTE: this rebinds `df`, replacing the Sex/Age frame used earlier.
df = pd.Series(
    [
        '1987_M_US _1',
        '1990?_M_UK_1',
        '1992_F_US_2',
        '1970?_M_   IT_1',
        '1985_F_I  T_2',
    ],
    name='Data',
).to_frame()
df

Unnamed: 0,Data
0,1987_M_US _1
1,1990?_M_UK_1
2,1992_F_US_2
3,1970?_M_ IT_1
4,1985_F_I T_2


In [4]:
# .str is the accessor for vectorised string methods (datetime columns have
# .dt and categorical columns have .cat in the same way).
# Without expand=True each row becomes a Python list of the '_'-separated parts.
df['Data'].str.split('_')

0       [1987, M, US , 1]
1       [1990?, M, UK, 1]
2        [1992, F, US, 2]
3    [1970?, M,    IT, 1]
4      [1985, F, I  T, 2]
Name: Data, dtype: object

In [6]:
# Split each record on '_' and spread the parts across separate columns.
# NOTE: this rebinds `df` to the split result, which has no 'Data' column,
# so the cell cannot be re-run without first re-running the cell that builds df.
df = df['Data'].str.split(pat='_', expand=True)

In [7]:
# Give the four split-out columns meaningful names, in the order the fields appear in each record.
df.columns= ['Year','Sex','Country','No. of Children']

In [8]:
# Display the frame with its named columns.
df

Unnamed: 0,Year,Sex,Country,No. of Children
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [9]:
# strip() removes leading and trailing whitespace only, so spaces inside a
# value (e.g. 'I  T') are left in place.
# The result is only displayed; df['Country'] is not updated.
df['Country'].str.strip()

0      US
1      UK
2      US
3      IT
4    I  T
Name: Country, dtype: object

In [None]:
df = df['Year'].