# Cleaning Not Null Values

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Small demo frame seeded with bad data on purpose: 'D' and '?' in Sex,
# and 290 in Age.
df = pd.DataFrame(
    data={
        'Sex': ['M', 'F', 'F', 'D', '?'],
        'Age': [29, 30, 24, 290, 25],
    }
)
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


*The DataFrame above contains invalid values (an Age of 290 and a Sex of `D`) as well as a placeholder for a missing value (`?`).*

### Finding Unique Values

*The first step in cleaning invalid values is to notice them, then identify them, and finally handle them appropriately.*

In [3]:
df['Sex'].unique() # Distinct values of the column, in order of first appearance

array(['M', 'F', 'D', '?'], dtype=object)

In [4]:
df['Sex'].value_counts() # Number of occurrences of each distinct value, most frequent first

Sex
F    2
M    1
D    1
?    1
Name: count, dtype: int64

In [5]:
# Swap the single bad code 'D' for 'F'. The result is a new Series that is
# only displayed here; df itself is left untouched.
df['Sex'].replace(to_replace='D', value='F')

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [6]:
# Several replacements at once: each key of the mapping is replaced by its
# value ('D' -> 'F', '?' -> 'M'). The result is displayed, not stored.
df['Sex'].replace(
    to_replace={
        'D': 'F',
        '?': 'M',
    }
)

0    M
1    F
2    F
3    F
4    M
Name: Sex, dtype: object

In [7]:
# Frame-level replace with a nested mapping: the outer keys name the column,
# the inner mapping gives old value -> new value for that column only.
# The cleaned frame is displayed, not assigned back to df.
df.replace(
    to_replace={
        'Sex': {'D': 'F', '?': 'M'},
        'Age': {290: 29},
    }
)

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,F,29
4,M,25


In [8]:
# Boolean-mask selection: keep only the rows whose Age exceeds 100
df.loc[df['Age'].gt(100)]

Unnamed: 0,Sex,Age
3,D,290


In [9]:
# Ages above 100 are treated as carrying a stray trailing digit (290 -> 29).
# The mask is computed once and reused on both sides of the assignment.
# Floor division keeps the corrected values as integers, so the Age column
# never receives floats the way it would with true division (/).
implausible_age = df['Age'] > 100
df.loc[implausible_age, 'Age'] = df.loc[implausible_age, 'Age'] // 10

In [10]:
# Only Age changed (row 3 is now 29). Sex still holds 'D' and '?' because the
# replace() results above were displayed but never assigned back to df.
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,29
4,?,25


### Duplicates

In [11]:
# Series keyed by ambassador name, holding a country as the value.
# Several countries appear more than once, which is what the duplicate
# examples below rely on.
Ambassadors = pd.Series(
    {
        'Gérard Araud': 'France',
        'Kim Darroch': 'United Kingdom',
        'Peter Westmacott': 'United Kingdom',
        'Armando Varricchio': 'Italy',
        'Peter Wittig': 'Germany',
        'Clemens von Goetze': 'Germany',
        'Emily Haber': 'Germany',
    }
)
Ambassadors

Gérard Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
Clemens von Goetze           Germany
Emily Haber                  Germany
dtype: object

In [12]:
# True for each repeat of a value that has already appeared; the first
# occurrence of every value stays False (keep='first' is the default).
Ambassadors.duplicated(keep='first')

Gérard Araud          False
Kim Darroch           False
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig          False
Clemens von Goetze     True
Emily Haber            True
dtype: bool

In [13]:
Ambassadors.duplicated(keep='last') # True for every occurrence of a repeated value except the last one

Gérard Araud          False
Kim Darroch            True
Peter Westmacott      False
Armando Varricchio    False
Peter Wittig           True
Clemens von Goetze     True
Emily Haber           False
dtype: bool

In [14]:
Ambassadors.duplicated(keep=False) # True for every occurrence of any value that appears more than once

Gérard Araud          False
Kim Darroch            True
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig           True
Clemens von Goetze     True
Emily Haber            True
dtype: bool

In [15]:
# New Series with repeated values removed; the first occurrence of each
# value is the one that survives (keep='first' is the default).
Ambassadors.drop_duplicates(keep='first')

Gérard Araud                  France
Kim Darroch           United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
dtype: object

In [16]:
Ambassadors.drop_duplicates(keep='last') # Drops repeated values, keeping only the last occurrence of each

Gérard Araud                  France
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Emily Haber                  Germany
dtype: object

In [17]:
Ambassadors.drop_duplicates(keep=False) # Drops every value that appears more than once; only values that occur exactly once remain

Gérard Araud          France
Armando Varricchio     Italy
dtype: object

### Duplicates in DataFrames

In [18]:
# One (Name, Pos) record per row. Rows 0 and 2 are identical; row 4 repeats
# the Name but has a different Pos, so it is only a duplicate by Name.
players = pd.DataFrame(
    [
        ('Kobe Bryant', 'SG'),
        ('LeBron James', 'SF'),
        ('Kobe Bryant', 'SG'),
        ('Carmelo Anthony', 'SF'),
        ('Kobe Bryant', 'Sd'),
    ],
    columns=['Name', 'Pos'],
)
players

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,Sd


In [19]:
# A row counts as a duplicate only when every column matches an earlier row;
# the first such row stays False (keep='first' is the default).
players.duplicated(keep='first')

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [20]:
# Compare on the Name column only: a row is flagged when its Name has already
# appeared in an earlier row, whatever its Pos.
players.duplicated(subset='Name', keep='first')

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [21]:
# Same Name-only comparison, but the last row for each Name is the one left
# unflagged; every earlier row with that Name is True.
players.duplicated(subset='Name', keep='last')

0     True
1    False
2     True
3    False
4    False
dtype: bool

In [22]:
# New DataFrame without fully identical rows; the first of each set of
# identical rows is kept (keep='first' is the default).
players.drop_duplicates(keep='first')

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,Sd


In [23]:
# One row per Name: the first row seen for each Name is kept, later rows with
# the same Name are dropped.
players.drop_duplicates(subset='Name', keep='first')


Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF


In [24]:
players.drop_duplicates(subset=['Name'], keep='last') # One row per Name, keeping the last row seen for each Name

Unnamed: 0,Name,Pos
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,Sd


## Text Handling

### Splitting Columns

In [25]:
# Each entry packs four '_'-separated fields into one string. Stray spaces
# and '?' marks are left in on purpose so the cells below have something to
# clean.
df = pd.Series(
    [
        '1987_M_US _1',
        '1990?_M_UK_1',
        '1992_F_US_2',
        '1970?_M_   IT_1',
        '1985_F_I  T_2',
    ],
    name='Data',
).to_frame()
df

Unnamed: 0,Data
0,1987_M_US _1
1,1990?_M_UK_1
2,1992_F_US_2
3,1970?_M_ IT_1
4,1985_F_I T_2


In [26]:
# Break every string at each '_'; each row becomes a list of its fields
df['Data'].str.split(pat='_')

0       [1987, M, US , 1]
1       [1990?, M, UK, 1]
2        [1992, F, US, 2]
3    [1970?, M,    IT, 1]
4      [1985, F, I  T, 2]
Name: Data, dtype: object

In [27]:
# Same split, but expand=True spreads the fields across separate columns
# (labelled 0, 1, 2, ...) instead of returning one list per row.
df['Data'].str.split(pat='_', expand=True)

Unnamed: 0,0,1,2,3
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [28]:
# Rebinds df to the expanded frame (one column per '_'-separated field).
# NOTE: after this runs df no longer has a 'Data' column, so this cell cannot
# be re-run without first re-creating df from the raw strings.
df = df['Data'].str.split('_', expand=True)

In [29]:
# Labels are assigned by position, left to right, to the columns produced by
# the split above.
df.columns = ['Year', 'Sex', 'Country', 'Children']

In [30]:
# Show the frame with its new column labels; the values are still uncleaned
# ('?' in Year, stray spaces in Country).
df

Unnamed: 0,Year,Sex,Country,Children
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [31]:
# Flag the years that carry a literal '?' marker. The pattern is treated as a
# regular expression, so '?' has to be escaped. The r'' (raw string) prefix
# passes the backslash through unchanged; without it '\?' is an invalid
# string escape that Python warns about.
df['Year'].str.contains(r'\?')

  df['Year'].str.contains('\?') # Returns a boolean Series where a value contains a '?'


0    False
1     True
2    False
3     True
4    False
Name: Year, dtype: bool

In [32]:
df['Country'].str.contains('U') # True where the value contains 'U'; values with only an 'I' (the 'IT' rows) are False

0     True
1     True
2     True
3    False
4    False
Name: Country, dtype: bool

In [33]:
df['Country'].str.strip() # Strips whitespace from both ends only; interior spaces (row 4, 'I  T') are left in place

0      US
1      UK
2      US
3      IT
4    I  T
Name: Country, dtype: object

In [34]:
# Delete every space character, wherever it sits in the string (leading,
# trailing or interior). regex=False states that ' ' is matched literally.
df['Country'].str.replace(' ', '', regex=False)

0    US
1    UK
2    US
3    IT
4    IT
Name: Country, dtype: object