In [1]:
import numpy as np
import pandas as pd

## 1. Part - Data Cleaning: NaN (Not a Number) Values

In [2]:
# np.nan is the floating-point marker for a missing value; isnull() reports it as null.
value = np.nan
pd.isnull(value)

True

In [3]:
# None also counts as missing, so notnull() (the inverse of isnull) returns False.
value = None
pd.notnull(value)

False

In [4]:
# Element-wise check: None and np.nan are null, while 2 and the empty string are not.
pd.isnull(pd.Series([None, 2, np.nan, ""]))

0     True
1    False
2     True
3    False
dtype: bool

In [5]:
# On a DataFrame, isnull() returns a boolean frame of the same shape,
# with True wherever the original cell is NaN.
pd.isnull(pd.DataFrame({
    'Sütun A': [1, np.nan, 7],
    'Sütun B': [np.nan, 2, np.nan]
}))

Unnamed: 0,Sütun A,Sütun B
0,False,True
1,True,False
2,False,True


In [6]:
# sum() ignores the NaN entry here: 1 + 2 gives 3.0.
pd.Series([1, 2, np.nan]).sum()

3.0

In [7]:
# Sample series with missing entries at positions 2 and 4.
s = pd.Series([1, 2, np.nan, 4, np.nan])
# Select only the non-null entries through a boolean mask.
s[s.notnull()]

0    1.0
1    2.0
3    4.0
dtype: float64

In [8]:
# dropna() returns a new Series without the NaN entries; the remaining
# values keep their original index labels (0, 1, 3).
s.dropna()

0    1.0
1    2.0
3    4.0
dtype: float64

## 2. Part - Data Cleaning: Filling DataFrames and Null Values

In [10]:
# Example frame with NaN values scattered over columns A, B and C;
# 'Sütun D' is the only column without a missing value.
df = pd.DataFrame({
    'Sütun A': [1, np.nan, 30, np.nan],
    'Sütun B': [2, 8, 31, np.nan],
    'Sütun C': [np.nan, 9, 32, 100],
    'Sütun D': [5, 8, 34, 110],
})

In [11]:
# Display the frame to see where the missing values sit.
df

Unnamed: 0,Sütun A,Sütun B,Sütun C,Sütun D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [12]:
# Default dropna() removes every row that contains at least one NaN;
# only row 2 is complete, so it is the only row left.
df.dropna()

Unnamed: 0,Sütun A,Sütun B,Sütun C,Sütun D
2,30.0,31.0,32.0,34


In [13]:
# axis=1 drops columns instead of rows: every column holding a NaN is removed,
# leaving only 'Sütun D'.
df.dropna(axis=1)

Unnamed: 0,Sütun D
0,5
1,8
2,34
3,110


In [14]:
# Second example frame: row 1 is NaN in every column, row 0 only in 'Sütun C'.
df2 = pd.DataFrame({
    'Sütun A': [1, np.nan, 30],
    'Sütun B': [2, np.nan, 31],
    'Sütun C': [np.nan, np.nan, 100]
})

In [15]:
# Display the frame; note the fully empty row 1.
df2

Unnamed: 0,Sütun A,Sütun B,Sütun C
0,1.0,2.0,
1,,,
2,30.0,31.0,100.0


In [16]:
# how="all" drops a row only when every value in it is NaN, so row 1 is
# removed while the partially filled row 0 is kept.
df2.dropna(how="all")

Unnamed: 0,Sütun A,Sütun B,Sütun C
0,1.0,2.0,
2,30.0,31.0,100.0


In [17]:
# Display the series again; it still contains its NaN entries.
s

0    1.0
1    2.0
2    NaN
3    4.0
4    NaN
dtype: float64

In [18]:
# Replace every NaN with the constant 0 (returns a new Series).
s.fillna(0)

0    1.0
1    2.0
2    0.0
3    4.0
4    0.0
dtype: float64

In [19]:
# Replace every NaN with the mean of the series; mean() skips the NaN
# entries here, giving (1 + 2 + 4) / 3.
s.fillna(s.mean())

0    1.000000
1    2.000000
2    2.333333
3    4.000000
4    2.333333
dtype: float64

In [20]:
# Forward-fill: each NaN takes the last valid value that precedes it.
# Series.ffill() is used instead of fillna(method="ffill") because the
# `method` argument of fillna is deprecated in recent pandas releases.
s.ffill()

0    1.0
1    2.0
2    2.0
3    4.0
4    4.0
dtype: float64

In [21]:
# Backward-fill: each NaN takes the next valid value that follows it; the
# trailing NaN has no later value to copy from, so it stays NaN.
# Series.bfill() is used instead of fillna(method="bfill") because the
# `method` argument of fillna is deprecated in recent pandas releases.
s.bfill()

0    1.0
1    2.0
2    4.0
3    4.0
4    NaN
dtype: float64

## 3. Part - Data Cleaning: Duplicate Values and Text Processing

In [22]:
# Countries (values) keyed by ambassador name (index). Several countries
# appear more than once, which is what the duplicate-handling cells rely on.
# Index labels carry no surrounding whitespace, so a lookup by name such as
# ambassadors['Klaus Scharioth'] works as expected.
ambassadors = pd.Series([
    'France',
    'United Kingdom',
    'United Kingdom',
    'Italy',
    'Germany',
    'Germany',
    'Germany',
], index=[
    'Gérard Araud',
    'Kim Darroch',
    'Peter Westmacott',
    'Armando Varricchio',
    'Peter Wittig',
    'Peter Ammon',
    'Klaus Scharioth'
])

In [23]:
# Display the series: index = ambassador name, value = country.
ambassadors

Gérard Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
Peter Ammon                  Germany
Klaus Scharioth              Germany
dtype: object

In [25]:
# By default the first occurrence of a value is treated as the original;
# every later repeat is flagged True.
ambassadors.duplicated()

Gérard Araud          False
Kim Darroch           False
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig          False
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [27]:
# keep="last" treats the last occurrence as the original instead, so the
# earlier repeats are the ones flagged True.
ambassadors.duplicated(keep="last")

Gérard Araud          False
Kim Darroch            True
Peter Westmacott      False
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth       False
dtype: bool

In [28]:
# keep=False flags every entry whose value occurs more than once,
# without sparing any occurrence.
ambassadors.duplicated(keep=False)

Gérard Araud          False
Kim Darroch            True
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [29]:
# Remove every entry whose value is repeated; only the countries that
# appear exactly once (France, Italy) remain.
ambassadors.drop_duplicates(keep=False)

Gérard Araud          France
Armando Varricchio     Italy
dtype: object

In [30]:
# Keep the first occurrence of each country and drop the later repeats.
ambassadors.drop_duplicates(keep="first")

Gérard Araud                  France
Kim Darroch           United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
dtype: object

In [58]:
# Raw records packed into one underscore-separated string per row. The sample
# contains noise: a "?" after some years and stray spaces inside the third field.
# NOTE: this rebinds `df`, replacing the numeric frame built earlier in the notebook.
df = pd.DataFrame({
    'Data': [
        '1987_M_US _1',
        '1990?_M_UK_1',
        '1992_F_US_2',
        '1970?_M_   ITA_1',
        '1985_F_I  TA_2'
]})

In [59]:
# Split each string on "_"; the result is a Series holding one list of fields per row.
df["Data"].str.split("_")

0        [1987, M, US , 1]
1        [1990?, M, UK, 1]
2         [1992, F, US, 2]
3    [1970?, M,    ITA, 1]
4      [1985, F, I  TA, 2]
Name: Data, dtype: object

In [60]:
# expand=True spreads the split fields over separate columns (labelled 0..3)
# instead of returning lists.
df["Data"].str.split("_", expand=True)

Unnamed: 0,0,1,2,3
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,ITA,1
4,1985,F,I TA,2


In [61]:
# Rebind df to the four split columns. The "Data" column no longer exists
# afterwards, so re-running this cell without first re-running the cell that
# builds df from the raw strings raises a KeyError.
df = df["Data"].str.split("_", expand=True)

In [62]:
# Replace the positional column labels with descriptive names (assigned in place).
df.columns = ["Year", "Sex", "Country", "Number of Children"]

In [63]:
# Display the renamed frame; the values themselves are still uncleaned.
df

Unnamed: 0,Year,Sex,Country,Number of Children
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,ITA,1
4,1985,F,I TA,2


In [64]:
# Flag the years that carry a "?" marker. The pattern is matched as a literal
# string (regex=False), so no backslash escape is needed; the earlier "\?"
# spelling was an invalid escape sequence in a normal (non-raw) string literal,
# which newer Python versions warn about.
df["Year"].str.contains("?", regex=False)

0    False
1     True
2    False
3     True
4    False
Name: Year, dtype: bool

In [65]:
# Strip the "?" markers from the year strings. regex=False is stated explicitly
# so the result does not depend on the pandas version: when the pattern is
# matched literally, the earlier "\?" is searched for as a backslash followed
# by "?", which never occurs, so nothing would be removed.
df["Year"] = df["Year"].str.replace("?", "", regex=False)

  df["Year"] = df["Year"].str.replace("\?", "")


In [66]:
# Display the cleaned column; the values are still strings (dtype object).
df["Year"]

0    1987
1    1990
2    1992
3    1970
4    1985
Name: Year, dtype: object

In [67]:
# Remove every space character from the country codes; the single space is
# matched literally.
df["Country"] = df["Country"].str.replace(" ", "", regex=False)

In [68]:
# Display the cleaned country codes.
df["Country"]

0     US
1     UK
2     US
3    ITA
4    ITA
Name: Country, dtype: object

In [69]:
# Display the final cleaned frame.
df

Unnamed: 0,Year,Sex,Country,Number of Children
0,1987,M,US,1
1,1990,M,UK,1
2,1992,F,US,2
3,1970,M,ITA,1
4,1985,F,ITA,2
