In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.DataFrame({
    'A': [1, 2, None],
    'B': [None, 2, 3],
    'C': [4, None, 6]
})

In [3]:
# Count total missing values
df.isnull().sum().sum()

3

In [5]:
# Show which values are missing in a DataFrame (True/False).
df.isna()

Unnamed: 0,A,B,C
0,False,True,False
1,False,False,True
2,True,False,False


In [8]:
# Remove all rows with missing values.
df.dropna()

Unnamed: 0,A,B,C


In [13]:
# Fill missing values in a column with 0.
df[['A', 'B', 'C']].fillna(0)

Unnamed: 0,A,B,C
0,1.0,0.0,4.0
1,2.0,2.0,0.0
2,0.0,3.0,6.0


In [15]:
# Fill missing values in a column with "Not Available".
df[['A', 'B', 'C']].fillna('Not Available')

Unnamed: 0,A,B,C
0,1.0,Not Available,4.0
1,2.0,2.0,Not Available
2,Not Available,3.0,6.0


In [16]:
# Print the maximum and minimum value of a column.
df[['A', 'B', 'C']].max()

A    2.0
B    3.0
C    6.0
dtype: float64

In [17]:
df[['A', 'B', 'C']].min()

A    1.0
B    2.0
C    4.0
dtype: float64

In [20]:
# Find values in a column that are greater than 100.
df[df[['A', 'B', 'C']] > 100]

Unnamed: 0,A,B,C
0,,,
1,,,
2,,,


In [22]:
# Replace all values greater than 100 with 100.
df[df > 100] = 100
df

Unnamed: 0,A,B,C
0,1.0,,4.0
1,2.0,2.0,
2,,3.0,6.0


In [23]:
# Sort a column to check for extreme values.
df.sort_values(['A', 'B', 'C'], ascending=False)

Unnamed: 0,A,B,C
1,2.0,2.0,
0,1.0,,4.0
2,,3.0,6.0


In [24]:
# Remove the highest value from a numeric column.
df[df['A'] != df['A'].max()]

Unnamed: 0,A,B,C
0,1.0,,4.0
2,,3.0,6.0


In [28]:
df2 = pd.DataFrame({
    'Gender': ['Male', 'Female', 'Female', 'Female', 'Male'],
    'Age': [21, 23, 23, 22, 20],
    'Name': ['Steve', 'Zoya', 'Reshmi', 'Maggi', 'Max'],
    'City': ['Texas', 'Baghdad', 'Himachal', 'Tehran', 'Tokyo']
})
df2

Unnamed: 0,Gender,Age,Name,City
0,Male,21,Steve,Texas
1,Female,23,Zoya,Baghdad
2,Female,23,Reshmi,Himachal
3,Female,22,Maggi,Tehran
4,Male,20,Max,Tokyo


In [29]:
# Convert column Gender with Male/Female into 0/1.
df2['Gender'].map({'Male': 0, 'Female':1}).fillna(-1)

0    0
1    1
2    1
3    1
4    0
Name: Gender, dtype: int64

In [37]:
# Create separate columns for each City from a City column.
city_dummies = pd.get_dummies(df2['City'], prefix='City')
pd.concat([df2, city_dummies], axis=1)
df2.drop('City', axis=1)

Unnamed: 0,Gender,Age,Name
0,Male,21,Steve
1,Female,23,Zoya
2,Female,23,Reshmi
3,Female,22,Maggi
4,Male,20,Max


In [42]:
# Count how many times each category appears in a column.
df2[['Gender', 'Age']].value_counts()

Gender  Age
Female  23     2
        22     1
Male    20     1
        21     1
Name: count, dtype: int64

In [44]:
# Change all values in a column to lowercase.
df2['Gender'].str.lower()

0      male
1    female
2    female
3    female
4      male
Name: Gender, dtype: object

In [47]:
# Divide all values in a numeric column by 100.
df/100

Unnamed: 0,A,B,C
0,0.01,,0.04
1,0.02,0.02,
2,,0.03,0.06


In [48]:
# Subtract the minimum value from a numeric column.
df['A'] - df['A'].min()

0    0.0
1    1.0
2    NaN
Name: A, dtype: float64

In [51]:
# Convert all column values to range 0 to 1 manually.
col = 'A'

min_val = df[col].min()
max_val = df[col].max()

df[col] = (df[col] - min_val) / (max_val - min_val)
df

Unnamed: 0,A,B,C
0,0.0,,4.0
1,1.0,2.0,
2,,3.0,6.0


In [52]:
# Show the mean and standard deviation of a numeric column.
df['B'].mean()

2.5

In [53]:
df['C'].std()

1.4142135623730951

In [54]:
# Subtract column mean from each value (basic standardization).
df['B'] - df['B'].mean()

0    NaN
1   -0.5
2    0.5
Name: B, dtype: float64

In [55]:
# Round all values in a numeric column to 2 decimals.
df.round(2)

Unnamed: 0,A,B,C
0,0.0,,4.0
1,1.0,2.0,
2,,3.0,6.0
