In [108]:
import pandas as pd
# pd.set_option("display.max_columns", None)   # show all columns
# pd.set_option("display.width", None)         # don't wrap lines
# pd.set_option("display.max_colwidth", None)  # show full cell contents (no truncation)
# pd.set_option("display.width", 200)
# pd.set_option("display.max_columns", None)


In [109]:
# Toy records for the missing-value demo; each column contains at least
# one gap (None) so that every cleaning strategy below has work to do.
data = {
    'name':   ['Alice', 'Bob', 'Charlie', None, 'David'],                  # row 3 missing
    'age':    [25, None, 35, None, 40],                                    # rows 1 and 3 missing
    'salary': [50000, 60000, 70000, None, None],                           # rows 3 and 4 missing
    'city':   ['New York', 'Los Angeles', None, 'Chicago', 'Houston'],     # row 2 missing
}

# Columns follow the insertion order of the dict keys.
df = pd.DataFrame(data=data)
print("Original DataFrame:\n", df)

Original DataFrame:
       name   age   salary         city
0    Alice  25.0  50000.0     New York
1      Bob   NaN  60000.0  Los Angeles
2  Charlie  35.0  70000.0         None
3     None   NaN      NaN      Chicago
4    David  40.0      NaN      Houston


In [110]:
# Per-column tally of missing entries (isna is the canonical name; isnull is its alias).
df.isna().sum()

name      1
age       2
salary    2
city      1
dtype: int64

In [111]:
# Strategy 1 -- drop: keep only the rows in which every field is present.
df_drop = df.dropna()
print("DataFrame after dropping missing values:", df_drop, sep='\n', end='\n\n')

# Strategy 2 -- impute: fill each column with its own replacement value.
# Text columns get a placeholder label, 'age' gets the column mean and
# 'salary' the column median (both computed from the non-missing entries).
df_fill = df.assign(
    name=df['name'].fillna('Unknown'),
    age=df['age'].fillna(df['age'].mean()),
    salary=df['salary'].fillna(df['salary'].median()),
    city=df['city'].fillna('Unknown'),
)
print("DataFrame after filling missing values:\n", df_fill)

DataFrame after dropping missing values:
    name   age   salary      city
0  Alice  25.0  50000.0  New York

DataFrame after filling missing values:
       name        age   salary         city
0    Alice  25.000000  50000.0     New York
1      Bob  33.333333  60000.0  Los Angeles
2  Charlie  35.000000  70000.0      Unknown
3  Unknown  33.333333  60000.0      Chicago
4    David  40.000000  60000.0      Houston


In [114]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder

# Toy categorical data; row 3 is missing both 'gender' and 'result'.
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie', 'Tokyo', 'David'],
    'gender': ['Female', 'Male', 'Female', None, 'Male'],
    'result': ['pass', 'fail', 'pass', None, 'pass']
})

# Forward-fill the gaps: each missing value takes the value of the row above.
# ffill() returns a new frame, so `df` itself keeps its missing values and
# re-running this cell always starts from the same input.
df_label = df.ffill()
df_ordinal = df_label.copy()
df_onehot = df_label.copy()

# initialize encoders
# One LabelEncoder per column: refitting a single encoder on 'result' would
# overwrite the classes learned from 'gender', so the gender codes could no
# longer be mapped back with inverse_transform().
label_encoders = {'gender': LabelEncoder(), 'result': LabelEncoder()}
# `le` is kept for backward compatibility; as before, it ends up being the
# encoder fitted on 'result'.
le = label_encoders['result']
oe = OrdinalEncoder()
# Dense output so the encoded array can be wrapped in a DataFrame below.
ohe = OneHotEncoder(sparse_output=False)

# fit and transform the 'gender' and 'result' columns
df_label['gender_encoded'] = label_encoders['gender'].fit_transform(df_label['gender'])
df_label['result_encoded'] = label_encoders['result'].fit_transform(df_label['result'])

# OrdinalEncoder handles several columns in a single fit.
df_ordinal[['gender_encoded', 'result_encoded']] = oe.fit_transform(
    df_ordinal[['gender', 'result']]
)

# One-hot: one indicator column per (feature, category) pair, aligned to the
# original index so the concat below lines the rows up correctly.
encoded_onehot = ohe.fit_transform(df_onehot[['gender', 'result']])
encoded_columns = ohe.get_feature_names_out(['gender', 'result'])
df_onehot_encoded = pd.DataFrame(encoded_onehot, columns=encoded_columns, index=df_onehot.index)
df_onehot = pd.concat([df_onehot, df_onehot_encoded], axis=1)

print('Using Label Encoding:')
print(df_label, end='\n\n')
print('Using Ordinal Encoding:')
print(df_ordinal, end='\n\n')
print('Using One-Hot Encoding:')
print(df_onehot)


Using Label Encoding:
      name  gender result  gender_encoded  result_encoded
0    Alice  Female   pass               0               1
1      Bob    Male   fail               1               0
2  Charlie  Female   pass               0               1
3    Tokyo  Female   pass               0               1
4    David    Male   pass               1               1

Using Ordinal Encoding:
      name  gender result  gender_encoded  result_encoded
0    Alice  Female   pass             0.0             1.0
1      Bob    Male   fail             1.0             0.0
2  Charlie  Female   pass             0.0             1.0
3    Tokyo  Female   pass             0.0             1.0
4    David    Male   pass             1.0             1.0

Using One-Hot Encoding:
      name  gender result  gender_Female  gender_Male  result_fail  result_pass
0    Alice  Female   pass            1.0          0.0          0.0          1.0
1      Bob    Male   fail            0.0          1.0          1.0      