## Data preparation

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Handling Missing Values in DataFrames

In [3]:
# Toy frame with gaps in every column; the NaN in "id" makes that column float.
df = pd.DataFrame(
    {
        "id": [np.nan, 2, 3, 4, 5],
        "grade": [np.nan, "b", np.nan, "c", np.nan],
        "award": [np.nan, "gold", "silver", "bronze", np.nan],
    }
)
display(df)

Unnamed: 0,award,grade,id
0,,,
1,gold,b,2.0
2,silver,,3.0
3,bronze,c,4.0
4,,,5.0


In [4]:
# Dropping rows or columns with missing values
# Here: rows (axis=0). how="any" discards a row as soon as one cell is NaN.

display(df.dropna(axis=0, how="any"))

Unnamed: 0,award,grade,id
1,gold,b,2.0
3,bronze,c,4.0


In [7]:
# Drop a row only when both "grade" and "award" are missing; "id" is not inspected.
display(df.dropna(axis=0, subset=["grade", "award"], how="all"))

Unnamed: 0,award,grade,id
1,gold,b,2.0
2,silver,,3.0
3,bronze,c,4.0


In [8]:
# Imputing missing values
# One constant per column; columns missing from the mapping ("id") keep their NaN.
values = {
    "grade": "e",
    "award": "iron",
}
display(df.fillna(values))

Unnamed: 0,award,grade,id
0,iron,e,
1,gold,b,2.0
2,silver,e,3.0
3,bronze,c,4.0
4,iron,e,5.0


In [None]:
# Impute the numeric column with its mean (Series.mean skips NaN by default).
# The result is assigned back to the column: calling
# fillna(..., inplace=True) on df["id"] is chained assignment on an
# intermediate Series, which warns in pandas 2.x and leaves df unchanged
# once Copy-on-Write is active.
df["id"] = df["id"].fillna(df["id"].mean())
display(df)

In [None]:
# Impute the categorical column with its most frequent value. mode() returns
# a Series (ties yield several entries, in sorted order), so [0] takes the first.
# The result is assigned back to the column: calling
# fillna(..., inplace=True) on df["award"] is chained assignment on an
# intermediate Series, which warns in pandas 2.x and leaves df unchanged
# once Copy-on-Write is active.
df["award"] = df["award"].fillna(df["award"].mode()[0])
display(df)

In [6]:
# Binning in DataFrames

# Equal-width binning (using cut)

# Seed NumPy's global generator so the sample drawn here, and the samples drawn
# by later np.random calls when the notebook runs top to bottom, are reproducible.
np.random.seed(42)
df = pd.DataFrame({"values": np.random.rand(100)})
# Ten intervals of equal width spanning the observed range;
# retbins=True additionally returns the array of bin edges.
res, bins = pd.cut(df["values"], bins=10, retbins=True)
display(res)

0       (0.897, 0.997]
1         (0.401, 0.5]
2       (0.599, 0.699]
3       (0.699, 0.798]
4         (0.5, 0.599]
5       (0.599, 0.699]
6         (0.5, 0.599]
7       (0.202, 0.302]
8       (0.103, 0.202]
9       (0.599, 0.699]
10      (0.202, 0.302]
11        (0.5, 0.599]
12      (0.202, 0.302]
13      (0.202, 0.302]
14      (0.699, 0.798]
15      (0.103, 0.202]
16      (0.103, 0.202]
17      (0.897, 0.997]
18      (0.599, 0.699]
19    (0.00259, 0.103]
20      (0.699, 0.798]
21      (0.302, 0.401]
22        (0.5, 0.599]
23      (0.699, 0.798]
24      (0.897, 0.997]
25      (0.202, 0.302]
26      (0.897, 0.997]
27      (0.897, 0.997]
28      (0.798, 0.897]
29      (0.302, 0.401]
            ...       
70        (0.401, 0.5]
71        (0.5, 0.599]
72        (0.5, 0.599]
73      (0.103, 0.202]
74      (0.699, 0.798]
75    (0.00259, 0.103]
76      (0.202, 0.302]
77      (0.202, 0.302]
78      (0.897, 0.997]
79      (0.699, 0.798]
80        (0.5, 0.599]
81      (0.103, 0.202]
82      (0.

In [None]:
# Show `res` again; read top to bottom it still holds the equal-width pd.cut result.
display(res)

In [4]:
# Bin a second sample with the edges already stored in `bins` (equal width)
# instead of deriving new ones, so the intervals stay comparable.
# Values that fall outside those edges come back as NaN.
df2 = pd.DataFrame(np.random.rand(100), columns=["values"])
new_res = pd.cut(df2["values"], bins=bins)
display(new_res)

0      (0.11, 0.208]
1       (0.701, 0.8]
2     (0.504, 0.603]
3       (0.8, 0.899]
4     (0.504, 0.603]
5       (0.8, 0.899]
6     (0.406, 0.504]
7       (0.8, 0.899]
8     (0.0103, 0.11]
9     (0.899, 0.997]
10    (0.899, 0.997]
11    (0.307, 0.406]
12    (0.307, 0.406]
13    (0.504, 0.603]
14      (0.701, 0.8]
15    (0.0103, 0.11]
16      (0.701, 0.8]
17    (0.406, 0.504]
18    (0.307, 0.406]
19      (0.8, 0.899]
20     (0.11, 0.208]
21    (0.208, 0.307]
22    (0.406, 0.504]
23    (0.406, 0.504]
24      (0.701, 0.8]
25    (0.406, 0.504]
26    (0.307, 0.406]
27     (0.11, 0.208]
28    (0.208, 0.307]
29    (0.307, 0.406]
           ...      
70     (0.11, 0.208]
71      (0.8, 0.899]
72    (0.603, 0.701]
73    (0.504, 0.603]
74    (0.406, 0.504]
75    (0.406, 0.504]
76    (0.307, 0.406]
77      (0.8, 0.899]
78    (0.307, 0.406]
79    (0.406, 0.504]
80    (0.307, 0.406]
81     (0.11, 0.208]
82    (0.504, 0.603]
83      (0.8, 0.899]
84     (0.11, 0.208]
85      (0.701, 0.8]
86    (0.406,

In [5]:
# Equal-frequency binning (using qcut): ten quantile bins labelled "a".."j".
# Note that this rebinds both `res` and `bins`.
res, bins = pd.qcut(
    df["values"],
    q=10,
    labels=list("abcdefghij"),
    retbins=True,
)
display(bins)

array([0.01131218, 0.1534712 , 0.30666655, 0.36972967, 0.46283631,
       0.53645228, 0.59967209, 0.67653062, 0.81122736, 0.89036246,
       0.9971294 ])

In [None]:
# Show the per-row bin labels; read top to bottom, `res` comes from the pd.qcut cell.
display(res)

In [None]:
# min-max normalization
# Sample: 100 draws from a standard normal distribution.
df = pd.DataFrame(np.random.randn(100), columns=["values"])
df.head()

In [None]:
min = df["values"].min()
display(min)

In [None]:
max = df["values"].max()
display(max)

In [None]:
df["values"] = [(x-min)/(max-min) for x in df["values"]]
df.head()

In [None]:
# z-normalization
# Fresh sample: 100 draws from a standard normal distribution.
df = pd.DataFrame(np.random.randn(100), columns=["values"])
df.head()

In [None]:
mean = df["values"].mean()
display(mean)

In [None]:
std = df["values"].std()
display(std)

In [None]:
df["values"] = df["values"].apply(lambda x: (x-mean)/std)
df.head()

In [None]:
# Selection of top-ranked categorical features

In [7]:
# Small example frame: three candidate feature columns plus a 0/1 "class" column.
df = pd.DataFrame(
    {
        "id": [1, 2, 3, 4, 5],
        "grade": ["b", "b", "a", "c", "a"],
        "award": ["gold", "gold", "silver", "bronze", "bronze"],
        "class": [1, 1, 1, 0, 0],
    }
)
display(df)

Unnamed: 0,award,class,grade,id
0,gold,1,b,1
1,gold,1,b,2
2,silver,1,a,3
3,bronze,0,c,4
4,bronze,0,a,5


In [8]:
# Cast every column to categorical so group sizes can be reported for all
# categories, including those with no members in a group.
df = df.astype("category")

# For each feature column, and for each of its values, count the rows per class.
# observed=False is spelled out because the zero counts for classes that do
# not occur in a group are needed by the scoring step, and pandas 2.1+ warns
# that the default for categorical groupers is changing to True.
res = [
    (
        col,
        [
            group.groupby("class", observed=False).size().values
            for _, group in df.groupby(col, observed=False)
        ],
    )
    for col in df.columns.drop("class")
]
display(res)

[('award',
  [array([2, 0], dtype=int64),
   array([0, 2], dtype=int64),
   array([0, 1], dtype=int64)]),
 ('grade',
  [array([1, 1], dtype=int64),
   array([0, 2], dtype=int64),
   array([1, 0], dtype=int64)]),
 ('id',
  [array([0, 1], dtype=int64),
   array([0, 1], dtype=int64),
   array([0, 1], dtype=int64),
   array([1, 0], dtype=int64),
   array([1, 0], dtype=int64)])]

In [9]:
def score(values):
    """Simple scoring function: the largest max-minus-min spread among the groups.

    Parameters
    ----------
    values : iterable of array-like
        One sequence of numbers per group (in this notebook, the per-class
        row counts for each value of a feature).

    Returns
    -------
    The biggest difference between the largest and smallest entry found in
    any single group.
    """
    spreads = []
    for counts in values:
        spreads.append(np.max(counts) - np.min(counts))
    return np.max(spreads)

In [10]:
# Pair each feature name with the score of its per-class count arrays.
scores = [
    (feature, score(class_counts))
    for feature, class_counts in res
]
display(scores)

[('award', 2), ('grade', 2), ('id', 1)]

In [11]:
# Rank features from highest to lowest score. sorted() is stable, and
# reverse=True keeps tied entries in their original relative order.
sorted_scores = sorted(
    scores,
    key=lambda pair: pair[1],
    reverse=True,
)
display(sorted_scores)

[('award', 2), ('grade', 2), ('id', 1)]

In [12]:
# Keep the names of the two top-ranked features. The numeric score is not
# needed here, so it is bound to `_` rather than to a name such as `score`,
# which would shadow the scoring function.
filtered = [col for (col, _) in sorted_scores[:2]]
display(filtered)

['award', 'grade']

In [13]:
# Restrict the frame to the selected feature columns; every row is kept.
new_df = df[filtered]
display(new_df)

Unnamed: 0,award,grade
0,gold,b
1,gold,b
2,silver,a
3,bronze,c
4,bronze,a
