# Pandas: Missing Data, GroupBy, Merge/Join/Concat, Useful Ops (with solutions)

In [None]:

import numpy as np, pandas as pd


## Missing data: dropna / fillna

In [None]:

# Sample frame with gaps: A has one NaN, B has two, C is complete.
d = {
    "A": [1, 2, np.nan],
    "B": [5, np.nan, np.nan],
    "C": [1, 2, 3],
}
df = pd.DataFrame(data=d)
# Displayed in order: the raw frame, rows containing any NaN dropped,
# columns containing any NaN dropped, and rows kept only when they hold
# at least two non-NaN values.
(
    df,
    df.dropna(axis="index"),
    df.dropna(axis="columns"),
    df.dropna(thresh=2),
)


In [None]:

# Scalar fill: every NaN becomes the placeholder string. This must run
# before column A is overwritten below, so df_fill still reflects the gaps.
df_fill = df.fillna(value="FILL")
# Mean fill: overwrite column A of df, replacing its NaN with the column mean.
df["A"] = df["A"].fillna(value=df["A"].mean())
(df, df_fill)


## GroupBy & aggregation

In [None]:

# Sales records: two people per company.
data = {
    "Company": ["GOOG", "GOOG", "MSFT", "MSFT", "FB", "FB"],
    "Person": ["Sam", "Charlie", "Amy", "Vanessa", "Carl", "Sarah"],
    "Sales": [200, 120, 340, 124, 243, 350],
}
gdf = pd.DataFrame(data=data)
grp = gdf.groupby(by="Company")
# Displayed in order: per-company mean and sum of the numeric columns,
# followed by the per-company summary statistics.
(
    grp.mean(numeric_only=True),
    grp.sum(numeric_only=True),
    grp.describe(),
)


## Concatenation

In [None]:

# Two frames with the same columns and non-overlapping row labels.
f1 = pd.DataFrame(
    {"A": [1, 2, 3], "B": [4, 5, 6]},
    index=[0, 1, 2],
)
f2 = pd.DataFrame(
    {"A": [4, 5, 6], "B": [7, 8, 9]},
    index=[3, 4, 5],
)
# Stack f2 underneath f1 (row-wise concatenation).
pd.concat(objs=[f1, f2], axis="index")


## Merge (SQL-style) & Join (index-based)

In [None]:

# Two frames that share a "key" column holding the same four keys.
left = pd.DataFrame(
    {
        "key": ["K0", "K1", "K2", "K3"],
        "A": [1, 2, 3, 4],
        "B": [5, 6, 7, 8],
    }
)
right = pd.DataFrame(
    {
        "key": ["K0", "K1", "K2", "K3"],
        "C": [9, 10, 11, 12],
        "D": [13, 14, 15, 16],
    }
)
# Inner merge on the shared column: keeps only keys present in both frames.
left.merge(right, on="key", how="inner")


In [None]:

# Index-based join: move "key" into the index of each frame first,
# then inner-join the two frames on that index.
l = left.set_index(keys="key")
r = right.set_index(keys="key")
l.join(other=r, how="inner")


## Useful ops: unique / nunique / value_counts / apply / sort_values / pivot_table

In [None]:

# Small frame whose col2 contains one repeated value (444).
df2 = pd.DataFrame(
    {
        "col1": [1, 2, 2, 3],
        "col2": [444, 444, 555, 666],
        "col3": ["abc", "def", "ghi", "xyz"],
    }
)
# Displayed in order: the distinct values of col2, how many distinct values
# there are, and how often each value occurs.
(
    df2["col2"].unique(),
    df2["col2"].nunique(),
    df2["col2"].value_counts(),
)


In [None]:

# Element-wise apply: store the doubled col1 values in a new column.
df2["col1_x2"] = df2["col1"].apply(func=lambda value: value * 2)
# Show the frame ordered by col2 (ascending); df2 itself is not reordered.
df2.sort_values(by="col2")


In [None]:

# Pivot table: rows are indexed by the (A, B) pairs, the values of C become
# the columns, and the cells hold the D values.
df_pv = pd.DataFrame(
    data={
        "A": ["foo", "foo", "bar", "bar"],
        "B": [1, 2, 1, 2],
        "C": ["x", "y", "x", "y"],
        "D": [10, 20, 30, 40],
    }
)
df_pv.pivot_table(index=["A", "B"], columns="C", values="D")


## Exercises (solved)

In [None]:

# E1: Drop every column of e1 that contains at least one NaN
d = {
    "A": [1, 2, np.nan],
    "B": [np.nan, np.nan, 3],
    "C": [1, 2, 3],
}
e1 = pd.DataFrame(data=d)
sol_E1 = e1.dropna(axis="columns")
sol_E1


In [None]:

# E2: Total Sales per company, then pick out the entry labelled 'FB'
val_FB = (
    gdf.groupby(by="Company")["Sales"]
    .sum()
    .loc["FB"]
)
val_FB


In [None]:

# E3: Merge left with right on 'key', then order the rows by D, largest first
merged = left.merge(right, on="key").sort_values(by="D", ascending=False)
merged
