From https://stackoverflow.com/a/69604744

In [1]:
import pandas as pd
# import markdown


Create a data frame with two columns, the second of which holds comma-separated lists of values, sometimes with duplicated entries.

Notice that "apple" appears twice in the second column of the first row.

In [2]:
# Sample data: each `fruit` cell is a single comma-separated string,
# and the first row deliberately repeats "apple".
data = pd.DataFrame(
    {
        'id': ["ab3e3", "psdds2", "pas13", "ccdf2", "dsda1"],
        'fruit': ["apple, orange, apple", "others", "dragon fruit, orange", "watermelon", "others"],
    }
)

# `df` is the working frame used by every cell below.
df = pd.DataFrame(data)
df


Unnamed: 0,id,fruit
0,ab3e3,"apple, orange, apple"
1,psdds2,others
2,pas13,"dragon fruit, orange"
3,ccdf2,watermelon
4,dsda1,others


# Using .get_dummies() with a delimiter

Expand column categories into individual columns with binary values. Because the values are binary, the duplicated "apple" in the first row is recorded only once.

In [3]:
# One 0/1 indicator column per distinct fruit, splitting each cell on ", ".
df["fruit"].str.get_dummies(sep=", ")


Unnamed: 0,apple,dragon fruit,orange,others,watermelon
0,1,0,1,0,0
1,0,0,0,1,0
2,0,1,1,0,0
3,0,0,0,0,1
4,0,0,0,1,0


In [4]:
# Column totals of the indicators: the number of rows that mention each fruit.
(
    df["fruit"]
    .str.get_dummies(sep=", ")
    .sum()
)


Unnamed: 0,0
apple,1
dragon fruit,1
orange,2
others,2
watermelon,1


Join original data frame with get_dummies.

In [5]:
# Index-aligned join: the original columns followed by the indicator columns.
df.join(other=df["fruit"].str.get_dummies(sep=", "))


Unnamed: 0,id,fruit,apple,dragon fruit,orange,others,watermelon
0,ab3e3,"apple, orange, apple",1,0,1,0,0
1,psdds2,others,0,0,0,1,0
2,pas13,"dragon fruit, orange",0,1,1,0,0
3,ccdf2,watermelon,0,0,0,0,1
4,dsda1,others,0,0,0,1,0


Drop "fruit" - version 1 (join first, then drop the column).

In [6]:
# Attach the indicator columns, then remove the raw "fruit" column.
(
    df
    .join(df["fruit"].str.get_dummies(sep=", "))
    .drop(columns="fruit")
)


Unnamed: 0,id,apple,dragon fruit,orange,others,watermelon
0,ab3e3,1,0,1,0,0
1,psdds2,0,0,0,1,0
2,pas13,0,1,1,0,0
3,ccdf2,0,0,0,0,1
4,dsda1,0,0,0,1,0


Drop "fruit" - version 2 (drop the column first, then join).

In [7]:
# Remove the raw "fruit" column first, then attach the indicator columns.
(
    df
    .drop(columns=["fruit"])
    .join(df["fruit"].str.get_dummies(sep=", "))
)


Unnamed: 0,id,apple,dragon fruit,orange,others,watermelon
0,ab3e3,1,0,1,0,0
1,psdds2,0,0,0,1,0
2,pas13,0,1,1,0,0
3,ccdf2,0,0,0,0,1
4,dsda1,0,0,0,1,0


# Using .explode()

In [8]:
# Turn each comma-separated string into a Python list of fruit names.
df["fruit"].str.split(pat=", ")


Unnamed: 0,fruit
0,"[apple, orange, apple]"
1,[others]
2,"[dragon fruit, orange]"
3,[watermelon]
4,[others]


In [12]:
# .assign() returns a new frame with the extra list-valued column; `df` is not modified.
df.assign(foobar=lambda frame: frame["fruit"].str.split(", "))


Unnamed: 0,id,fruit,foobar
0,ab3e3,"apple, orange, apple","[apple, orange, apple]"
1,psdds2,others,[others]
2,pas13,"dragon fruit, orange","[dragon fruit, orange]"
3,ccdf2,watermelon,[watermelon]
4,dsda1,others,[others]


In [None]:
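# The .assign() call above is almost the same as the in-place assignment below.
# The difference: .assign() returns a new frame and leaves `df` untouched,
# whereas this would add the column to `df` itself.
# df['foobar'] = df["fruit"].str.split(", ")
# df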

In [13]:
# Redisplay `df`: .assign() returns a new frame, so `df` still has only `id` and `fruit`.
df

Unnamed: 0,id,fruit
0,ab3e3,"apple, orange, apple"
1,psdds2,others
2,pas13,"dragon fruit, orange"
3,ccdf2,watermelon
4,dsda1,others


In [10]:
# Replace "fruit" with its list form, then emit one row per list element.
# The original row index is repeated for every element of the same row.
(
    df
    .assign(fruit=lambda frame: frame["fruit"].str.split(", "))
    .explode("fruit")
)


Unnamed: 0,id,fruit
0,ab3e3,apple
0,ab3e3,orange
0,ab3e3,apple
1,psdds2,others
2,pas13,dragon fruit
2,pas13,orange
3,ccdf2,watermelon
4,dsda1,others


In [11]:
# Long-format table: one row per (original row, fruit) occurrence, keeping only "fruit".
# The index still carries the original row number, which later cells rely on.
ct = (
    df
    .assign(fruit=lambda frame: frame["fruit"].str.split(", "))
    .explode("fruit")
    .drop(columns=["id"])
)
ct

Unnamed: 0,fruit
0,apple
0,orange
0,apple
1,others
2,dragon fruit
2,orange
3,watermelon
4,others


In [14]:
# Tally every fruit occurrence across all rows; repeats within one row count separately.
ct.value_counts()

Unnamed: 0_level_0,count
fruit,Unnamed: 1_level_1
apple,2
orange,2
others,2
dragon fruit,1
watermelon,1


In [20]:
# Same explode as before, but "id" is kept, so the counts are per (id, fruit) pair.
(
    df
    .assign(fruit=lambda frame: frame["fruit"].str.split(", "))
    .explode("fruit")
    .value_counts()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
id,fruit,Unnamed: 2_level_1
ab3e3,apple,2
ab3e3,orange,1
ccdf2,watermelon,1
dsda1,others,1
pas13,dragon fruit,1
pas13,orange,1
psdds2,others,1


# Using pd.crosstab()

In [15]:
# Rows are the original row numbers (the repeated index of `ct`), columns are fruits.
# Cells hold occurrence counts, so duplicates within a row are preserved.
pd.crosstab(index=ct.index, columns=ct["fruit"])


fruit,apple,dragon fruit,orange,others,watermelon
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2,0,1,0,0
1,0,0,0,1,0
2,0,1,1,0,0
3,0,0,0,0,1
4,0,0,0,1,0


In [16]:
# Same table, with the "fruit" label removed from the column axis.
(
    pd.crosstab(index=ct.index, columns=ct["fruit"])
    .rename_axis(columns=None)
)


Unnamed: 0_level_0,apple,dragon fruit,orange,others,watermelon
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2,0,1,0,0
1,0,0,0,1,0
2,0,1,1,0,0
3,0,0,0,0,1
4,0,0,0,1,0


In [17]:
# Same table, with the labels removed from both the column axis and the row axis.
(
    pd.crosstab(index=ct.index, columns=ct["fruit"])
    .rename_axis(index=None, columns=None)
)


Unnamed: 0,apple,dragon fruit,orange,others,watermelon
0,2,0,1,0,0
1,0,0,0,1,0
2,0,1,1,0,0
3,0,0,0,0,1
4,0,0,0,1,0


In [18]:
# Attach the per-row fruit counts to the original frame, aligned on the row index.
df.join(
    pd.crosstab(index=ct.index, columns=ct["fruit"])
    .rename_axis(index=None, columns=None)
)


Unnamed: 0,id,fruit,apple,dragon fruit,orange,others,watermelon
0,ab3e3,"apple, orange, apple",2,0,1,0,0
1,psdds2,others,0,0,0,1,0
2,pas13,"dragon fruit, orange",0,1,1,0,0
3,ccdf2,watermelon,0,0,0,0,1
4,dsda1,others,0,0,0,1,0
