In [1]:
import pandas as pd
from IPython.display import display, HTML
from itertools import combinations
from collections import Counter

In [2]:
file_path = r'C:\Users\Moritz\Desktop\DHBW CAS\Data & Web-Mining\Präsentation\pizza-daten.csv'
df = pd.read_csv(file_path)
print(df.head())

   Index                                        Komponenten  Unnamed: 2  \
0    1.0              Salami, Schinken, Pilze, Zwiebeln, Ei         NaN   
1    2.0                             Salami, Schinken, Mais         NaN   
2    3.0         Mais, Paprika, Zwiebeln, Zucchini, Tomaten         NaN   
3    4.0  Pilze, Brokkoli, Paprika, Zwiebeln, Oliven, Au...         NaN   
4    5.0                       Schinken, Pilze, Artischocke         NaN   

   Unnamed: 3  
0         NaN  
1         NaN  
2         NaN  
3         NaN  
4         NaN  


In [3]:
columns_to_remove = ['Index','Unnamed: 2','Unnamed: 3']
df.drop(columns=columns_to_remove, axis=1, inplace=True)

html = df.head(15).to_html()
display(HTML(html))

Unnamed: 0,Komponenten
0,"Salami, Schinken, Pilze, Zwiebeln, Ei"
1,"Salami, Schinken, Mais"
2,"Mais, Paprika, Zwiebeln, Zucchini, Tomaten"
3,"Pilze, Brokkoli, Paprika, Zwiebeln, Oliven, Aubergine, Sauce Hollandaise"
4,"Schinken, Pilze, Artischocke"
5,"Salami, Pilze, Brokkoli, Paprika, Zwiebeln, Tomaten"
6,"Pilze, Artischocke, Zwiebeln, Jalapenos/Peperoni, Aubergine"
7,"Salami, Zwiebeln, Tomaten"
8,"Rucola, Parma-Schinken, Parmesan"
9,"Salami, Schinken, Pilze, Jalapenos/Peperoni, Aubergine"


In [4]:
print(df['Komponenten'].dtype)

object


In [5]:
df.dropna(inplace=True)

In [6]:
df['Komponenten'] = df['Komponenten'].str.split(',')
df['Komponenten'] = df['Komponenten'].apply(lambda x: [obj.strip() for obj in x] if isinstance(x, list) else x)
html = df.head(15).to_html()
display(HTML(html))

Unnamed: 0,Komponenten
0,"[Salami, Schinken, Pilze, Zwiebeln, Ei]"
1,"[Salami, Schinken, Mais]"
2,"[Mais, Paprika, Zwiebeln, Zucchini, Tomaten]"
3,"[Pilze, Brokkoli, Paprika, Zwiebeln, Oliven, Aubergine, Sauce Hollandaise]"
4,"[Schinken, Pilze, Artischocke]"
5,"[Salami, Pilze, Brokkoli, Paprika, Zwiebeln, Tomaten]"
6,"[Pilze, Artischocke, Zwiebeln, Jalapenos/Peperoni, Aubergine]"
7,"[Salami, Zwiebeln, Tomaten]"
8,"[Rucola, Parma-Schinken, Parmesan]"
9,"[Salami, Schinken, Pilze, Jalapenos/Peperoni, Aubergine]"


In [7]:
unique_objects = set()
for obj_list in df['Komponenten']:
    unique_objects.update(obj_list)

for obj in unique_objects:
    df[obj] = df['Komponenten'].apply(lambda x: 1 if obj in x else 0)

df.drop('Komponenten', axis=1, inplace=True)
df.drop('',axis=1,inplace=True)

print(df.head(5))

   Jalapenos/Peperoni  Kapern  Pesto  Brokkoli  \
0                   0       0      0         0   
1                   0       0      0         0   
2                   0       0      0         0   
3                   0       0      0         1   
4                   0       0      0         0   

   italienische Kräuter (getrocknet)  Parmesan  Tomatensoße  Anchovis  \
0                                  0         0            0         0   
1                                  0         0            0         0   
2                                  0         0            0         0   
3                                  0         0            0         0   
4                                  0         0            0         0   

   Paprika  Artischocke  ...  Creme fraiche  Lachs  Sardellen  Ei  Zwiebeln  \
0        0            0  ...              0      0          0   1         1   
1        0            0  ...              0      0          0   0         0   
2        1            0

In [8]:
component_counts = df.sum().sort_values(ascending=False)

top_10_components = component_counts.head(10)

print(top_10_components)

Pilze                 92
Schinken              80
Zwiebeln              72
Salami                63
Mais                  52
Paprika               34
Oliven                34
Jalapenos/Peperoni    34
Rucola                32
Tomaten               28
dtype: int64


In [9]:
pair_counter = Counter()
for obj_list in df.index:
    present_components = df.columns[df.loc[obj_list] == 1]
    pairs = combinations(present_components, 2)
    pair_counter.update(pairs)

top_10_pairs = pair_counter.most_common(10)

print("\n10 am häufigsten kombinierte 2 Zutaten:")
for pair, count in top_10_pairs:
    print(f"{pair}: {count} times")


10 am häufigsten kombinierte 2 Zutaten:
('Pilze', 'Schinken'): 47 times
('Pilze', 'Zwiebeln'): 40 times
('Pilze', 'Salami'): 33 times
('Schinken', 'Salami'): 32 times
('Salami', 'Zwiebeln'): 28 times
('Pilze', 'Mais'): 28 times
('Schinken', 'Zwiebeln'): 27 times
('Paprika', 'Mais'): 22 times
('Mais', 'Zwiebeln'): 22 times
('Paprika', 'Pilze'): 21 times


In [10]:
pair_counter = Counter()
for obj_list in df.index:
    present_components = df.columns[df.loc[obj_list] == 1]
    pairs = combinations(present_components, 3)
    pair_counter.update(pairs)

top_10_pairs = pair_counter.most_common(10)

print("\n10 am häufigsten kombinierte 3 Zutaten:")
for pair, count in top_10_pairs:
    print(f"{pair}: {count} times")


10 am häufigsten kombinierte 3 Zutaten:
('Pilze', 'Schinken', 'Salami'): 21 times
('Pilze', 'Schinken', 'Zwiebeln'): 20 times
('Pilze', 'Salami', 'Zwiebeln'): 16 times
('Schinken', 'Salami', 'Zwiebeln'): 15 times
('Artischocke', 'Pilze', 'Schinken'): 13 times
('Pilze', 'Mais', 'Zwiebeln'): 13 times
('Paprika', 'Pilze', 'Zwiebeln'): 11 times
('Jalapenos/Peperoni', 'Schinken', 'Salami'): 11 times
('Paprika', 'Pilze', 'Mais'): 11 times
('Jalapenos/Peperoni', 'Schinken', 'Zwiebeln'): 11 times


In [11]:
print(df.columns.tolist())

['Jalapenos/Peperoni', 'Kapern', 'Pesto', 'Brokkoli', 'italienische Kräuter (getrocknet)', 'Parmesan', 'Tomatensoße', 'Anchovis', 'Paprika', 'Artischocke', 'Gorgonzola', 'Trüffel', 'Pilze', 'Oliven', 'Champignons', 'Kidneybohnen', 'Meeresfrüchte', 'Parma-Schinken', 'Rucola', 'Basilikum', 'Sauce Hollandaise', 'Peperoniwurst', 'Schinken', 'Getrocknete Tomaten', 'Knoblauch', 'Ananas', 'Spinat', 'Tomaten', 'Knobloch und leka sucuk', 'Zucchini', 'Mais', 'Schafskäse', 'Mozarella', 'Salami', 'Scharfe Salami', 'Aubergine', 'Veganer Käse', 'Parmaschinken', 'Creme fraiche', 'Lachs', 'Sardellen', 'Ei', 'Zwiebeln', 'Parmaschinken + Büffelmozzarella', 'Büffelmozzarella', 'Thunfisch', 'Mozzarella', 'Feta']


In [12]:
from prince import MCA

mca = MCA(n_components=10)
mca.fit(df)
mca_df = mca.transform(df)
print(mca_df.head())

          0         1         2         3         4         5         6  \
0 -0.183150 -0.146906 -0.112208 -0.116773 -0.060049 -0.110490 -0.015659   
1 -0.128150  0.127773  0.057340 -0.123959 -0.011220 -0.090604  0.034807   
2  0.412193  0.051745  0.018600  0.037129  0.111407  0.017317  0.057931   
3  0.790308 -0.297499  0.025602 -0.020691 -0.292220  0.126858  0.031645   
4 -0.152187  0.035373  0.053811 -0.112379 -0.048839 -0.092456  0.010950   

          7         8         9  
0  0.156674 -0.004990  0.213189  
1  0.019108 -0.117951 -0.027023  
2 -0.100033 -0.339748 -0.040487  
3 -0.410004  0.314677  0.545296  
4 -0.098196  0.177777 -0.013469  
