In [119]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go 

In [120]:
# Load the dataset from the sibling "data" directory (path is relative to the notebook).
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which
# suggests the CSV was written together with its index — consider `index_col=0`
# if no later cell relies on that column.
config_path = os.path.join(os.pardir, "data", "final_dataset.csv")
data = pd.read_csv(filepath_or_buffer=config_path)

In [121]:
# Preview the first five rows to check that the columns were parsed as expected
data.head(n=5)

Unnamed: 0,Breed,type,score,popularity ranking,size,intelligence,congential ailments,score for kids,size.1,$LIFETIME COST,...,LONGEVITY(YEARS),NUMBER OF GENETIC AILMENTS,GENETIC AILMENTS,PURCHASE PRICE,FOOD COSTS PER YEAR,GROOMING FREQUNCY,SUITABILITY FOR CHILDREN,origin_de,Breed_de,type_de
0,Border Terrier,terrier,3.61,61,1,Above average,none,4.99,small,"$22,638",...,14.0,0,none,$833,$324,Once a week,1,Vereinigtes Königreich,Border Terrier,Terrier
1,Cairn Terrier,terrier,3.53,48,1,Above average,"'lion jaw', heart problems",4.91,small,"$21,992",...,13.84,2,"'lion jaw', heart problems",$435,$324,Once a week,1,Schottland,Cairn Terrier,Terrier
2,Siberian Husky,working,3.22,16,2,Average,none,4.72,medium,"$22,049",...,12.58,0,none,$650,$466,Once in a few weeks,1,Sibirien,Sibirischer Husky,Gebrauchshund
3,Welsh Springer Spaniel,sporting,3.34,81,2,Above average,hip problems,4.71,medium,"$20,224",...,12.49,1,hip problems,$750,$324,Once a week,1,Wales,Walisischer Springer Spaniel,Jagdhund
4,English Cocker Spaniel,sporting,3.33,51,2,Excellent,none,4.7,medium,"$18,993",...,11.66,0,none,$800,$324,Once a week,1,England,Englischer Cocker Spaniel,Jagdhund


# Lebenserwartung vs Popularität

In [122]:
# Pearson correlation (the default of Series.corr) between life expectancy and
# popularity rank, plus the coefficient of determination r².
longevity_years = data['LONGEVITY(YEARS)']
popularity_rank = data['popularity ranking']

correlation = longevity_years.corr(popularity_rank)
bestimmtheit = correlation ** 2
print(f"Korrelation: {correlation:.3f}, Bestimmtheitsmaß {bestimmtheit}")

Korrelation: 0.060, Bestimmtheitsmaß 0.003575555568841845


In [123]:
# Scatter plot: life expectancy (x) against popularity rank (y), one marker per breed.
fig = px.scatter(
    data,
    x='LONGEVITY(YEARS)',
    y='popularity ranking',
    hover_name='Breed',
    labels={"LONGEVITY(YEARS)": "Lebenserwartung (Jahre)", "popularity ranking": "Popularitäts-Ranking (Platz 1-87)"},
    title='Zusammenhang von Lebenserwartung und Popularität',
    color='popularity ranking',
    color_continuous_scale='Viridis_r'
)

# Shared typography; the title is larger and horizontally centred
fig.update_layout(
    font=dict(
        family="Trebuchet MS",
        size=15,
        color="#2c3272",
    ),
    title_font=dict(
        family="Trebuchet MS",
        size=25,
        color="#2c3272"
    ),
    title_x=0.5,
    title_xanchor='center'
)

# White outline around the markers
fig.update_traces(
    marker=dict(
        size=10,
        line=dict(
            color="white",  # set explicitly; only the width was configured before
            width=1         # outline width in pixels
        )
    )
)

# One tick per year, padded by one year on both sides of the observed range
fig.update_xaxes(
    tickmode="linear",
    dtick=1,
    range=[int(data["LONGEVITY(YEARS)"].min()) - 1, int(data["LONGEVITY(YEARS)"].max()) + 1],
)

# Reversed axis puts rank 1 at the top. With tickmode='array' the ticks come
# from tickvals alone, so no dtick is passed.
fig.update_yaxes(
    autorange='reversed',
    tickmode='array',
    tickvals=[1, 10, 20, 30, 40, 50, 60, 70, 80]
)

# Colour encodes the same column as the y axis, so the colour bar is hidden
fig.update_coloraxes(showscale=False)

# Correlation summary box anchored to the bottom-right corner of the plot area
fig.add_annotation(
    text=f"Korrelation: {correlation:.4f}<br>Bestimmtheitsmaß: {bestimmtheit:.4f}",
    x=1.0,  # position relative to the plot width (0-1)
    y=0.02,  # position relative to the plot height (0-1)
    xref="paper",
    yref="paper",
    showarrow=False,
    font=dict(
        family="Trebuchet MS",
        size=14,
        color="#2c3272"
    ),
    bgcolor="rgba(255,255,255,0.8)",  # semi-transparent white background
    bordercolor="#2c3272",
    borderwidth=1,
    align="right",
    xanchor="right",  # the text box is anchored by its right edge
    yanchor="bottom"  # the text box is anchored by its bottom edge
)

fig.show()

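Lebenserwartung und Beliebtheit stehen statistisch in praktisch keinem Zusammenhang miteinander: Die Korrelation von 0.060 liegt noch unter der Schwelle für einen schwachen Zusammenhang (±0.1–0.3 = schwacher Zusammenhang,
±0.3–0.5 = mittlerer Zusammenhang).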

# Hundegruppe vs Popularität

In [124]:

# Scatter plot: dog group (x, categorical) against popularity rank (y), one marker per breed.
fig = px.scatter(
    data,
    x='type',
    y='popularity ranking',
    hover_name='Breed',
    labels={"type": "Hundegruppe", "popularity ranking": "Popularitäts-Ranking (Platz 1-87)"},
    title='Zusammenhang von Hundegruppe und Popularität',
    color='popularity ranking',
    color_continuous_scale='Viridis_r'
)

# Shared typography; the title is larger and horizontally centred
fig.update_layout(
    font=dict(
        family="Trebuchet MS",
        size=15,
        color="#2c3272",
    ),
    title_font=dict(
        family="Trebuchet MS",
        size=25,
        color="#2c3272"
    ),
    title_x=0.5,
    title_xanchor='center'
)

# White outline around the markers
fig.update_traces(
    marker=dict(
        size=10,
        line=dict(
            color="white",  # set explicitly; only the width was configured before
            width=1         # outline width in pixels
        )
    )
)

# Reversed axis puts rank 1 at the top. With tickmode='array' the ticks come
# from tickvals alone, so no dtick is passed.
fig.update_yaxes(
    autorange='reversed',
    tickmode='array',
    tickvals=[1, 10, 20, 30, 40, 50, 60, 70, 80]
)

# Colour encodes the same column as the y axis, so the colour bar is hidden
fig.update_coloraxes(showscale=False)


fig.show()

# Krankheiten

In [125]:
# krankheiten in hover

# vielleicht ein dropdown für je eine hundeart und dann als darstellung die krankheit als print?

In [126]:
# List the distinct raw ailment strings (in order of first appearance)
pd.unique(data['GENETIC AILMENTS'])

array(['none', "'lion jaw', heart problems  ", 'hip problems',
       'eye, skin problems', 'dry eye',
       'hip, eye, skin problems; enzyme deficiency',
       'eye problems, deafness, skin + heart problems, blood clotting disorders',
       "'lion jaw', 'dry eye', skin problems", 'elbows, hips, eyes',
       'hip, eye problems', 'eye problems', 'elbows, hips, eyes, heart ',
       'cataracts, hair loss, heart, eye, blood clotting disorders',
       'no data', 'deafness, hip problems', "'dry eye'",
       'liver, sinus problems', 'heart problems',
       'eye problems, deafness', 'breathing problems',
       'liver, eye problems', 'heart, spinal problems',
       'deafness, urinary stones', 'epilepsy, eye problems',
       'blood, skin disorders', 'fatal stomach bloat, skin disorder',
       'cataracts + other eye problems, connective tissue, nerves, kidneys, spine, blood clotting disorders',
       'blood vessel disorders', 'skin, spinal problems', 'knee problems',
       'heart, s

In [127]:
def simplify_ailments(ailment_string):
    """Map a raw ailment description to one coarse category label.

    The text is trimmed and lower-cased once, then checked against the
    keywords in a fixed priority order (eye, hip, heart, skin); the first
    keyword found decides the category, so a description mentioning several
    ailments is still assigned exactly one label.

    Args:
        ailment_string: Raw description such as ``'hip, eye problems'``.
            Missing values (``None``, NaN or any other non-string) are accepted.

    Returns:
        One of ``'Keine/Unbekannt'``, ``'Augenprobleme'``, ``'Hüftprobleme'``,
        ``'Herzprobleme'``, ``'Hautprobleme'`` or ``'Andere'``.
    """
    # Missing cells arrive as NaN (a float) or None; they have no .lower()
    # and carry no ailment information.
    if not isinstance(ailment_string, str):
        return 'Keine/Unbekannt'

    # Normalise once: the raw data contains trailing whitespace, so the
    # sentinel comparison below must not depend on exact spelling.
    normalized = ailment_string.strip().lower()
    if normalized in ('', 'none', 'no data'):
        return 'Keine/Unbekannt'

    # Order matters: earlier keywords win when several are present.
    keyword_to_category = (
        ('eye', 'Augenprobleme'),
        ('hip', 'Hüftprobleme'),
        ('heart', 'Herzprobleme'),
        ('skin', 'Hautprobleme'),
    )
    for keyword, category in keyword_to_category:
        if keyword in normalized:
            return category
    return 'Andere'

# Assign every breed a single coarse ailment category
ailment_categories = data['GENETIC AILMENTS'].map(simplify_ailments)
data['Simplified_Ailments'] = ailment_categories

# Show how many breeds fall into each category
print(data['Simplified_Ailments'].value_counts())

Simplified_Ailments
Hüftprobleme       24
Keine/Unbekannt    23
Augenprobleme      21
Herzprobleme        8
Andere              7
Hautprobleme        4
Name: count, dtype: int64


In [128]:
# Number of breeds per simplified ailment category
ailment_counts = data['Simplified_Ailments'].value_counts()

# Pie chart with one slice per category
fig = px.pie(
    names=ailment_counts.index,
    values=ailment_counts.values,
    title='Verteilung der genetischen Krankheiten bei Hunderassen',
    template='plotly_dark',
)

# Typography: white Verdana on the dark template, centred title
white_verdana = dict(family="Verdana", color="white")
fig.update_layout(
    font=dict(size=14, **white_verdana),
    title_font=dict(size=20, **white_verdana),
    title_x=0.5,
    title_xanchor='center',
)

# Slices show percentage and label; the hover box adds the absolute count
pie_hover = (
    '<b>%{label}</b><br>'
    'Anzahl: %{value}<br>'
    'Anteil: %{percent}<br>'
    '<extra></extra>'
)
fig.update_traces(
    textposition='inside',
    textinfo='percent+label',
    hovertemplate=pie_hover,
    marker=dict(line=dict(color='white', width=2)),
)

fig.show()

In [129]:
# Breeds per simplified ailment category; value_counts() returns them most frequent first
ailment_counts = data['Simplified_Ailments'].value_counts()
total = len(data)

# Share of all breeds per category in percent, used for the hover text and the printed summary
category_shares = [(count / total) * 100 for count in ailment_counts.values]

# Bar chart with one bar per category, coloured by its count
fig = px.bar(
    x=ailment_counts.index,
    y=ailment_counts.values,
    color=ailment_counts.values,
    color_continuous_scale='Viridis',
    labels={'x': 'Krankheitskategorie', 'y': 'Anzahl Rassen'},
    title='Verteilung der genetischen Krankheiten bei Hunderassen',
    template='plotly_dark',
)

# Typography: white Verdana on the dark template, centred title, no legend
white_verdana = dict(family="Verdana", color="white")
fig.update_layout(
    font=dict(size=14, **white_verdana),
    title_font=dict(size=20, **white_verdana),
    title_x=0.5,
    title_xanchor='center',
    showlegend=False,
)

# White bar outline; the hover box shows the count and the share passed in via customdata
bar_hover = (
    '<b>%{x}</b><br>'
    'Anzahl: %{y} Rassen<br>'
    'Anteil: %{customdata:.1f}%<br>'
    '<extra></extra>'
)
fig.update_traces(
    marker=dict(line=dict(color='white', width=1)),
    hovertemplate=bar_hover,
    customdata=category_shares,
)

# Axis styling: slanted category labels on x, white fonts on both axes
axis_title_font = dict(size=16, color="white")
axis_tick_font = dict(size=12, color="white")
fig.update_xaxes(tickangle=45, title_font=axis_title_font, tickfont=axis_tick_font)
fig.update_yaxes(title_font=axis_title_font, tickfont=axis_tick_font)

# Hide the colour bar
fig.update_coloraxes(showscale=False)

fig.show()

# Print the same distribution as plain text
print("=== VERTEILUNG DER KRANKHEITEN ===")
for category, count, percentage in zip(ailment_counts.index, ailment_counts.values, category_shares):
    print(f"{category}: {count} Rassen ({percentage:.1f}%)")

=== VERTEILUNG DER KRANKHEITEN ===
Hüftprobleme: 24 Rassen (27.6%)
Keine/Unbekannt: 23 Rassen (26.4%)
Augenprobleme: 21 Rassen (24.1%)
Herzprobleme: 8 Rassen (9.2%)
Andere: 7 Rassen (8.0%)
Hautprobleme: 4 Rassen (4.6%)


In [130]:
# Derive the simplified ailment category for every breed (safe to re-run: the
# column is always rebuilt from the raw 'GENETIC AILMENTS' column)
data['Simplified_Ailments'] = data['GENETIC AILMENTS'].apply(simplify_ailments)

# Scatter plot: ailment category (x) against popularity rank (y), one marker per breed
fig = px.scatter(
    data,
    x='Simplified_Ailments',  # the categorised ailments
    y='popularity ranking',
    hover_name='Breed',
    labels={
        "Simplified_Ailments": "Häufige Krankheiten", 
        "popularity ranking": "Popularitäts-Ranking (1 = beliebtester)"
    },
    title='Zusammenhang zwischen Hundekrankheiten und Rassenpopularität',
    # Colour by ailment category: a continuous colour on the rank produced no
    # legend entries, so the legend configured below never had anything to show.
    color='Simplified_Ailments',
    category_orders={"Simplified_Ailments": ["Keine/Unbekannt", "Augenprobleme", "Hüftprobleme", 
                                           "Herzprobleme", "Hautprobleme", "Andere"]}
)

# Layout: typography, centred title, explicit axis titles and legend title
fig.update_layout(
    font=dict(
        family="Trebuchet MS",
        size=12,
        color="#2c3272",
    ),
    title_font=dict(
        family="Trebuchet MS",
        size=20,
        color="#2c3272" 
    ),
    title_x=0.5,
    xaxis_title="Krankheitskategorie",
    yaxis_title="Popularitäts-Ranking (1-87)",
    showlegend=True,
    legend_title_text="Krankheitskategorie"
)

# Reversed y axis (higher numbers = less popular), with a tick every five ranks starting at 1
fig.update_yaxes(
    autorange='reversed',
    tickmode='array',
    tickvals=list(range(1, 88, 5))
)

# Marker styling: slightly transparent with a dark outline
fig.update_traces(
    marker=dict(
        size=12,
        opacity=0.7,
        line=dict(width=1, color='DarkSlateGrey')
    )
)

fig.show()

In [131]:
# Box plot of the popularity rank per simplified ailment category.
# The y axis is reversed so that smaller rank numbers appear at the top.
fig = px.box(
    data_frame=data,
    x='Simplified_Ailments',
    y='popularity ranking',
    color='Simplified_Ailments',
    title='Popularitäts-Ranking nach Krankheitskategorie',
).update_yaxes(autorange='reversed')
fig

In [132]:
# Violin plot (with an embedded box plot) of the popularity rank per simplified
# ailment category. The y axis is reversed so that smaller rank numbers appear at the top.
fig = px.violin(
    data_frame=data,
    x='Simplified_Ailments',
    y='popularity ranking',
    color='Simplified_Ailments',
    box=True,
    title='Verteilung der Popularität nach Krankheitskategorie',
).update_yaxes(autorange='reversed')
fig