In [1]:
!pip install altair




In [2]:
import pandas as pd
import numpy as np
import os,json
import math

import altair as alt
alt.renderers.enable('default')
alt.data_transformers.enable('json')

DataTransformerRegistry.enable('json')

# Datasets Transformation

## Getting Dataset for Character Distribution

In [3]:
df_info = pd.read_csv("Data/superhero_info - superhero_info(1).csv")

In [4]:
## separate 'Hero|Publisher'column into "Hero"and "publisher"columns and save the 2 new columns into the dataframe
df_info[['Hero','Publisher']] = df_info['Hero|Publisher'].str.split('|',expand=True)
#drop the original column
df_info = df_info.drop(columns=['Hero|Publisher'])

#covert the the string column(measurements) into actual dictionaries
## use .str.replace to replace all single quotes
df_info['Measurements'] = df_info['Measurements'].str.replace("'",'"')
## Apply the json.loads to the full column
df_info['Measurements'] = df_info['Measurements'].apply(json.loads)


#unpack the measurements column of dictionaries into separate columns
Hei_wei = df_info['Measurements'].apply(pd.Series)


#add the Hei_wei dataframe with the original dataframe and drop the measurements column
df_info = pd.concat((df_info, Hei_wei), axis = 1)
df_info = df_info.drop(columns = ['Measurements'])

#height column
df_info[['Height(cm)','cm']] = df_info['Height'].str.split(' ',expand=True)

df_info = df_info.drop(columns=['Height'])
df_info = df_info.drop(columns=['cm'])

#weight column
df_info[['Weight(kg)','kg']] = df_info['Weight'].str.split(' ',expand=True)

df_info = df_info.drop(columns=['Weight'])
df_info = df_info.drop(columns=['kg'])

df_info = df_info.reindex(columns=['Hero', 'Publisher', 'Gender', 'Eye color', 'Race', 'Hair color', 'Height(cm)', 'Weight(kg)','Skin color', 'Alignment'])

df_info['Weight(kg)'] = pd.to_numeric(df_info['Weight(kg)'])
df_info['Height(cm)'] = pd.to_numeric(df_info['Height(cm)'])


# Define the thresholds for each category
threshold_very_high = 300
threshold_high = 200
threshold_medium = 100

# Encode the "Height(cm)" column into categories
df_info['Height Category'] = pd.cut(df_info['Height(cm)'],
                                    bins=[0, threshold_medium, threshold_high, threshold_very_high, float('inf')],
                                    labels=['short', 'medium', 'high', 'very high'])


# Define the thresholds for each category
threshold_very_heavy = 200
threshold_heavy = 100
threshold_medium = 50

# Encode the "Weight(kg)" column into categories
df_info['Weight Category'] = pd.cut(df_info['Weight(kg)'],
                                    bins=[0, threshold_medium, threshold_heavy, threshold_very_heavy, float('inf')],
                                    labels=['light', 'medium', 'heavy', 'very heavy'])

df_info = df_info.drop(columns=['Height(cm)', 'Weight(kg)'])

In [5]:
df_info.head()

Unnamed: 0,Hero,Publisher,Gender,Eye color,Race,Hair color,Skin color,Alignment,Height Category,Weight Category
0,A-Bomb,Marvel Comics,Male,yellow,Human,No Hair,Unknown,good,high,very heavy
1,Abe Sapien,Dark Horse Comics,Male,blue,Icthyo Sapien,No Hair,blue,good,medium,medium
2,Abin Sur,DC Comics,Male,blue,Ungaran,No Hair,red,good,medium,medium
3,Abomination,Marvel Comics,Male,green,Human / Radiation,No Hair,Unknown,bad,high,very heavy
4,Absorbing Man,Marvel Comics,Male,blue,Human,No Hair,Unknown,bad,medium,heavy


## Getting Data for Superpower Distribution and Publisher Analysis

In [6]:
df_power = pd.read_csv('Data/superhero_powers - superhero_powers(1).csv')

In [7]:
#transfor string of powers to list of power
def Convert(string):
    li = list(string.split(","))
    return li

df_power['Powers_split'] = df_power['Powers'].apply(Convert)

## exploding the column of lists
exploded = df_power.explode('Powers_split')

## saving the unique values from the exploded column
cols_to_make = exploded['Powers_split'].dropna().unique()

for col in cols_to_make:
    df_power[col] = df_power['Powers'].str.contains(col)
df_power = df_power.drop(columns = ['Powers', 'Powers_split'])

df_power = df_power.set_index('hero_names')
df_power = df_power.reindex(index=df_info['Hero'])
df_power = df_power.reset_index()


  df_power[col] = df_power['Powers'].str.contains(col)
  df_power[col] = df_power['Powers'].str.contains(col)
  df_power[col] = df_power['Powers'].str.contains(col)
  df_power[col] = df_power['Powers'].str.contains(col)
  df_power[col] = df_power['Powers'].str.contains(col)
  df_power[col] = df_power['Powers'].str.contains(col)
  df_power[col] = df_power['Powers'].str.contains(col)
  df_power[col] = df_power['Powers'].str.contains(col)
  df_power[col] = df_power['Powers'].str.contains(col)
  df_power[col] = df_power['Powers'].str.contains(col)
  df_power[col] = df_power['Powers'].str.contains(col)
  df_power[col] = df_power['Powers'].str.contains(col)
  df_power[col] = df_power['Powers'].str.contains(col)
  df_power[col] = df_power['Powers'].str.contains(col)
  df_power[col] = df_power['Powers'].str.contains(col)
  df_power[col] = df_power['Powers'].str.contains(col)
  df_power[col] = df_power['Powers'].str.contains(col)
  df_power[col] = df_power['Powers'].str.contains(col)
  df_power

In [8]:
cols_to_remove = ['Gender', 'Eye color', 'Race', 'Hair color', 'Alignment', 'Skin color', 'Height Category', 'Weight Category' ]
df_to_combine = df_info.drop(columns=cols_to_remove)


In [9]:
final_df = pd.merge(df_to_combine, df_power, left_on="Hero", right_on="Hero", how="inner")


In [10]:
#drop the super powers with 0 counts
superpower_counts = final_df.iloc[:, 4:].sum()
zero_count_superpowers = superpower_counts[superpower_counts == 0].index
visual_df = final_df.drop(columns=zero_count_superpowers)

In [11]:
visual_df.head()

Unnamed: 0,Hero,Publisher,Agility,Super Strength,Stamina,Super Speed,Accelerated Healing,Durability,Longevity,Camouflage,...,Photographic Reflexes,Anti-Gravity,Power Nullifier,Weather Control,Hair Manipulation,Nova Force,Odin Force,Phoenix Force,Melting,Changing Armor
0,A-Bomb,Marvel Comics,False,True,True,False,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
1,Abe Sapien,Dark Horse Comics,True,True,True,False,True,True,True,False,...,False,False,False,False,False,False,False,False,False,False
2,Abin Sur,DC Comics,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Abomination,Marvel Comics,False,True,True,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Absorbing Man,Marvel Comics,False,True,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False


# Getting Visualizations

## Distribution of Characters

In [12]:
def plot_distribution(data, column):
    sorted_values = data[column].value_counts().reset_index().sort_values(column, ascending=False)
    
    chart = alt.Chart(sorted_values).mark_bar().encode(
        x=alt.X('index', sort=alt.EncodingSortField(field=column, op='count', order='descending')),
        y=column + ':Q',
        tooltip=[alt.Tooltip('index'), alt.Tooltip(column, title='Count')],
        color=alt.Color('index', scale=alt.Scale(scheme='category20'))
    ).properties(
        width=500,
        height=200,
        title=f'{column} Distribution'
    )
    
    return chart

In [13]:
#Gender distribution plot_distribution(df_info, "Gender")
gender_chart = plot_distribution(df_info, "Gender")
eyecolor_chart = plot_distribution(df_info, "Eye color")
race_chart = plot_distribution(df_info, "Race")
haircolor_chart = plot_distribution(df_info, "Hair color")
skincolor_chart = plot_distribution(df_info, "Skin color")
height_chart = plot_distribution(df_info, "Height Category")
weight_chart = plot_distribution(df_info, "Weight Category")
alignment_chart = plot_distribution(df_info, "Alignment")

In [14]:
# Concatenate the charts vertically
row1 = alt.hconcat(gender_chart, eyecolor_chart, race_chart)
row2 = alt.hconcat(haircolor_chart, skincolor_chart, alignment_chart)
row3 = alt.hconcat(weight_chart, height_chart)

# Concatenate the rows horizontally
grid = alt.vconcat(row1, row2, row3)
grid.properties(
        # add a title
        title={
            "text": ["Character Distribution"],
            "subtitle": ["Gender, Eye Color, Race, Hair Color, Skin Color, Height, Weight, Alignment"],
            "fontSize": 30
        }
        
    
    ).configure(
    background='#DCDCDC',
).configure_axis(
    labelFontSize=10,
    labelFont='Courier',
    titleFontSize=20,
    titleFont='Helvetica',
    gridColor='white'
)

## Distribution of Superpowers

In [15]:
# Remove irrelevant columns
df1 = visual_df.drop(['Hero', 'Publisher'], axis=1)

# Calculate the frequency of each superpower
superpower_counts = df1.sum().reset_index()
superpower_counts.columns = ['Superpower', 'Count']
# Sort the superpowers by frequency and reset the index
sorted_superpowers = superpower_counts.sort_values('Count', ascending=False).reset_index(drop=True)

In [16]:
# Define the count bins
bins = [0, 5, 25, 100, float('inf')]
labels = ['Below 5', 'Between 5-25', 'Between 25-100', 'Above 100']

# Assign the count bins to a new column
superpower_counts['Count Category'] = pd.cut(superpower_counts['Count'], bins=bins, labels=labels)

# Split the superpowers into four parts based on the count category
superpowers_part1 = superpower_counts[superpower_counts['Count Category'] == 'Above 100']
superpowers_part2 = superpower_counts[superpower_counts['Count Category'] == 'Between 25-100']
superpowers_part3 = superpower_counts[superpower_counts['Count Category'] == 'Between 5-25']
superpowers_part4 = superpower_counts[superpower_counts['Count Category'] == 'Below 5']


In [17]:
# Create the charts for each part of superpowers
chart_part1 = alt.Chart(superpowers_part1).mark_bar().encode(
    x='Count:Q',
    y=alt.Y('Superpower:N', sort='-x'),
    color=alt.Color('Superpower:N', legend=None)
).properties(
    width=300,
    height=600,
    title='Distribution of Superpowers (Above 100)'
)

chart_part2 = alt.Chart(superpowers_part2).mark_bar().encode(
    x='Count:Q',
    y=alt.Y('Superpower:N', sort='-x'),
    color=alt.Color('Superpower:N', legend=None)
).properties(
    width=300,
    height=600,
    title='Distribution of Superpowers (Between 25-100)'
)

chart_part3 = alt.Chart(superpowers_part3).mark_bar().encode(
    x='Count:Q',
    y=alt.Y('Superpower:N', sort='-x'),
    color=alt.Color('Superpower:N', legend=None)
).properties(
    width=300,
    height=600,
    title='Distribution of Superpowers (Between 5-25)'
)

chart_part4 = alt.Chart(superpowers_part4).mark_bar().encode(
    x='Count:Q',
    y=alt.Y('Superpower:N', sort='-x'),
    color=alt.Color('Superpower:N', legend=None)
).properties(
    width=300,
    height=600,
    title='Distribution of Superpowers (Below 5)'
)

# Combine the charts using facets
row1 = alt.hconcat(chart_part1, chart_part2)
row2 = alt.hconcat(chart_part3, chart_part4)

combined_chart = alt.vconcat(row1, row2)

# Display the combined chart
combined_chart.properties(
        # add a title
        title={
            "text": ["Superpower Distribution"],
            "fontSize": 30
        }
        
    
    ).configure(
    background='#DCDCDC',
).configure_axis(
    labelFontSize=10,
    labelFont='Courier',
    titleFontSize=20,
    titleFont='Helvetica',
    gridColor='white'
)

## Publisher Analysis

In [18]:
# Count the number of heroes for each publisher
heroes_per_publisher = visual_df.groupby('Publisher')['Hero'].nunique().reset_index()
heroes_per_publisher = heroes_per_publisher.sort_values('Hero', ascending=False).reset_index(drop=True)
heroes_per_publisher = heroes_per_publisher.rename(columns={'Hero': 'Superhero Counts'})


In [19]:

chart2 = alt.Chart(heroes_per_publisher).mark_bar().encode(
    y=alt.Y('Publisher:N', sort='-x', axis=None),
    x='Superhero Counts:Q',
    tooltip=['Publisher', 'Superhero Counts'],
    color=alt.Color('Publisher:N', legend=None)
).properties(
    width=400,
    height=300
    
)


# Add text labels to the bar chart
text = chart2.mark_text(
    align='left',
    baseline='middle',
    dx=3  
).encode(
    text='Superhero Counts:Q'
)


chart2 = chart2 + text

In [20]:
 #Count the number of unique superpowers for each publisher
superpower_counts = visual_df.groupby('Publisher').nunique().iloc[:, 4:].sum(axis=1)

# Create a DataFrame with the publisher names and superpower counts
publisher_counts = pd.DataFrame({'Publisher': superpower_counts.index, 'Superpower Count': superpower_counts.values})

# Sort the DataFrame by superpower count in descending order
publisher_counts = publisher_counts.sort_values('Superpower Count', ascending=False)


In [21]:
# Create the bar chart using Altair
chart3 = alt.Chart(publisher_counts).mark_bar().encode(
    x='Superpower Count:Q',
    y=alt.Y('Publisher:N', sort='-x'),
    color=alt.Color('Publisher:N', legend=None),
    tooltip=['Publisher', 'Superpower Count']
)


text = chart3.mark_text(
    align='left',
    baseline='middle',
    dx=3
).encode(
    text='Superpower Count:Q'
)

chart3 = (chart3 + text).properties(
    width=400,
    height=300
    
)

In [22]:
combined_data = heroes_per_publisher.merge(publisher_counts, on='Publisher')

melted_data = combined_data.melt(id_vars='Publisher', value_vars=['Superhero Counts', 'Superpower Count'], var_name='Variable', value_name='Value')


In [23]:
bar_chart = alt.Chart(melted_data).mark_bar().encode(
    x=alt.X('Variable:N', axis=None),
    y=alt.Y('Value:Q', axis=alt.Axis(title='Superheroes & Superpowers (count)')),
    color='Variable:N',
    tooltip=['Variable', 'Value']
).properties(
    width=80,
    height=180
).facet(
    column='Publisher',
    spacing=0
)


In [24]:
Allcharts = (chart3 | chart2) & bar_chart
Allcharts.properties(
        # add a title
        title={
            "text": ["Publisher Analysis"],
            "subtitle": ["Based on the Count of Superpowers and Superheros"],
            "fontSize": 30
        }
        
    
    ).configure(
    background='#DCDCDC',
).configure_axis(
    labelFontSize=10,
    labelFont='Courier',
    titleFontSize=20,
    titleFont='Helvetica',
    gridColor='white'
)