In [2]:
# Import necessary libraries
import ast
import json

import numpy as np
import pandas as pd

In [3]:
# Load Datasets
# Published-CSV endpoints of the two Google Sheets used by this notebook.
superhero_info = "https://docs.google.com/spreadsheets/d/e/2PACX-1vS1ZstYLwFgwhZnqDsPjtnlHYhJp_cmW55J8JD5mym0seRsaem3px7QBtuFF0LiI7z1PLCkVKAkdO7J/pub?output=csv"
superhero_powers = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSzdWOBaXOoz52vPmCFV5idNlDBohLY1Lsbc1IfZIZQ7cV_aNB2wYBfhF49uE1TaO1B5MQCGWiNrFfd/pub?output=csv"

# Read both sheets in order: df1 <- superhero_info, df2 <- superhero_powers.
df1, df2 = (pd.read_csv(sheet_url) for sheet_url in (superhero_info, superhero_powers))

In [4]:
# Break the combined 'Hero|Publisher' text at the '|' character and store the
# resulting pieces as the new 'Hero' and 'Publisher' columns of df1.
hero_publisher_parts = df1['Hero|Publisher'].str.split('|', expand=True)
df1[['Hero', 'Publisher']] = hero_publisher_parts

# Data Preparation

In [5]:
# Convert each 'Measurements' entry from the string form of a dictionary into
# an actual dict. ast.literal_eval reads the literal exactly as written, so an
# apostrophe inside a value no longer corrupts the parse (blindly swapping
# every ' for " before json.loads would turn it into a stray quote).
# NOTE(review): assumes the column holds Python-style dict literals - confirm
# against the raw CSV.
df1['Measurements'] = df1['Measurements'].apply(ast.literal_eval)

In [6]:
# For both measurement keys, keep the text before the first space of the
# stored value and cast it to float, producing numeric 'Height' and 'Weight'
# columns on df1 (created in that order).
for field in ('Height', 'Weight'):
    leading_token = df1['Measurements'].apply(
        lambda entry, key=field: entry[key].split(' ')[0]
    )
    df1[field] = leading_token.astype(float)

In [7]:
# The combined 'Hero|Publisher' and raw 'Measurements' columns have been
# unpacked into their own columns above, so remove them from df1.
df1 = df1.drop(columns=['Hero|Publisher', 'Measurements'])

In [8]:
# Give df2's 'hero_names' column the name 'Hero' so both frames share the
# same key column name.
df2 = df2.rename(columns={'hero_names': 'Hero'})

In [9]:
# Combine hero info (df1) with hero powers (df2) by matching 'Hero' values.
# This is an inner join (the pandas default): only heroes present in both
# frames are kept.
df = pd.merge(left=df1, right=df2, on='Hero', how='inner')

In [10]:
# Expand the comma-separated 'Powers' text into one 0/1 indicator column per
# distinct power, then append those indicator columns to the right of df.
powers_df = df['Powers'].str.get_dummies(',')
df = pd.concat((df, powers_df), axis='columns')

In [11]:
# 'Powers' is now represented by the indicator columns, so remove the
# original text column.
df = df.drop(columns=['Powers'])

# Now to answer the questions:


# 1. Compare the average weight of superheroes who have Super Speed to those who do not.

In [13]:
# Split the heroes on the 'Super Speed' indicator column (1 = has the power,
# 0 = does not) and take the mean 'Weight' of each group.
speed_flag = df['Super Speed']
heroes_with_speed = df[speed_flag == 1]
heroes_without_speed = df[speed_flag == 0]

avg_weight_with_superspeed = heroes_with_speed['Weight'].mean()
avg_weight_without_superspeed = heroes_without_speed['Weight'].mean()

print(f'Average weight with Super Speed: {avg_weight_with_superspeed}')
print(f'Average weight without Super Speed: {avg_weight_without_superspeed}')

Average weight with Super Speed: 129.40404040404042
Average weight without Super Speed: 101.77358490566037


# 2. What is the average height of heroes for each publisher?

In [14]:
# Group the heroes by 'Publisher' and compute the mean 'Height' inside each
# group; the result is a Series indexed by publisher name.
heights_by_publisher = df.groupby('Publisher')['Height']
avg_height_per_publisher = heights_by_publisher.mean()
print(avg_height_per_publisher)

Publisher
DC Comics            181.923913
Dark Horse Comics    176.909091
George Lucas         159.600000
Image Comics         211.000000
Marvel Comics        191.546128
Shueisha             171.500000
Star Trek            181.500000
Team Epic TV         180.750000
Unknown              178.000000
Name: Height, dtype: float64
