In [None]:
#installing packages
!pip install numpy==1.16.4
!pip install pandas==0.24.2
!pip install plotly==4.1.0
!pip install plotly-geo==1.0.0

!pip install scikit-learn==0.21.2
!pip install scipy==1.3.0
!pip install statsmodels

In [None]:
#For Cleaning and Exploratory Data Analysis
import pandas as pd
import numpy as np
import re
import statsmodels.api as sm
from scipy import stats

#For Plotting
#import seaborn as sns
#import matplotlib.pyplot as plt
#from matplotlib.pyplot import plot, show, draw, figure
#from mpl_toolkits.mplot3d import Axes3D

#For Plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

#For Dimensionality Reduction and K-Means
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy.cluster.vq import kmeans, whiten, vq
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

#For Cluster Analysis
#import math

#For Decision Tree
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.tree import DecisionTreeClassifier
#from sklearn import metrics
#from sklearn.model_selection import train_test_split
#from sklearn import tree
#from sklearn.tree import export_graphviz
#from sklearn.externals.six import StringIO  
#from IPython.display import Image  
#import pydotplus

#For Hierarchical Clustering
import plotly.figure_factory as ff
from scipy.cluster.hierarchy import linkage, dendrogram

#Settings
#import warnings
#warnings.filterwarnings("ignore")

In [None]:
#load the raw superhero table; the path is relative to the current working directory
df = pd.read_csv(filepath_or_buffer='superheroes.csv')

In [None]:
#drop unnecessary columns (selected by their position in the raw csv)
df_dropped = df.drop(df.columns[[1, 2, 4, 5, 12, 13, 14, 15, 16, 19, 20, 21, 22, 27, 28, 29, 30]], axis=1)

#placeholder for a missing overall_score ('-'): the mean of the column, calculated in excel beforehand.
#It must be a string, otherwise it cannot be fit into overall_score, which is still an object column at that point.
OVERALL_SCORE_MISSING = '15'
#placeholder for every overall_score that cannot be parsed as a number (the infinity scores)
OVERALL_SCORE_INFINITE = 1000


def extract_metric(column, unit):
    """Pull the measurement given in `unit` out of a mixed-unit text column.

    Parameters
    ----------
    column : pandas.Series
        Raw entries; every entry is converted to a string before matching.
    unit : str
        Unit suffix to look for, e.g. 'kg' or 'cm'.

    Returns
    -------
    pandas.Series
        For every entry the first number that is followed by `unit` (optional
        whitespace in between), typecast to numeric. Entries without such a
        number (e.g. '-' or NaN) become NaN.
    """
    #(?:\.\d+)? keeps the fractional part, so that '15.2 cm' is read as 15.2 and not as 2
    pattern = r'(\d+(?:\.\d+)?)\s*' + re.escape(unit)
    first_match = column.astype(str).str.extract(pattern, expand=False)
    return pd.to_numeric(first_match, errors='coerce')


#replace dashes in overall_score with the mean of the column
df_dropped['overall_score'] = df_dropped['overall_score'].replace(['-'], OVERALL_SCORE_MISSING)

#typecast overall_score into numeric (otherwise, no numerical methods can be applied to it);
#everything that cannot be parsed becomes NaN
df_dropped['overall_score'] = pd.to_numeric(df_dropped['overall_score'], errors='coerce')

#replace the NaNs that are left after typecasting (they come from infinity score values) with 1000
df_dropped['overall_score'] = df_dropped['overall_score'].fillna(value=OVERALL_SCORE_INFINITE)

#keep only the metric value of height and weight (drops the US Standard Units) and typecast it into numeric;
#dashes and other entries without a metric value end up as NaN
df_dropped['weight'] = extract_metric(df_dropped['weight'], 'kg')
df_dropped['height'] = extract_metric(df_dropped['height'], 'cm')

#rename height and weight columns into height_cm and weight_kg
df_clean = df_dropped.rename(columns = {'height':'height_cm','weight':'weight_kg'})

In [None]:
#Can overall_score be explained by one of the other score variables, or by a sum of them?

#sum_scores adds up the six individual scores of a superhero; overall_score is left out on purpose
df_clean['sum_scores'] = (
    df_clean['intelligence_score']
    + df_clean['strength_score']
    + df_clean['speed_score']
    + df_clean['durability_score']
    + df_clean['power_score']
    + df_clean['combat_score']
)

#subset dataframe that holds nothing but the score variables
heatmap_columns = ['intelligence_score', 'strength_score', 'speed_score', 'durability_score',
                   'power_score', 'combat_score', 'overall_score', 'sum_scores']
df_scores = df_clean[heatmap_columns]

#pairwise Pearson correlations between the score variables
corr_matrix = df_scores.corr(method='pearson')

#Plotly Code
#axis labels, listed in the same order as heatmap_columns
heatmap_labels = ['Intelligence', 'Strength', 'Speed', 'Durability', 'Power',
                  'Combat', 'Overall score', 'Sum of scores']
heatmap_trace = go.Heatmap(z=corr_matrix, x=heatmap_labels, y=heatmap_labels)
fig = go.Figure(data=heatmap_trace)
fig.update_layout(title_text="Pearson's Correlations of Score Variables of Superheroes")
fig.show()

In [None]:
#Show distribution of alignments among superheroes

#Plotly code
alignment = ['Good','Bad','Neutral']
#count the characters per alignment from the cleaned data instead of typing the numbers in by hand.
#value_counts() orders by frequency, so reindex pins the counts to the label order above
#and yields 0 for an alignment that does not occur.
alignment_counts = df_clean['alignment'].value_counts().reindex(alignment, fill_value=0)
fig = go.Figure([go.Bar(x=alignment, y=alignment_counts.tolist(), marker_color=['blue','orange','green'])])
fig.update_layout(barmode='group', title_text='Distribution of Alignments among Superheroes')
fig.show()

In [None]:
#Which alignment has most superpowers on average?

#superpowers per superhero: row-wise sum over every column from
#has_electrokinesis through has_super_strength (a label slice includes both ends)
superpower_flags = df_clean.loc[:, 'has_electrokinesis':'has_super_strength']
df_clean['sum_superpowers'] = superpower_flags.sum(axis=1)

#one subset dataframe per alignment level
df_good = df_clean[df_clean['alignment'] == 'Good']
df_bad = df_clean[df_clean['alignment'] == 'Bad']
df_neutral = df_clean[df_clean['alignment'] == 'Neutral']

#Plotly code
alignment = ['Good','Bad','Neutral']
#mean number of superpowers per alignment, rounded to a whole number
mean_good, mean_bad, mean_neutral = (
    round(subset['sum_superpowers'].mean()) for subset in (df_good, df_bad, df_neutral)
)
mean_bars = go.Bar(
    x=alignment,
    y=[mean_good, mean_bad, mean_neutral],
    marker_color=['blue','orange','green'],
)
fig = go.Figure([mean_bars])
fig.update_layout(barmode='group', title_text='Average Number of Superpowers per Alignment')
fig.show()

In [None]:
# Which creator studios have created how many good, bad and neutral characters?

#Plotly Code
creators=['ABC Studios', 'Capcom', 'Cartoon Network','Clive Barker','DC Comics','Dark Horse Comics','Disney','Dreamworks','George Lucas',
'George R. R. Martin','Hanna-Barbera','HarperCollins','Hasbro','IDW Publishing','Ian Fleming','Icon Comics','Image Comics','J. K. Rowling',
'J. R. R. Tolkien','Konami','Lego','Marvel Comics','Matt Groening','Mattel','Microsoft','Mortal Kombat','NBC - Heroes','Nintendo','Sega',
'Shueisha','Sony Pictures','South Park','Star Trek','Stephen King','SyFy','Team Epic TV','Ubisoft','Universal Studios','Wildstorm']

#hard-coded character counts per studio; every list is in the same order as `creators`
counts_by_alignment = [
    ('Good', [4,2,1,1,231,18,2,1,11,3,1,6,1,1,1,3,6,5,6,0,12,340,0,0,1,1,13,3,1,24,3,1,4,0,6,5,9,1,6]),
    ('Bad', [0,1,0,1,155,9,0,0,6,2,0,0,2,1,0,1,10,3,1,1,5,189,0,1,0,1,3,1,0,6,0,0,0,1,0,1,0,0,1]),
    ('Neutral', [0,1,1,0,43,2,1,0,0,2,0,0,0,0,0,0,1,0,0,0,4,39,1,0,0,1,0,0,0,7,0,0,0,0,0,0,0,0,0]),
]

#one bar trace per alignment, drawn side by side for every studio
fig = go.Figure(data=[
    go.Bar(name=level, x=creators, y=counts) for level, counts in counts_by_alignment
])
fig.update_layout(barmode='group', title_text='Alignments of Superheroes created by Studios')
fig.show()

In [None]:
#subsets of female and male superheroes
df_female = df_clean[df_clean["gender"] == 'Female']
df_male = df_clean[df_clean["gender"] == 'Male']
#column means per gender (used by the score comparison further down)
df_mean_m = df_male.mean()
df_mean_f = df_female.mean()

#number of characters per alignment within each gender
align_male = df_male['alignment'].value_counts()
align_female = df_female['alignment'].value_counts()

scores=['Intelligence','Strength','Speed','Durability','Power', 'Combat']

#Is super hero a male job? (percentage bar)
n_total = len(df_clean)
n_male = len(df_male)
n_female = len(df_female)
#everything that is neither 'Male' nor 'Female'
n_other = n_total - n_male - n_female


def share_label(count, total):
    """Format `count` as a percentage of `total` with one decimal, e.g. '66.9%'.

    Returns '0.0%' when `total` is 0, so an empty dataframe does not raise.
    """
    if not total:
        return '0.0%'
    return '{:.1f}%'.format(100.0 * count / total)


#one stacked segment per gender; the label is computed from the same counts as the bar length,
#so the two cannot drift apart when the data changes
fig3 = go.Figure()
for trace_name, count, color, position in [
    ('Men', n_male, 'MediumSeaGreen', 'auto'),
    ('Women', n_female, 'GreenYellow', 'inside'),
    ('Gender-neutral', n_other, 'LightGray', 'inside'),
]:
    fig3.add_trace(go.Bar(
        x=[count],
        text=[share_label(count, n_total)], textposition=position, textfont_size=20,
        name=trace_name,
        orientation='h',
        marker=dict(
            color=color,
        )
    ))

#the single bar needs no y axis decoration
fig3.update_layout(barmode='stack', title_text='Distribution of Genders among Superheroes', yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,))

fig3.show()

In [None]:
#Are characteristics assigned equally between genders?

#pick the mean scores by column name, in the order of the axis labels in `scores`
#(Intelligence, Strength, Speed, Durability, Power, Combat). A positional slice would
#silently mislabel the bars as soon as the column order of the data changes.
mean_score_columns = ['intelligence_score', 'strength_score', 'speed_score',
                      'durability_score', 'power_score', 'combat_score']

fig1 = go.Figure(data=[
    go.Bar(name='Men', x=scores, y=df_mean_m[mean_score_columns], marker_color='MediumSeaGreen'),
    go.Bar(name='Women', x=scores, y=df_mean_f[mean_score_columns], marker_color='GreenYellow'),
])

fig1.update_layout(barmode='group', title_text='Mean Scores of Male and Female Superheroes')
fig1.show()

In [None]:
#What is the alignment in two genders? Are women supposed to be 'good'?

#value_counts() orders its result by frequency, not by name. Each series is therefore reindexed
#to the axis labels; otherwise a count can end up under the wrong label.
#An alignment that does not occur for a gender is shown as 0.
alignment_levels = ['Good', 'Bad', 'Neutral']
male_counts = align_male.reindex(alignment_levels, fill_value=0)
female_counts = align_female.reindex(alignment_levels, fill_value=0)

#men in the left panel, women in the right panel
fig2 = make_subplots(rows=1, cols=2)

fig2.add_trace(
    go.Bar(name='Men', x=alignment_levels, y=male_counts, marker_color='MediumSeaGreen'),
    row=1, col=1
)

fig2.add_trace(
    go.Bar(name='Women', x=alignment_levels, y=female_counts, marker_color='GreenYellow'),
    row=1, col=2
)

fig2.update_layout(height=600, width=800, title_text="Superheroes by Alignment")
fig2.show()

In [None]:
#hierarchical clustering
#define dataframe for hierarchical clustering
columns_hc = ['name','overall_score','intelligence_score', 'strength_score', 'speed_score', 'durability_score', 'power_score', 'combat_score',
          'alignment', 'gender', 'has_electrokinesis', 'has_energy_constructs', 'has_mind_control_resistance','has_matter_manipulation', 'has_telepathy_resistance', 'has_mind_control', 'has_enhanced_hearing', 'has_dimensional_travel', 'has_element_control', 'has_size_changing', 'has_fire_resistance',
         'has_fire_control', 'has_dexterity', 'has_reality_warping', 'has_illusions', 'has_energy_beams', 'has_peak_human_condition', 'has_shapeshifting', 'has_heat_resistance', 'has_jump', 'has_self-sustenance', 'has_energy_absorption', 'has_cold_resistance', 'has_magic', 'has_telekinesis', 'has_toxin_and_disease_resistance', 'has_telepathy', 
          'has_regeneration', 'has_immortality', 'has_teleportation', 'has_force_fields', 'has_energy_manipulation', 'has_endurance', 'has_longevity', 'has_weapon-based_powers', 'has_energy_blasts', 'has_enhanced_senses', 'has_invulnerability', 'has_stealth', 'has_marksmanship', 'has_flight', 'has_accelerated_healing', 'has_weapons_master', 
          'has_intelligence','has_reflexes', 'has_super_speed', 'has_durability', 'has_stamina', 'has_agility', 'has_super_strength', 'sum_scores', 'sum_superpowers']
df_hc = pd.DataFrame(df_clean, columns=columns_hc)

#replace gender levels by 1 (Male) and 0 (Female) and drop NAs.
#The column is assigned back as a whole: an in-place replace on df_hc['gender'] is not guaranteed to write
#through to df_hc. Every other gender value becomes NaN and is removed by the dropna below.
df_hc['gender'] = df_hc['gender'].map({'Male': 1, 'Female': 0})
df_hc = df_hc.dropna()

#get dummy variables that replace alignment levels
df_hc = pd.get_dummies(df_hc, columns = ["alignment"])

#subsetting clean dataset to top 20 good superheroines based on overall_score
df_wgood=df_hc[(df_hc['gender']==0)&(df_hc['alignment_Good']==1)]
df_wgood_top20 = (df_wgood.sort_values(by=['overall_score'], ascending = False)).head(20)

#normalizing, clustering based on intelligence_score and strength_score
ov_int_score = whiten(df_wgood_top20[['strength_score','intelligence_score']])
mergings = linkage(ov_int_score,method='ward')
names = np.array(df_wgood_top20['name'])

#plotting
#create_dendrogram computes its own linkage and ignores `mergings`, so the ward method has to be handed over
#as linkagefun. linkagefun receives the pairwise distances of the observations (Euclidean with the default
#distfun), which is the metric ward linkage is defined for.
fig4 = ff.create_dendrogram(ov_int_score, labels=names,
                            linkagefun=lambda distances: linkage(distances, method='ward'))
fig4.update_layout(width=800, height=500, title_text='Clustering Among 20 Best Superheroines based on Intelligence and Strength')
fig4.show()

In [None]:
#Reduction of dimensionality of numerical variables with PCA

#Define dataframe for PCA
columns3= ['overall_score','intelligence_score', 'strength_score', 'speed_score', 'durability_score', 'power_score', 'combat_score'
         , 'gender', 'sum_superpowers']
df_pca2 = pd.DataFrame(df_clean, columns=columns3) 

#replace gender levels by 1 (Male) and 0 (Female), because PCA cannot deal with non-numeric values.
#The column is assigned back as a whole: an in-place replace on df_pca2['gender'] is not guaranteed to write
#through to df_pca2. Every other gender value becomes NaN and is removed by the dropna below.
df_pca2['gender'] = df_pca2['gender'].map({'Male': 1, 'Female': 0})

#remove all NAs that are left (because PCA cannot deal with NAs)
df_pca2 = df_pca2.dropna()

#Scale dataframe with StandardScaler
scaler2 = StandardScaler()
features2 = df_pca2.to_numpy()
scaled_features = scaler2.fit_transform(features2)

#new features: fit_transform fits the PCA and projects the data in one step, a separate fit() is not needed.
#The result is transposed, so that every row of df_pca2_new holds one principal component.
pca = PCA()
df_pca2_new = np.transpose(pca.fit_transform(scaled_features))

#create new dataframe for the first three pca components (transposed back to one row per superhero)
columns4 = ['PC1', 'PC2', 'PC3']
df_pca2_kmean = pd.DataFrame(data=np.transpose(df_pca2_new[:3]), columns=columns4)

In [None]:
#using Elbow method to choose best number of cluster k
#only the three principal components are clustered, so the cell also works after a
#non-numeric 'clusters' column has been added to df_pca2_kmean
pca_features = df_pca2_kmean[['PC1', 'PC2', 'PC3']]
k_values = list(range(1, 11))
sse = [] # sse is distortions
for k in k_values:
    kmeans_pca = KMeans(init='random', n_clusters = k, n_init=12, max_iter=300, random_state=42)
    kmeans_pca.fit(pca_features)
    sse.append(kmeans_pca.inertia_)

#draw the elbow curve, so that the choice of k can be read off the plot
fig_elbow_pca = go.Figure(go.Scatter(x=k_values, y=sse, mode='lines+markers'))
fig_elbow_pca.update_layout(title_text='Elbow Curve for K-Means on the PCA Components',
                            xaxis_title_text='Number of clusters k',
                            yaxis_title_text='Sum of squared distances (inertia)')
fig_elbow_pca.show()

In [None]:
#run for k = 4. 
#fit on the three principal components only: the plotting cell adds a non-numeric 'clusters' column
#to df_pca2_kmean, and fitting on the whole dataframe would fail when this cell is run again afterwards
kmeans_pca = KMeans(init='random', n_clusters = 4, n_init=12, max_iter=300, random_state=42)
kmeans_pca.fit(df_pca2_kmean[['PC1', 'PC2', 'PC3']])

In [None]:
#plotly 3D representation with labels
#name every point after its cluster ('cluster 0', 'cluster 1', ...). The names are built from the labels
#themselves, so this works for any number of clusters and needs no in-place replace per cluster id.
df_pca2_kmean['clusters'] = ['cluster {}'.format(label) for label in kmeans_pca.labels_]
#df_3D_pca2 is the same object as df_pca2_kmean, not a copy
df_3D_pca2 = df_pca2_kmean
fig = px.scatter_3d(df_3D_pca2, x='PC1', y='PC2', z='PC3',
              color='clusters')
fig.show()

In [None]:
#Reduction of dimensionality of numerical variables with T-SNE

#Define dataframe for T-SNE
columns= ['overall_score','intelligence_score', 'strength_score', 'speed_score', 'durability_score', 'power_score', 'combat_score'
         , 'gender', 'sum_superpowers']
df_tsne = pd.DataFrame(df_clean, columns=columns)

#replace gender levels by 1 (Male) and 0 (Female).
#The column is assigned back as a whole: an in-place replace on df_tsne['gender'] is not guaranteed to write
#through to df_tsne. Every other gender value becomes NaN and is removed by the dropna below.
df_tsne['gender'] = df_tsne['gender'].map({'Male': 1, 'Female': 0})

#remove all NAs that are left (because T-SNE need a dataframe that does not contain NAs)
df_tsne = df_tsne.dropna()

#Scale dataframe with StandardScaler
scaler = StandardScaler()
features = df_tsne.to_numpy()
scaled_df_tsne = scaler.fit_transform(features)

#embed the scaled data into three dimensions; random_state keeps the embedding reproducible
model = TSNE(n_components=3, random_state=42)
tsne_data = model.fit_transform(scaled_df_tsne)

#turn the embedded data back into a dataframe
columns2 = ['Dim1', 'Dim2', 'Dim3']
tsne_data_kmean = pd.DataFrame(tsne_data, columns=columns2)

In [None]:
#using Elbow method to choose best number of cluster k
#only the three embedding dimensions are clustered, so the cell also works after a
#non-numeric 'clusters' column has been added to tsne_data_kmean
tsne_features = tsne_data_kmean[['Dim1', 'Dim2', 'Dim3']]
k_values = list(range(1, 11))
sse = [] # sse is distortions
for k in k_values:
    kmeans_tsne = KMeans(init='random', n_clusters = k, n_init=12, max_iter=300, random_state=42)
    kmeans_tsne.fit(tsne_features)
    sse.append(kmeans_tsne.inertia_)

#draw the elbow curve, so that the choice of k can be read off the plot
fig_elbow_tsne = go.Figure(go.Scatter(x=k_values, y=sse, mode='lines+markers'))
fig_elbow_tsne.update_layout(title_text='Elbow Curve for K-Means on the T-SNE Dimensions',
                             xaxis_title_text='Number of clusters k',
                             yaxis_title_text='Sum of squared distances (inertia)')
fig_elbow_tsne.show()

In [None]:
#run for k = 4. 
#fit on the three embedding dimensions only: the plotting cell adds a non-numeric 'clusters' column
#to tsne_data_kmean, and fitting on the whole dataframe would fail when this cell is run again afterwards
kmeans_tsne= KMeans(init='random', n_clusters = 4, n_init=12, max_iter=300, random_state=42)
kmeans_tsne.fit(tsne_data_kmean[['Dim1', 'Dim2', 'Dim3']])

In [None]:
#plotly 3D representation with labels
#name every point after its cluster ('cluster 0', 'cluster 1', ...). The names are built from the labels
#themselves, so this works for any number of clusters and needs no in-place replace per cluster id.
tsne_data_kmean['clusters'] = ['cluster {}'.format(label) for label in kmeans_tsne.labels_]
#df_3D_tsne is the same object as tsne_data_kmean, not a copy
df_3D_tsne = tsne_data_kmean
fig = px.scatter_3d(df_3D_tsne, x='Dim1', y='Dim2', z='Dim3',
              color='clusters')
fig.show()