In [None]:
#For Cleaning and Exploratory Data Analysis
import pandas as pd
import numpy as np
import re
import statsmodels.api as sm
from scipy import stats

#For Plotting
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import plot, show, draw, figure
from mpl_toolkits.mplot3d import Axes3D

#For Plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

#For Dimensionality Reduction and K-Means
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy.cluster.vq import kmeans, whiten, vq
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

#For Cluster Analysis
import math

#For Decision Tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import export_graphviz
# sklearn.externals.six is no longer shipped with recent scikit-learn releases.
# On Python 3 six.StringIO is simply io.StringIO, so fall back to the standard
# library when the bundled copy is unavailable.
try:
    from sklearn.externals.six import StringIO
except ImportError:
    from io import StringIO
from IPython.display import Image  
from IPython.display import display
import pydotplus

#For Hierarchical Clustering
import plotly.figure_factory as ff
from scipy.cluster.hierarchy import linkage, dendrogram

#Settings
import warnings
# NOTE(review): this silences every warning for the whole session, including
# pandas/scikit-learn deprecation and chained-assignment warnings -- consider
# narrowing the filter to specific categories.
warnings.filterwarnings("ignore")
# Display up to 100 columns and 100 rows when a DataFrame is rendered.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [None]:
# Load the raw dataset; the path is relative to the current working directory.
df = pd.read_csv('superheroes.csv')

# Cleaning

In [None]:
#drop the columns (addressed by position) that are not needed for the analysis
unused_positions = [1, 2, 4, 5, 12, 13, 14, 15, 16, 19, 20, 21, 22, 27, 28, 29, 30]
df_dropped = df.drop(columns=df.columns[unused_positions])

#overall_score is read as an object column:
# - dashes are replaced by the column mean (pre-computed in Excel; it has to be a string to fit the object column)
# - the column is then cast to numeric; entries that cannot be parsed (the infinity scores) become NaN
# - those NaNs are finally replaced by 1000
overall_raw = df_dropped['overall_score'].replace(['-'], '15')
df_dropped['overall_score'] = pd.to_numeric(overall_raw, errors='coerce').fillna(value=1000)

#height and weight: keep only the metric figure (cm / kg) and cast it to a number
metric_patterns = {'weight': r'(\d+)\s*kg', 'height': r'(\d+)\s*cm'}
for column, pattern in metric_patterns.items():
    #dashes mark missing values
    without_dashes = df_dropped[column].replace(['-'], np.nan)
    #findall returns a list per entry; the first element is the metric figure (missing if nothing matched)
    matches = without_dashes.apply(lambda entry: re.findall(pattern, str(entry)))
    df_dropped[column] = pd.to_numeric(matches.str.get(0), errors='coerce')

#rename height and weight columns into height_cm and weight_kg
df_clean = df_dropped.rename(columns={'height': 'height_cm', 'weight': 'weight_kg'})

# Exploratory Data Analysis
## Alignment and Scores

In [None]:
#Can overall_score be explained by one of the other score variables, or by a sum of them?

#the six individual score variables (everything except overall_score)
component_scores = ['intelligence_score', 'strength_score', 'speed_score', 'durability_score', 'power_score',
                    'combat_score']

#add the component scores up column by column and store the total in a new column
running_total = df_clean[component_scores[0]]
for score_column in component_scores[1:]:
    running_total = running_total + df_clean[score_column]
df_clean['sum_scores'] = running_total

#subset dataframe with the score variables only (components, overall_score, sum_scores)
df_scores = df_clean[component_scores + ['overall_score', 'sum_scores']]

#Pearson correlation matrix of the score variables (plotted as a heatmap in the next cell)
corr_matrix = df_scores.corr()
#sns.heatmap(corr_matrix, annot=True)
#plt.title("Pearson's Correlations of Score Variables")
#plt.show()

In [None]:
#Plotly Code
#readable axis labels; they have to follow the column order of corr_matrix
score_labels = ['Intelligence','Strength', 'Speed', 'Durability','Power',
         'Combat', 'Overall score', 'Sum of scores']
heatmap = go.Heatmap(z=corr_matrix, x=score_labels, y=score_labels)
fig = go.Figure(data=heatmap)
fig.update_layout(title_text="Pearson's Correlations of Score Variables of Superheroes")
fig.show()

In [None]:
#Show distribution of alignments among superheroes

#df_clean['alignment'].value_counts().plot(kind='bar', rot=0, color=['blue', 'red', 'orange'])
#plt.xlabel('Alignment', labelpad=14)
#plt.ylabel('Number of Superheros', labelpad=14)
#plt.title('Distribution of Alignments among Superheros', y=1.02)
#plt.show()

In [None]:
#Plotly code
#Count the superheroes per alignment from the data (instead of typing the numbers in by hand),
#so that the chart stays correct when the dataset or the cleaning changes.
alignment = ['Good','Bad','Neutral']
alignment_counts = df_clean['alignment'].value_counts().reindex(alignment, fill_value=0)
fig = go.Figure([go.Bar(x=alignment, y=alignment_counts.tolist(), marker_color=['blue','orange','green'])])
fig.update_layout(barmode='group', title_text='Distribution of Alignments among Superheroes')
fig.show()

In [None]:
#Which superhero has the most superpowers?

#names of the superpower indicator columns (everything from has_electrokinesis to has_super_strength)
power_columns = df_clean.loc[:, 'has_electrokinesis':'has_super_strength'].columns

#create column that sums up superpowers per superhero
df_clean['sum_superpowers'] = df_clean[power_columns].sum(axis=1)

#Find the row of the (first) superhero with the most superpowers.
#The row is addressed by its index label rather than by name, because several rows may share the same name.
max_superpowers = df_clean['sum_superpowers'].max()
idx_max_superpowers = df_clean['sum_superpowers'].idxmax()
name_max_superpowers = df_clean.loc[idx_max_superpowers, 'name']
alignment_max_superpowers = df_clean.loc[idx_max_superpowers, 'alignment']

#Create a subset dataframe, containing only the row of the superhero with the most superpowers
df_name_max_superpowers = df_clean.loc[[idx_max_superpowers]]

#Extract the names of the superpowers this hero has.
#Only the indicator columns are inspected, so that other columns that happen to equal 1 are not listed as superpowers.
has_power = (df_name_max_superpowers[power_columns] == 1.0).iloc[0].to_numpy()
superpowers = power_columns[has_power]

#convert index format into a list
list_superpowers = superpowers.tolist()

#convert list into one string (otherwise, the superpowers cannot be stored in one single cell of a dataframe later on)
string_superpowers = ', '.join(map(str, list_superpowers))

#create dataframe, which contains the variables from above
d = {'Name': name_max_superpowers, 'Alignment': alignment_max_superpowers, 'Number of Superpowers': max_superpowers, 'List of Superpowers': string_superpowers}
df_superhero_with_most_superpowers = pd.DataFrame(data=d, index=['1'])

#expand the displayed width of columns, so that the long string of superpowers is displayed
pd.set_option('display.max_colwidth', None)
df_superhero_with_most_superpowers

In [None]:
#Which alignment has most superpowers on average?

#Split df_clean into one subset per alignment level
df_good = df_clean[df_clean['alignment'] == 'Good']
df_bad = df_clean[df_clean['alignment'] == 'Bad']
df_neutral = df_clean[df_clean['alignment'] == 'Neutral']

#Average number of superpowers per subset, rounded to a whole number
mean_good, mean_bad, mean_neutral = (
    round(subset['sum_superpowers'].mean()) for subset in (df_good, df_bad, df_neutral)
)

#Plot means
#sns.barplot(x = ['Good', 'Bad', 'Neutral'], y = [mean_good, mean_bad, mean_neutral])
#plt.xlabel('Alignment', labelpad=14)
#plt.ylabel('Average Number of Superpowers', labelpad=14)
#plt.title('Average Number of Superpowers per Alignment', y=1.02)
#plt.show()

In [None]:
#Plotly Code
alignment = ['Good','Bad','Neutral']
#rounded average number of superpowers per alignment subset (same order as the labels above)
mean_good, mean_bad, mean_neutral = (
    round(subset['sum_superpowers'].mean()) for subset in (df_good, df_bad, df_neutral)
)
mean_bars = go.Bar(x=alignment, y=[mean_good, mean_bad, mean_neutral], marker_color=['blue','orange','green'])
fig = go.Figure([mean_bars])
fig.update_layout(barmode='group', title_text='Average Number of Superpowers per Alignment')
fig.show()

In [None]:
#Kruskal-Wallis Test: Does the alignment of a superhero have a significant effect on the number of superpowers he/she develops?
#Kruskal-Wallis is used instead of ANOVA, because sum_superpowers is not normally distributed

#one dataframe per factor level of alignment
df_clean_good = df_clean[df_clean['alignment'] == 'Good']
df_clean_bad = df_clean[df_clean['alignment'] == 'Bad']
df_clean_neutral = df_clean[df_clean['alignment'] == 'Neutral']

#the three samples that are compared by the test
superpower_samples = [subset['sum_superpowers'] for subset in (df_clean_good, df_clean_bad, df_clean_neutral)]

#perform Kruskal-Wallis Test
stats.kruskal(*superpower_samples)
#pvalue=2.9290783036492544e-09
#type of alignment has a statistically significant influence on number of superpowers

In [None]:
# Which creator studios have created how many good, bad and neutral characters?

#matplotlib plot
#grouped1 = df_clean.groupby(['creator','alignment']).size().reset_index(name='count')
#grouped1.pivot( "creator","alignment", "count").plot(kind='bar', figsize=(20, 6))
#plt.title('Alignments of Superheroes created by Studios')
#plt.show()

In [None]:
#Plotly Code
#Count the characters per creator and alignment directly from df_clean instead of
#maintaining hand-typed lists, which silently go stale when the data changes.
#Rows = creators (sorted alphabetically), columns = alignment levels in plotting order.
alignment_by_creator = (
    pd.crosstab(df_clean['creator'], df_clean['alignment'])
    .reindex(columns=['Good', 'Bad', 'Neutral'], fill_value=0)
)
creators = alignment_by_creator.index.tolist()

#one bar trace per alignment level
fig = go.Figure(data=[
    go.Bar(name=level, x=creators, y=alignment_by_creator[level].tolist())
    for level in alignment_by_creator.columns
])
fig.update_layout(barmode='group', title_text='Alignments of Superheroes created by Studios')
fig.show()

In [None]:
# Poisson regression code

#remove outliers from overall_score and sum_superpowers
df_poisson = df_clean.query("overall_score < 401 & sum_superpowers < 21")

#dependent variable
y=df_poisson['sum_superpowers']

#independent variable
x= df_poisson['overall_score']

#add an intercept column to the regressor
exog, endog = sm.add_constant(x), y
#The log link is the default (canonical) link of the Poisson family, so it is not passed explicitly;
#this also avoids the lowercase link class, which newer statsmodels versions deprecate.
mod = sm.GLM(endog, exog,
             family=sm.families.Poisson())
res = mod.fit()
print(res.summary())
'''Interpretation of result: The model uses a log link, so the coefficient of overall_score (0.0107) acts on the 
logarithm of the expected number of superpowers. For each additional point in the overall_score of a superhero, the 
expected number of superpowers is multiplied by exp(0.0107), i.e. it increases by about 1.08 %.'''

In [None]:
#Show Slytherin superheroes

#subset of Harry Potter characters
#(explicit copy, so that the new column below is added to an independent dataframe and not to a slice of df_clean)
df_jkr = df_clean.query("creator == 'J. K. Rowling'").copy()

#assign Harry Potter house to each character
#NOTE(review): the houses are matched by row position; the list must have one entry per row of df_jkr, in row order
df_jkr['house']=['NaN', 'Gryffindor', 'Gryffindor', 'Slytherin', 'Slytherin', 'Gryffindor', 'Slytherin', 'Gryffindor']
#Search for Slytherins
df_jkr_slyth = df_jkr[(df_jkr['house'] == 'Slytherin')]
df_jkr_slyth

## Gender Studies

In [None]:
#subsets of female and male superheroes
df_female = df_clean[df_clean["gender"] == 'Female']
df_male = df_clean[df_clean["gender"] == 'Male']

#column means of the numeric columns only
#(recent pandas versions raise an error if mean() is applied to text columns such as name or alignment)
df_mean_m = df_male.mean(numeric_only=True)
df_mean_f = df_female.mean(numeric_only=True)

#number of superheroes per alignment level, per gender (sorted by frequency)
align_male = df_male['alignment'].value_counts()
align_female = df_female['alignment'].value_counts()

#display names of the six individual scores
scores=['Intelligence','Strength','Speed','Durability','Power', 'Combat']

In [None]:
#Is super hero a male job? (percentage bar)

#group sizes; everything that is neither male nor female is shown as gender-neutral
n_total = len(df_clean)
n_male = len(df_male)
n_female = len(df_female)
n_other = n_total - n_male - n_female

#(legend name, group size, position of the text label, bar colour)
gender_bars = [
    ('Men', n_male, 'auto', 'MediumSeaGreen'),
    ('Women', n_female, 'inside', 'GreenYellow'),
    ('Gender-neutral', n_other, 'inside', 'LightGray'),
]

fig3 = go.Figure()
for label, count, text_position, colour in gender_bars:
    fig3.add_trace(go.Bar(
        x=[count],
        #the percentage label is computed from the data, so it cannot drift away from the bar length
        text=['{:.1%}'.format(count / n_total)], textposition=text_position, textfont_size=20,
        name=label,
        orientation='h',
        marker=dict(
            color=colour,
        )
    ))
fig3.update_layout(barmode='stack', title_text='Distribution of Genders among Superheroes', yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,))

fig3.show()

In [None]:
#Are characteristics assigned equally between genders?

#Select the six scores by name (in the same order as the labels in `scores`).
#A positional slice of the mean vector depends on the column order of df_clean and on
#which columns pandas includes in mean(), so it can silently pick the wrong values.
score_columns = ['intelligence_score', 'strength_score', 'speed_score', 'durability_score', 'power_score',
                 'combat_score']

fig1 = go.Figure(data=[
    go.Bar(name='Men', x=scores, y=df_mean_m[score_columns], marker_color='MediumSeaGreen'),
    go.Bar(name='Women', x=scores, y=df_mean_f[score_columns], marker_color='GreenYellow'), 
])

fig1.update_layout(barmode='group', title_text='Mean Scores of Male and Female Superheroes')
fig1.show()

In [None]:
#What is the alignment in two genders? Are women supposed to be 'good'?

#value_counts() orders its result by frequency, which is not guaranteed to be Good > Bad > Neutral
#for both genders. Re-order the counts explicitly so that every bar sits under its own label.
alignment_levels = ['Good', 'Bad', 'Neutral']

fig2 = make_subplots(rows=1, cols=2)

fig2.add_trace(
    go.Bar(name='Men', x=alignment_levels, y=align_male.reindex(alignment_levels, fill_value=0),
           marker_color='MediumSeaGreen'),
    row=1, col=1
)

fig2.add_trace(
    go.Bar(name='Women', x=alignment_levels, y=align_female.reindex(alignment_levels, fill_value=0),
           marker_color='GreenYellow'),
    row=1, col=2
)

fig2.update_layout(height=600, width=800, title_text="Superheroes by Alignment")
fig2.show()

# Machine Learning
## K-Means
### Dimensionality Reduction with T-SNE

In [None]:
#Reduction of dimensionality of numerical variables with T-SNE

#Define dataframe for T-SNE
columns= ['overall_score','intelligence_score', 'strength_score', 'speed_score', 'durability_score', 'power_score', 'combat_score'
         , 'gender', 'sum_superpowers']
df_tsne = pd.DataFrame(df_clean, columns=columns)

#replace gender levels by 1 (Male) and 0 (Female); every other value becomes NaN.
#The result is assigned back to the column: an inplace replace() on df_tsne['gender'] modifies a
#temporary object under pandas' Copy-on-Write behaviour and would leave the dataframe unchanged.
df_tsne['gender'] = df_tsne['gender'].map({'Male': 1, 'Female': 0})

#remove all NAs that are left (because T-SNE need a dataframe that does not contain NAs)
df_tsne = df_tsne.dropna()

In [None]:
#Scale dataframe with StandardScaler
scaler = StandardScaler()
features = df_tsne.to_numpy()
scaled_df_tsne = scaler.fit_transform(features)
#Embed the scaled features into 3 dimensions; random_state is fixed for the embedding
model = TSNE(n_components=3, random_state=42)
tsne_data = model.fit_transform(scaled_df_tsne)

#wrap the 3-dimensional embedding in a dataframe
#(note: it gets a fresh 0..n-1 index, not the row labels of df_tsne)
columns2 = ['Dim1', 'Dim2', 'Dim3']
tsne_data_kmean = pd.DataFrame(tsne_data, columns=columns2)

### K-Means for T-SNE-reduced Dataframe

In [None]:
#using Elbow method to choose best number of cluster k
#fit k-means for k = 1..10 and record the inertia of each fit
sse = [] # sse is distortions
for k in range(1, 11):
    kmeans_tsne = KMeans(init='random', n_clusters = k, n_init=12, max_iter=300, random_state=42)
    kmeans_tsne.fit(tsne_data_kmean)
    sse.append(kmeans_tsne.inertia_)
#after the loop kmeans_tsne holds the k = 10 model; it is re-fitted with the chosen k in the next cell
#sns.lineplot(x=range(1, 11), y= sse)

In [None]:
#run for k = 4 (the number of clusters chosen for the t-SNE embedding)
kmeans_tsne= KMeans(init='random', n_clusters = 4, n_init=12, max_iter=300, random_state=42)
kmeans_tsne.fit(tsne_data_kmean)

In [None]:
#Show silhouette score of T-SNE for the chosen number of clusters
#(computed on the t-SNE embedding with Euclidean distances)
print('KMeans Scaled Silhouette Score: {}'.format(silhouette_score(tsne_data_kmean, kmeans_tsne.labels_, metric='euclidean')))

In [None]:
#scatterplot in 2D in matplotlib
#sns.scatterplot(x=tsne_data_kmean['Dim1'], y=tsne_data_kmean['Dim2'], hue = kmeans_tsne.labels_)
#plt.show()

#2d Plotly plot of the first two t-SNE dimensions
#the labels are cast to strings so that plotly colours them as discrete clusters
fig = px.scatter(x=tsne_data_kmean['Dim1'], y=tsne_data_kmean['Dim2'], color = kmeans_tsne.labels_.astype(str))
fig.update_layout(title_text="2D Representation of t-SNE-reduced Clustering", legend_title_text='Clusters')
fig.show()

In [None]:
#Visualize 3 dimensions of T-SNE after running k-means
#sns.set_style("whitegrid", {'axes.grid' : False})

#fig = plt.figure(figsize=(6,6))

#ax = Axes3D(fig)
#x = tsne_data_kmean['Dim1']
#y = tsne_data_kmean['Dim2']
#z = tsne_data_kmean['Dim3']

#ax.scatter(x, y, z, c=kmeans_tsne.labels_, marker='o')
#ax.set_xlabel('Dim1')
#ax.set_ylabel('Dim2')
#ax.set_zlabel('Dim3')

#plt.show()

In [None]:
#plotly 3D representation with labels

#Build the plotting dataframe as a new object with a readable cluster label per row.
#tsne_data_kmean itself is left untouched (purely numeric), so the k-means and silhouette cells
#above can be re-run at any time, and the labelling works for any number of clusters.
df_3D_tsne = tsne_data_kmean[['Dim1', 'Dim2', 'Dim3']].assign(
    clusters=['cluster {}'.format(label) for label in kmeans_tsne.labels_]
)
fig = px.scatter_3d(df_3D_tsne, x='Dim1', y='Dim2', z='Dim3',
              color='clusters')
fig.show()

### Dimensionality Reduction with PCA

In [None]:
#Reduction of dimensionality of numerical variables with PCA

#Define dataframe for PCA
columns3= ['overall_score','intelligence_score', 'strength_score', 'speed_score', 'durability_score', 'power_score', 'combat_score'
         , 'gender', 'sum_superpowers']
df_pca2 = pd.DataFrame(df_clean, columns=columns3) 

#replace gender levels by 1 (Male) and 0 (Female), because PCA cannot deal with non-numeric values;
#every other value becomes NaN.
#The result is assigned back to the column: an inplace replace() on df_pca2['gender'] modifies a
#temporary object under pandas' Copy-on-Write behaviour and would leave the dataframe unchanged.
df_pca2['gender'] = df_pca2['gender'].map({'Male': 1, 'Female': 0})

#remove all NAs that are left (because PCA cannot deal with NAs)
df_pca2 = df_pca2.dropna()

In [None]:
#Scale dataframe with StandardScaler
scaler2 = StandardScaler()
features2 = df_pca2.to_numpy()
scaled_features = scaler2.fit_transform(features2)

#fit the PCA and project the scaled data in a single step (rows = superheroes, columns = components)
pca = PCA()
pca_scores = pca.fit_transform(scaled_features)

# new features, one row per principal component (kept in this layout for code that expects it)
df_pca2_new = np.transpose(pca_scores)

#create new dataframe for the first three principal components
columns4 = ['PC1', 'PC2', 'PC3']
df_pca2_kmean = pd.DataFrame(data=pca_scores[:, :3], columns=columns4)

### K-Means for PCA-reduced Dataframe

In [None]:
#using Elbow method to choose best number of cluster k
#fit k-means for k = 1..10 and record the inertia of each fit
sse = [] # sse is distortions
for k in range(1, 11):
    kmeans_pca = KMeans(init='random', n_clusters = k, n_init=12, max_iter=300, random_state=42)
    kmeans_pca.fit(df_pca2_kmean)
    sse.append(kmeans_pca.inertia_)
#after the loop kmeans_pca holds the k = 10 model; it is re-fitted with the chosen k in the next cell
#sns.lineplot(x=range(1, 11), y= sse)

In [None]:
#run for k = 4 (the number of clusters chosen for the PCA-reduced data)
kmeans_pca = KMeans(init='random', n_clusters = 4, n_init=12, max_iter=300, random_state=42)
kmeans_pca.fit(df_pca2_kmean)

In [None]:
#Show silhouette score of PCA for the chosen number of clusters
#(computed on the three principal components with Euclidean distances)
print('KMeans Scaled Silhouette Score: {}'.format(silhouette_score(df_pca2_kmean, kmeans_pca.labels_, metric='euclidean')))

In [None]:
#scatterplot in 2D
#sns.scatterplot(x=df_pca2_kmean['PC1'], y=df_pca2_kmean['PC2'], hue = kmeans_pca.labels_)
#plt.show()

#2d Plotly plot of the first two principal components
#the labels are cast to strings so that plotly colours them as discrete clusters
fig = px.scatter(x=df_pca2_kmean['PC1'], y=df_pca2_kmean['PC2'], color = kmeans_pca.labels_.astype(str))
fig.update_layout(title_text="2D Representation of PCA-reduced Clustering", legend_title_text='Clusters')
fig.show()

In [None]:
#Visualize 3 dimensions of PCA after running k-means
#sns.set_style("whitegrid", {'axes.grid' : False})

#fig = plt.figure(figsize=(6,6))

#ax = Axes3D(fig)
#x = df_pca2_kmean['PC1']
#y = df_pca2_kmean['PC2']
#z = df_pca2_kmean['PC3']

#ax.scatter(x, y, z, c=kmeans_pca.labels_, marker='o')
#ax.set_xlabel('PC1')
#ax.set_ylabel('PC2')
#ax.set_zlabel('PC3')

#plt.show()

In [None]:
#plotly 3D representation with labels

#Build the plotting dataframe as a new object with a readable cluster label per row.
#df_pca2_kmean itself is left untouched (purely numeric), so the k-means and silhouette cells
#above can be re-run at any time, and the labelling works for any number of clusters.
df_3D_pca2 = df_pca2_kmean[['PC1', 'PC2', 'PC3']].assign(
    clusters=['cluster {}'.format(label) for label in kmeans_pca.labels_]
)
fig = px.scatter_3d(df_3D_pca2, x='PC1', y='PC2', z='PC3',
              color='clusters')
fig.show()

### K-Means without Dimensionality Reduction

In [None]:
#run k-means without feature reduction

#Define dataframe 
columns4= ['overall_score','intelligence_score', 'strength_score', 'speed_score', 'durability_score', 'power_score', 'combat_score'
         , 'gender', 'sum_superpowers']
df_no_reduction = pd.DataFrame(df_clean, columns=columns4) 

#Turn Gender into numerics: 1 (Male) and 0 (Female); every other value becomes NaN.
#The result is assigned back to the column: an inplace replace() on df_no_reduction['gender'] modifies a
#temporary object under pandas' Copy-on-Write behaviour and would leave the dataframe unchanged.
df_no_reduction['gender'] = df_no_reduction['gender'].map({'Male': 1, 'Female': 0})

#Drop NAs
df_no_reduction = df_no_reduction.dropna()

#Scale features of dataframe
scaler3 = StandardScaler()
features3 = df_no_reduction.to_numpy()
kmean_df_no_reduction = scaler3.fit_transform(features3)

#Determine best k for k-means: fit k = 1..10 and record the inertia of each fit
sse = [] # sse is distortions
for k in range(1, 11):
    kmeans_orig = KMeans(init='random', n_clusters = k, n_init=12, max_iter=300, random_state=42)
    kmeans_orig.fit(kmean_df_no_reduction)
    sse.append(kmeans_orig.inertia_)
#sns.lineplot(x=range(1, 11), y= sse)

#run for k = 4.
kmeans_orig = KMeans(init='random', n_clusters = 4, n_init=12, max_iter=300, random_state=42)
kmeans_orig.fit(kmean_df_no_reduction)
kmeans_orig.inertia_

In [None]:
#Show silhouette score of k-means without feature reduction for the chosen number of clusters
#(computed on the scaled, unreduced features with Euclidean distances)
print('KMeans Scaled Silhouette Score: {}'.format(silhouette_score(kmean_df_no_reduction, kmeans_orig.labels_, metric='euclidean')))

### Summary of K-Means 
In order to find the best clustering result, the K-Means algorithm has been applied to three different dataframes of the original cleaned dataset:
1. T-SNE reduced Dataframe
2. PCA-reduced Dataframe
3. Unreduced Dataframe<br>
All three dataframes have in common that they contain only numerical data and no NAs; otherwise, the dimensionality reductions would not work. Different selections of columns were tested, and the columns that are now loaded into the dataframes produced the best visual clustering. For example, 'sum_scores' and 'alignment' were left out: 'sum_scores' because it decreased the silhouette score, and 'alignment' because it dominated the clustering so strongly that the clusters were formed almost exclusively along the alignment levels.<br>Overall, the T-SNE-reduced dataframe produced the best clustering result (visually and according to the silhouette score) and is therefore analyzed in more detail in the following section.

### Cluster Analysis of K-Means results for T-SNE-reduced Dataframe

In [None]:
#Cluster analysis of T-SNE result

#Helper that cuts a number off after a given number of decimal places (no rounding)
def truncate(number, digits) -> float:
    """Return ``number`` truncated toward zero to ``digits`` decimal places.

    The value is shifted left by ``digits`` decimal places, its fractional
    part is discarded with ``math.trunc``, and it is shifted back again.
    """
    scale = pow(10.0, digits)
    shifted = math.trunc(scale * number)
    return shifted / scale

#Determine number of superheroes in T-SNE dataframe
#(used as the denominator for the relative size of each cluster)
len_tsne = df_tsne.shape[0]

In [None]:
#The gender and alignment labels are looked up in df_clean by the row labels of the cluster members.
#(Filtering the cluster dataframe with a boolean mask built on the complete df_clean only works through
#an implicit re-indexing that pandas warns about.)

#ANALYSIS OF CLUSTER 0
#Create dataframe that only contains superheroes that have been assigned to cluster 0 by k-means algorithm
labels_0 = np.where(kmeans_tsne.labels_ == 0)
df_cluster_0 = df_tsne.iloc[labels_0]

#compute ratio of number of superheroes in cluster 0 with respect to total number of superheroes in all clusters
len_0 = df_cluster_0.shape[0]
ratio_0 = truncate(len_0 / len_tsne * 100, 1)

#original (text) gender and alignment of the members of cluster 0
gender_0 = df_clean.loc[df_cluster_0.index, 'gender']
alignment_0 = df_clean.loc[df_cluster_0.index, 'alignment']

#compute ratios of gender
f_0 = truncate((gender_0 == 'Female').sum() / len_0 * 100, 1)
m_0 = truncate((gender_0 == 'Male').sum() / len_0 * 100, 1)

#compute ratios of alignments (u_0 = alignment missing)
g_0 = truncate((alignment_0 == 'Good').sum() / len_0 * 100, 1)
b_0 = truncate((alignment_0 == 'Bad').sum() / len_0 * 100, 1)
n_0 = truncate((alignment_0 == 'Neutral').sum() / len_0 * 100, 1)
u_0 = truncate(alignment_0.isnull().sum() / len_0 * 100, 1)

#get basic statistics of cluster 0
#print(df_cluster_0.mean(axis=0).round(1), df_cluster_0.max(axis=0), df_cluster_0.min(axis=0))


#ANALYSIS OF CLUSTER 1
#Create dataframe that only contains superheroes that have been assigned to cluster 1 by k-means algorithm
labels_1 = np.where(kmeans_tsne.labels_ == 1)
df_cluster_1 = df_tsne.iloc[labels_1]

#compute ratio of number of superheroes of cluster 1 with respect to total number of superheroes in all clusters
len_1 = df_cluster_1.shape[0]
ratio_1 = truncate(len_1 / len_tsne * 100, 1)

#original (text) gender and alignment of the members of cluster 1
gender_1 = df_clean.loc[df_cluster_1.index, 'gender']
alignment_1 = df_clean.loc[df_cluster_1.index, 'alignment']

#compute ratios of gender
f_1 = truncate((gender_1 == 'Female').sum() / len_1 * 100, 1)
m_1 = truncate((gender_1 == 'Male').sum() / len_1 * 100, 1)

#compute ratios of alignment (u_1 = alignment missing)
g_1 = truncate((alignment_1 == 'Good').sum() / len_1 * 100, 1)
b_1 = truncate((alignment_1 == 'Bad').sum() / len_1 * 100, 1)
n_1 = truncate((alignment_1 == 'Neutral').sum() / len_1 * 100, 1)
u_1 = truncate(alignment_1.isnull().sum() / len_1 * 100, 1)

#check for basic statistics of cluster 1
#print(df_cluster_1.mean(axis=0).round(1), df_cluster_1.max(axis=0), df_cluster_1.min(axis=0))

In [None]:
#The gender and alignment labels are looked up in df_clean by the row labels of the cluster members.
#(Filtering the cluster dataframe with a boolean mask built on the complete df_clean only works through
#an implicit re-indexing that pandas warns about.)

#ANALYSIS OF CLUSTER 2
#Create dataframe that only contains superheroes that have been assigned to cluster 2 by k-means algorithm
labels_2 = np.where(kmeans_tsne.labels_ == 2)
df_cluster_2 = df_tsne.iloc[labels_2]

#compute ratio of number of superheroes in cluster 2 with respect to total number of superheroes in all clusters
len_2 = df_cluster_2.shape[0]
ratio_2 = truncate(len_2 / len_tsne * 100, 1)

#original (text) gender and alignment of the members of cluster 2
gender_2 = df_clean.loc[df_cluster_2.index, 'gender']
alignment_2 = df_clean.loc[df_cluster_2.index, 'alignment']

#compute ratios of gender
f_2 = truncate((gender_2 == 'Female').sum() / len_2 * 100, 1)
m_2 = truncate((gender_2 == 'Male').sum() / len_2 * 100, 1)

#compute ratios of alignments (u_2 = alignment missing)
g_2 = truncate((alignment_2 == 'Good').sum() / len_2 * 100, 1)
b_2 = truncate((alignment_2 == 'Bad').sum() / len_2 * 100, 1)
n_2 = truncate((alignment_2 == 'Neutral').sum() / len_2 * 100, 1)
u_2 = truncate(alignment_2.isnull().sum() / len_2 * 100, 1)

#get basic statistics of cluster 2
#print(df_cluster_2.mean(axis=0).round(1), df_cluster_2.max(axis=0), df_cluster_2.min(axis=0))


#ANALYSIS OF CLUSTER 3
#Create dataframe that only contains superheroes that have been assigned to cluster 3 by k-means algorithm
labels_3 = np.where(kmeans_tsne.labels_ == 3)
df_cluster_3 = df_tsne.iloc[labels_3]

#compute ratio of number of superheroes in cluster 3 with respect to total number of superheroes in all clusters
len_3 = df_cluster_3.shape[0]
ratio_3 = truncate(len_3 / len_tsne * 100, 1)

#original (text) gender and alignment of the members of cluster 3
gender_3 = df_clean.loc[df_cluster_3.index, 'gender']
alignment_3 = df_clean.loc[df_cluster_3.index, 'alignment']

#compute ratios of gender
f_3 = truncate((gender_3 == 'Female').sum() / len_3 * 100, 1)
m_3 = truncate((gender_3 == 'Male').sum() / len_3 * 100, 1)

#compute ratios of alignments (u_3 = alignment missing)
g_3 = truncate((alignment_3 == 'Good').sum() / len_3 * 100, 1)
b_3 = truncate((alignment_3 == 'Bad').sum() / len_3 * 100, 1)
n_3 = truncate((alignment_3 == 'Neutral').sum() / len_3 * 100, 1)
u_3 = truncate(alignment_3.isnull().sum() / len_3 * 100, 1)

#get basic statistics of cluster 3
#print(df_cluster_3.mean(axis=0).round(1), df_cluster_3.max(axis=0), df_cluster_3.min(axis=0))

In [None]:
#Plot table of Cluster Analysis

#empty axes that only serve as a canvas for the table
fig, ax = plt.subplots()
ax.get_xaxis().set_visible(False)
ax.get_yaxis().set_visible(False)
plt.box(on=None)

columns_table = ('Cluster 0', 'Cluster 1', 'Cluster 2', 'Cluster 3')
rows_table = ('Number of Superheroes', 'Percentage of Total', 'Male (%)', 'Female (%)', 'Good (%)', 'Bad (%)', 'Neutral (%)',
             'Undefined (%)', 'Comments\n')
#one inner list per table row, one entry per cluster
data = [[len_0, len_1, len_2, len_3],
       [ratio_0, ratio_1, ratio_2, ratio_3],
       [m_0, m_1, m_2, m_3],
       [f_0, f_1, f_2, f_3],
       [g_0, g_1, g_2, g_3],
       [b_0, b_1, b_2, b_3],
       [n_0, n_1, n_2, n_3],
       [u_0, u_1, u_2, u_3],
       ['Scores highly above average\nContains all superheroes with infinity values', 'Scores are below average', 'Scores are heavily below average\noverall_scores always = 15', 'Scores are below average']]

#one header colour per row label and per column label (sized from the labels instead of a fixed number)
cluster_table = plt.table(cellText=data, rowColours=["palegreen"] * len(rows_table),
                      colColours=["palegreen"] * len(columns_table), rowLabels=rows_table,
                      colLabels=columns_table,
                      loc='center')
cluster_table.auto_set_font_size(False)
cluster_table.set_fontsize(15)
cluster_table.scale(5, 5)
plt.show()

### Summary of Cluster Analysis
Each of the four clusters that resulted from k-Means applied to the t-SNE-reduced dataframe was investigated with simple calculations regarding its absolute and relative size, its gender distribution and its alignment distribution.<br>Cluster 0 is almost completely male, and contains, among other strong superheroes, all superheroes with infinity values as overall_scores (16 male, 2 female). Therefore, the mean scores of cluster 0 are highly above average.<br>Cluster 1 is completely female, and the mean scores lie below the average of the complete dataset.<br> Cluster 2 is mostly male and contains a huge number of superheroes, which all have 15 as overall_score. Therefore, the mean of all scores in this cluster is below average.<br> Cluster 3 is completely male and contains the weak male superheroes.<br>We can conclude that the cluster algorithm has produced the clusters primarily based on gender, and secondarily based on the mean of all scores, among which the overall_score seems to have dominated.

## Decision Tree

In [None]:
#Predict the gender of a superhero from its scores and its number of superpowers

#keep the target and the features, and only the rows whose gender is Male or Female
sh_rf = df_clean[['gender','overall_score','sum_superpowers','intelligence_score','strength_score','speed_score','durability_score','power_score', 'combat_score']]
sh_rf = sh_rf[(sh_rf["gender"] == 'Male') | (sh_rf["gender"] == 'Female')]
#sh_rf = pd.get_dummies(sh_rf, columns=['gender'])

y = sh_rf['gender'].values
X = sh_rf[['overall_score', 'sum_superpowers','intelligence_score','strength_score','speed_score','durability_score','power_score', 'combat_score']].values


#70/30 split; the seed is fixed so that the split (and therefore the reported accuracy) is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#the scaler is fitted on the training data only and then applied to the test data
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


#unrestricted tree
clf=DecisionTreeClassifier(random_state=0)
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
tree.plot_tree(clf, fontsize=10)


In [None]:
#readable feature names for the tree visualisation
#NOTE(review): the order has to match the column order of X in the decision tree cell -- confirm when features change
feature_cols = ['overall score', 'sum of superpowers','intelligence','strength','speed','durability','power', 'combat']

In [None]:
#vizualising the whole tree (not recommended, wherefore all lines are commented out)
#dot_data = StringIO()
#export_graphviz(clf, out_file=dot_data,  
                #filled=True, rounded=True,
                #special_characters=True,feature_names = feature_cols,
                #class_names=['male','female'])
#graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
#Image(graph.create_png())

In [None]:
# pruning the tree: train a second tree that is limited to a depth of 3 and uses entropy as split criterion
#random_state is fixed (as for clf) so that the tree and its accuracy are reproducible
clf1 = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=0)
clf1 = clf1.fit(X_train,y_train)
y_pred = clf1.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
#visualizing the pruned tree
dot_data = StringIO()
#export_graphviz expects class_names in the order of clf1.classes_ (sorted ascending, i.e. 'Female' before 'Male').
#The names are therefore derived from the fitted classifier; a hand-written ['male','female'] swaps the labels.
tree_class_names = [str(label).lower() for label in clf1.classes_]
export_graphviz(clf1, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True, feature_names = feature_cols,class_names=tree_class_names)
#render the DOT description as a PNG image
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

## Hierarchical Clustering

In [None]:
#define dataframe for hierarchical clustering
columns_hc = ['name','overall_score','intelligence_score', 'strength_score', 'speed_score', 'durability_score', 'power_score', 'combat_score',
          'alignment', 'gender', 'has_electrokinesis', 'has_energy_constructs', 'has_mind_control_resistance','has_matter_manipulation', 'has_telepathy_resistance', 'has_mind_control', 'has_enhanced_hearing', 'has_dimensional_travel', 'has_element_control', 'has_size_changing', 'has_fire_resistance',
         'has_fire_control', 'has_dexterity', 'has_reality_warping', 'has_illusions', 'has_energy_beams', 'has_peak_human_condition', 'has_shapeshifting', 'has_heat_resistance', 'has_jump', 'has_self-sustenance', 'has_energy_absorption', 'has_cold_resistance', 'has_magic', 'has_telekinesis', 'has_toxin_and_disease_resistance', 'has_telepathy', 
          'has_regeneration', 'has_immortality', 'has_teleportation', 'has_force_fields', 'has_energy_manipulation', 'has_endurance', 'has_longevity', 'has_weapon-based_powers', 'has_energy_blasts', 'has_enhanced_senses', 'has_invulnerability', 'has_stealth', 'has_marksmanship', 'has_flight', 'has_accelerated_healing', 'has_weapons_master', 
          'has_intelligence','has_reflexes', 'has_super_speed', 'has_durability', 'has_stamina', 'has_agility', 'has_super_strength', 'sum_scores', 'sum_superpowers']
df_hc = pd.DataFrame(df_clean, columns=columns_hc)

#replace gender levels by 1 (Male) and 0 (Female) and drop NAs; every other gender value becomes NaN.
#The result is assigned back to the column: an inplace replace() on df_hc['gender'] modifies a
#temporary object under pandas' Copy-on-Write behaviour and would leave the dataframe unchanged.
df_hc['gender'] = df_hc['gender'].map({'Male': 1, 'Female': 0})
df_hc = df_hc.dropna()

#get dummy variables that replace alignment levels
df_hc = pd.get_dummies(df_hc, columns = ["alignment"])

#subsetting clean dataset to top 20 good superheroines based on overall_score
df_wgood=df_hc[(df_hc['gender']==0)&(df_hc['alignment_Good']==1)]
df_wgood_top20 = (df_wgood.sort_values(by=['overall_score'], ascending = False)).head(20)

#normalizing, clustering based on intelligence_score and strength_score
ov_int_score = whiten(df_wgood_top20[['strength_score','intelligence_score']])
mergings = linkage(ov_int_score,method='ward')
names = np.array(df_wgood_top20['name'])

#plotting
#NOTE(review): `mergings` (Ward linkage) is not passed to create_dendrogram, which computes its own linkage
#from ov_int_score -- confirm which linkage method the figure is meant to show
fig4 = ff.create_dendrogram(ov_int_score, labels=names)
fig4.update_layout(width=800, height=500, title_text='Clustering Among 20 Best Superheroines based on Intelligence and Strength')
fig4.show()

In [None]:
#Investigating hierarchical clusters
#The members of each cluster are selected by their row position (0-19) in df_wgood_top20.
#Every cluster is shown with display(), because a notebook cell only renders its last bare expression.

#cluster1
ind_list1=[4,7,8,9]
df_hc_1 = df_wgood_top20.iloc[ind_list1]
display(df_hc_1)

#sub-cluster 2.1
ind_list2_1=[3,5,6,10, 11, 13, 14, 15, 17, 18, 19]
df_hc_2_1 = df_wgood_top20.iloc[ind_list2_1]
display(df_hc_2_1)

#sub-cluster2.2
ind_list2_2=[0, 1, 2, 12, 16]
df_hc_2_2 = df_wgood_top20.iloc[ind_list2_2]
display(df_hc_2_2)

### Summary of Hierarchical Clustering
Among the top 20 good women based on overall score, we want to see the clusters based on intelligence_score and strength_score.<br>Cluster 1 has high intelligence and very high strength score, the average overall score is 20.<br>
Cluster 2.1 has very high strength and intelligence score, but the average overall score is lower than 20.<br>
Cluster 2.2 has a very high intelligence_score and a very low strength_score, yet this cluster has the highest overall score.