In [None]:
# October 29, 2023
# Author: Thomas Hsiao
# Description: Train a model to evaluate whether a character is good or bad based on features (e.g., abilities, height)
# Source Data: https://www.kaggle.com/datasets/baraazaid/superherodb

# Index
# Step 1: Data Preprocessing 
    # 1) Import Data Frame with Correct Datatypes
    # 2) Review data and remove records that are complete duplicates
    # 3) Remove columns that have >= 50% NULL values PENDING manual review check to make sure they are not critical fields planned to use for training

# Step 2: Feature Selection using Random Forest    
    # 4) Random forest and identify prioritized features
    # 5) Run the Random Forest and observe acuracy.

# Assumptions and Considerations
    # 1. Alignment (target variable) was found to have NaN records for 25% of the data. These were removed as I only wanted to consider variables that have an Alignment determination
    # 2. I removed fields from Feature consideration if they have more than 50% of the records are NaN.

# Arbitrary Stuff
    # Data considerations (e.g., what to do with fields missing a lot of values, duplicates in data, etc.)
    # Deciding which fields to include for feature consideration
    # Deciding which features to ultimately use

# Key Considerations
    # !! Know the data - Most important first step.
    # E.g.,
    # 1. does the target variable have NULL values; if so, consider what to do with these (e.g., impute or remove)
    # 2. are there complete dupes in the data? If so, remove these to avoid biasing the dataset.
    # 3. are there fields that have mainly NULL values, if so remove these from feature consideration.
    # 4. for key features you are interested in, make sure the data is standardized (e.g., credit_and_debit vs debit_and_credit, or durability vs durability_power). This should be standardized PRIOR to one-hot encoding.

    # !! Training Assumptions and considerations: 
    # 5. Sci-kit learn RandomForest can determine feature importance. Can use this to guide which features to include.
    # 6. You will need to arbitrarily determine how many features to include. 

In [180]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier # will be used for Random Forest
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# the below columns were shown to have mixed data types; to address, I set these mixed columns to strings.
dtype_dict = {
  4: str,  # Column 4
  6: str,  # Column 6
  8: str,  # Column 8
  16: str,  # Column 16
  17: str,  # Column 17
  19: str,  # Column 19
  23: str,  # Column 23
  24: str,  # Column 24
  25: str,  # Column 25
  27: str,  # Column 27
  30: str,  # Column 30
  31: str,  # Column 31
  32: str,  # Column 32
  33: str,  # Column 33
  35: str  # Column 35
}

src_df = pd.read_csv('/Users/thomashsiao/Desktop/Coding/SourceData/Superheroes.csv', dtype=dtype_dict)

In [181]:
print(len(src_df)) #28,118 records

src_df.head(10)

28118


Unnamed: 0,Alignment,Alter_Egos,Base,Character,Class_value,Collections,Combat,Creator,Durability,Equipment,...,Species,Speed,Speed_velocity,Strength,Strength_force,Super_powers,Tier,Universe,Weight,History
0,Good,,,Alucard,5471.0,,80,Konami,35,,...,Human,30,2000,30,100000,"Accelerated Healing,Agility,Berserk Mode,Blood...",2,C17S,82 kg • 180.78 lbs,
1,Good,,,Alucard,5876.0,,80,Konami,35,,...,Human,30,2000,30,100000,"Accelerated Healing,Agility,Berserk Mode,Blood...",2,C17S,82 kg • 180.78 lbs,
2,Good,,,Alucard,2586011.0,,90,Konami,100,,...,Vampire,100,1000000000,100,1000000000,"Accelerated Healing,Acrobatics,Agility,Blood M...",7,Castlevania,-,
3,Good,,,Alucard,3380098.0,,90,Konami,100,,...,Vampire,100,1000000000,100,1000000000,"Accelerated Healing,Acrobatics,Agility,Blood M...",7,Castlevania,-,
4,Good,,,Alucard,1170.0,,100,,45,,...,,50,10000,50,400000,"Accelerated Healing,Acrobatics,Agility,Berserk...",2,ML,-,
5,Good,,,Alucard,1362.0,,100,,45,,...,,50,10000,50,400000,"Accelerated Healing,Acrobatics,Agility,Berserk...",2,ML,-,
6,Neutral,,,Bayonetta,100627.0,"Anti-heroes,Gamekings,Roleplaying Collection,V...",90,Sega,75,Umbran Armor,...,Human,50,2997925,55,1000000,"Agility,Chain Manipulation,Durability,Energy A...",4,Bayonetta,90.7 kg • 199.96 lbs,
7,Neutral,"Bayonetta,Bayonetta,Bayonetta,Bayonetta,Bayonetta",,Bayonetta,171876.0,"Anti-heroes,Characters With Yellow Rating,Deat...",90,Sega,75,"Umbran Armor,Eyes Of The World",...,Human,50,2997925,55,1000000,"Agility,Chain Manipulation,Durability,Energy A...",4,Bayonetta,90.7 kg • 199.96 lbs,
8,Good,,,Adam,40.0,,40,,15,,...,Human,10,18,25,8000,"Acrobatics,Agility,Durability,Jump,Stamina,Sup...",1,The Hollow,-,
9,Good,,,Adam,44.0,,40,,15,,...,Human,10,18,25,8000,"Acrobatics,Agility,Durability,Jump,Stamina,Sup...",1,The Hollow,-,


In [182]:
# Check for complete duplicates and remove them
exact_duplicates = src_df[src_df.duplicated(keep=False)]
# manual review a few records
exact_duplicates 

# Only keep the first record for these duplicates, and use new DF object for table so don't touch the source
df = src_df.drop_duplicates(keep='first')

print(len(df)) #26,281 records after initial de-duplication

26281


In [183]:
# Review data fields looking for fields with high NA values. 
# Manually check to make sure these fields are not critical.
total_records = len(df)
missing_percentage = (df.isna().sum() / total_records) * 100
missing_percentage.sort_values(ascending=False)

Leader            98.181196
Member            94.737643
Formerly          94.737643
Equipment         93.976637
Alter_Egos        87.580381
Full_name         82.230509
Base              77.831894
Place_of_birth    77.428561
History           75.449945
Relatives         72.189034
Collections       67.988281
Occupation        61.729006
Species           37.978007
Name              31.174613
Hair_color        27.350557
Eye_color         26.943419
Gender            25.558388
Alignment         24.713671
Creator           16.270309
Super_powers      12.335908
Universe          11.932575
Intelligence       0.677295
Speed              0.677295
Weight             0.677295
Tier               0.677295
Strength_force     0.677295
Strength           0.677295
Speed_velocity     0.677295
Class_value        0.677295
IQ                 0.677295
Power              0.677295
Combat             0.677295
Omniscient         0.677295
Omnipresent        0.677295
Omnipotent         0.677295
Durability         0

In [184]:
# Get the columns with missing percentage greater than 50%
high_missing_columns = missing_percentage[missing_percentage > 50].index
# Create a new DataFrame without these columns
df = df.drop(columns=high_missing_columns)

In [185]:
# Target value is NaN for almost 25% of the records. For random forest, or classification algorithms
# I need to not have NaN values in my target variable. Remove these as I don't want to have those influence my 
# training, and there are too many to try to inmpute with any sort of accuracy.

# Get # NaN and non-NaN records with Alignment 
df[df['Alignment'].isnull()].shape[0]
df[~df['Alignment'].isnull()].shape[0]

# Remove these records for now.
df = df.dropna(subset=['Alignment'])

print(len(df))

19786


In [186]:
df.head(10)

# it appears based on how the data was aggregated in the source data, there can be multiple records for a given character.
# I can either a) just run with it as is, knowing that some characters may skew the results (e.g., if character A is in the data 10 times and might skew how a model is trained), or
# b) deduplicate, but potentially lose out on some values. 

# I'm leaning towards part B, as I don't want the model to not heavily skew towards certain characters, 
# but for now leave all records in. 

Unnamed: 0,Alignment,Character,Class_value,Combat,Creator,Durability,Eye_color,Gender,Hair_color,Height,...,Power,Species,Speed,Speed_velocity,Strength,Strength_force,Super_powers,Tier,Universe,Weight
0,Good,Alucard,5471.0,80,Konami,35,Red,Male,Blond,"191 cm • 6'3.2""",...,60,Human,30,2000,30,100000,"Accelerated Healing,Agility,Berserk Mode,Blood...",2,C17S,82 kg • 180.78 lbs
1,Good,Alucard,5876.0,80,Konami,35,Red,Male,Blond,"191 cm • 6'3.2""",...,60,Human,30,2000,30,100000,"Accelerated Healing,Agility,Berserk Mode,Blood...",2,C17S,82 kg • 180.78 lbs
2,Good,Alucard,2586011.0,90,Konami,100,Gold,Male,White,-,...,100,Vampire,100,1000000000,100,1000000000,"Accelerated Healing,Acrobatics,Agility,Blood M...",7,Castlevania,-
3,Good,Alucard,3380098.0,90,Konami,100,Gold,Male,White,-,...,100,Vampire,100,1000000000,100,1000000000,"Accelerated Healing,Acrobatics,Agility,Blood M...",7,Castlevania,-
4,Good,Alucard,1170.0,100,,45,Blue,Male,Blond,-,...,80,,50,10000,50,400000,"Accelerated Healing,Acrobatics,Agility,Berserk...",2,ML,-
5,Good,Alucard,1362.0,100,,45,Blue,Male,Blond,-,...,80,,50,10000,50,400000,"Accelerated Healing,Acrobatics,Agility,Berserk...",2,ML,-
6,Neutral,Bayonetta,100627.0,90,Sega,75,Grey,Female,Black,"231.1 cm • 7'7""",...,100,Human,50,2997925,55,1000000,"Agility,Chain Manipulation,Durability,Energy A...",4,Bayonetta,90.7 kg • 199.96 lbs
7,Neutral,Bayonetta,171876.0,90,Sega,75,Grey,Female,Black,"231.1 cm • 7'7""",...,100,Human,50,2997925,55,1000000,"Agility,Chain Manipulation,Durability,Energy A...",4,Bayonetta,90.7 kg • 199.96 lbs
8,Good,Adam,40.0,40,,15,Black,Male,Brown / Black,-,...,35,Human,10,18,25,8000,"Acrobatics,Agility,Durability,Jump,Stamina,Sup...",1,The Hollow,-
9,Good,Adam,44.0,40,,15,Black,Male,Brown / Black,-,...,35,Human,10,18,25,8000,"Acrobatics,Agility,Durability,Jump,Stamina,Sup...",1,The Hollow,-


In [187]:
df.columns

Index(['Alignment', 'Character', 'Class_value', 'Combat', 'Creator',
       'Durability', 'Eye_color', 'Gender', 'Hair_color', 'Height', 'IQ',
       'Intelligence', 'Level', 'Name', 'Omnipotent', 'Omnipresent',
       'Omniscient', 'Power', 'Species', 'Speed', 'Speed_velocity', 'Strength',
       'Strength_force', 'Super_powers', 'Tier', 'Universe', 'Weight'],
      dtype='object')

In [193]:
df_encoded['Super_powers'] = df_encoded['Super_powers'].str.split(',', expand=True).apply(lambda x: ','.join([str(i) for i in x if isinstance(i, str)][:10]), axis=1)
dummies = df_encoded['Super_powers'].str.get_dummies(',')
dummies.columns = dummies.columns + '_power'
df_encoded = df_encoded.join(dummies, rsuffix='_power')
df_encoded = df_encoded.drop('Super_powers', axis=1)

In [195]:
# consider any field with "_power" (which was one-hot encoded for the first 10 abilities in each record in "super_power" field) for feature consideration.
# this has an assumption that the most important abilities are listed first, vs alphabetically.
# if alphabetically, then it will be skewed towards powers that start with A, vs powers truly centric to the character
# there also should be cleaning on the ability names, e.g., sometime it could be listed as "durability" and others as "durability power". These should be considered the same, but currently th

# set target variable
y = df_encoded['Alignment']

# Filter columns containing "_power"
power_columns = [col for col in df_encoded.columns if "_power" in col]
x = df_encoded[power_columns]

# Initialize the model
rf_model = RandomForestClassifier()

# Fit the model to data
rf_model.fit(x, y)

# Get feature importances
feature_importances = rf_model.feature_importances_


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [197]:
# VISUALIZE FEATURE IMPORTANCE

# Create a DataFrame for better visualization
feature_importance_df = pd.DataFrame({'Feature': x.columns, 'Importance': feature_importances})
# Sort by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

feature_importance_df

Unnamed: 0,Feature,Importance
581,Endurance_power_power,8.588901e-03
126,Endurance_power,8.512615e-03
564,Durability_power_power,8.468857e-03
109,Durability_power,8.346004e-03
552,Dexterity_power_power,7.065431e-03
...,...,...
281,Paradox Manipulation_power,8.124731e-08
736,Paradox Manipulation_power_power,0.000000e+00
427,Vision - Cryo_power,0.000000e+00
822,Spatial Communication_power_power,0.000000e+00


In [199]:
top_features = feature_importance_df['Feature'][:10].tolist()

x_selected = x[top_features]

x_selected

Unnamed: 0,Endurance_power_power,Endurance_power,Durability_power_power,Durability_power,Dexterity_power_power,Dexterity_power,Acrobatics_power_power,Accelerated Healing_power_power,Agility_power,Accelerated Healing_power
0,1,1,1,1,1,1,0,1,1,1
1,1,1,1,1,1,1,0,1,1,1
2,0,0,1,1,1,1,1,1,1,1
3,0,0,1,1,1,1,1,1,1,1
4,0,0,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...
27934,0,0,1,1,0,0,0,0,1,0
27936,0,0,0,0,0,0,0,0,0,0
27937,0,0,0,0,0,0,0,0,1,0
27938,0,0,1,1,0,0,0,0,1,0


In [200]:
# TRAIN AND EVAL RANDOM FOREST
x_train, x_test, y_train, y_test = train_test_split(x_selected, y, test_size=0.2, random_state=42)

rf_model_selected = RandomForestClassifier()
rf_model_selected.fit(x_train, y_train)

accuracy = rf_model_selected.score(x_test, y_test)
print(f'Accuracy: {accuracy}')

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Accuracy: 0.4967155128852956


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [None]:
import os

# Get the path to the desktop.
desktop_path = os.path.expanduser('~/Desktop')

# Join the relative path with the path to the desktop.
csv_path = os.path.join(desktop_path, 'my_data.csv')

# Save the DataFrame to a CSV file.
df_encoded.to_csv(csv_path, index=False)