### 1. Load the data as a pandas dataframe
### 2. create a mapping of the personality types
### 3. replace text values with numerical values
### 4. get rid of the "Response Id" column (duh)
### 5. convert to a NumPy array

In [31]:
import numpy as np
import pandas as pd

# Step 1: Load the CSV, decoding the file as ISO-8859-1
df = pd.read_csv('16P.csv', encoding='ISO-8859-1')

# Step 2: Integer code assigned to each of the 16 personality-type labels
mbti_mapping = {
    'ESTJ': 0, 'ENTJ': 1, 'ESFJ': 2, 'ENFJ': 3,
    'ISTJ': 4, 'ISFJ': 5, 'INTJ': 6, 'INFJ': 7,
    'ESTP': 8, 'ESFP': 9, 'ENTP': 10, 'ENFP': 11,
    'ISTP': 12, 'ISFP': 13, 'INTP': 14, 'INFP': 15
}

# Step 3: Encode the "Personality" column with the mapping.
# Series.map() converts every value that has no key in the dict (including
# missing values) to NaN. That would silently turn the column into floats and
# carry NaN codes into the array built below, so such rows are rejected here
# with an explicit error instead.
personality_codes = df['Personality'].map(mbti_mapping)
unmapped_labels = df.loc[personality_codes.isna(), 'Personality'].unique().tolist()
if unmapped_labels:
    raise ValueError(
        f"'Personality' contains values with no entry in mbti_mapping: {unmapped_labels}"
    )
df['Personality'] = personality_codes

# Step 4: Drop the "Response Id" identifier column
df = df.drop(columns=['Response Id'])

# Step 5: Convert the remaining columns to a NumPy 2D array
data_array = df.to_numpy()

# Show the array's dimensions, then the array itself as the cell output
print(data_array.shape)

data_array

(59999, 61)


array([[ 0,  0,  0, ...,  0,  0, 11],
       [ 0,  0, -2, ..., -1,  3, 13],
       [ 0,  0,  2, ...,  2,  1,  7],
       ...,
       [ 0,  0,  1, ...,  0, -1, 12],
       [ 0,  0,  1, ...,  1,  0,  4],
       [ 0,  0,  2, ...,  0, -1,  7]], shape=(59999, 61))

### -Split the array into X and y columns
### -Split the data into Train and Test


In [32]:
from sklearn.model_selection import train_test_split

# data_array is sliced column-wise: everything except the last column becomes
# the feature matrix X, and the last column becomes the label vector y.
X, y = data_array[:, :-1], data_array[:, -1]

# Hold out 20% of the rows for testing. stratify=y asks for the label
# proportions to be preserved in both parts, and the fixed random_state makes
# the split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# Optional sanity check (not executed): to inspect how the labels are
# distributed in the training part, look at the per-label counts returned by
# np.unique(y_train, return_counts=True).


X_train shape: (47999, 60)
y_train shape: (47999,)
X_test shape: (12000, 60)
y_test shape: (12000,)


### Entropy function

In [33]:
def entropy(y):
    """Return the Shannon entropy, in bits, of the values in ``y``.

    Parameters
    ----------
    y : array-like
        Collection of labels. ``np.unique`` flattens its input, so every
        element of ``y`` is counted regardless of its shape.

    Returns
    -------
    numpy.float64
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct value; ``0.0`` when ``y`` is empty or holds a single
        distinct value.
    """
    _, counts = np.unique(y, return_counts=True)

    # Normalise by the number of elements that were actually counted.
    # len(y) only matches that number for 1-D input; for e.g. a (1, n) row
    # it is 1 and the "probabilities" would no longer sum to one.
    total = counts.sum()
    if total == 0:
        # Nothing to measure: define the entropy of an empty collection as 0.
        return np.float64(0.0)

    probs = counts / total
    # Every count is >= 1, so probs > 0 and log2 is always finite here.
    # Adding 0.0 turns the -0.0 produced for a pure collection into +0.0.
    return -np.sum(probs * np.log2(probs)) + 0.0