In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import InputLayer
from sklearn.model_selection import train_test_split
import keras_tuner as kt

In [4]:
def load_data():
    train_data = pd.read_csv('train.csv')
    return train_data
train_data = load_data()
target_data = train_data["Personality"].copy()
training_data = train_data.drop(columns=["Personality","id"])

training_data['Socializing_effect'] = training_data['Social_event_attendance'] + training_data['Going_outside'] 
training_data['probability_of_having_friends'] = training_data['Socializing_effect'] / (training_data['Socializing_effect'].max() + 1e-5)
training_data['prob_of_going_outside'] = training_data['Going_outside'] / (training_data['Going_outside'].max() + 1e-5)
training_data['online_presence'] = training_data['Post_frequency'] * training_data['Friends_circle_size']

num_attributes = training_data.select_dtypes(include=[np.number]).columns.tolist()
cat_attributes = training_data.select_dtypes(exclude=[np.number]).columns.tolist()


corr_matrix = training_data.select_dtypes(include=[np.number]).corr()
train_data.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert


In [6]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean',missing_values=np.nan)),
    ('scaler', StandardScaler()),])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent', missing_values=np.nan)),
    ('one_hot_encoding', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

pre_processor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_attributes),
    ('cat', cat_pipeline, cat_attributes),
])

pre_processor.fit(training_data)
training_data = pre_processor.transform(training_data)

In [None]:
X_train, X_val, y_train, y_val = train_test_split(
    training_data, target_data, test_size=0.2,random_state=42,stratify=target_data)

print("Train shape:", X_train.shape)
print(type(X_train))

Train shape: (14819, 13)
<class 'numpy.ndarray'>


In [None]:

metrics = [
    tf.keras.metrics.CategoricalAccuracy(name="accuracy"),
    tf.keras.metrics.Precision(name="precision"),
    tf.keras.metrics.Recall(name="recall") ]


def build_model(hp):
    n_layers = hp.Int("no of layers", 1, 3, default=2)
    n_neurons = hp.Int("no of neurons", 32, 512, step=32, default=128)
    learning_rate = hp.Float("learning rate", 1e-4, 1e-2, sampling='log', default=1e-3)
    optimizer = hp.Choice("optimizer", ['adam', 'sgd'], default='adam')
    
    if optimizer == 'adam':
        opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    else:
        opt = tf.keras.optimizers.SGD(learning_rate=learning_rate)
        
    model = tf.keras.Sequential()
    model.add(InputLayer(shape=(X_train.shape[1],)))
    for _ in range(n_layers):
        model.add(Dense(n_neurons, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=metrics)
    
    return model 

# Encode target labels to 0/1
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_val_encoded = le.transform(y_val)

tuner = kt.RandomSearch(build_model, max_trials=5, objective='val_accuracy', overwrite=True, project_name='personality_tuning', seed=42)
tuner.search(X_train, y_train_encoded, epochs=10, validation_data=(X_val, y_val_encoded), batch_size=32)
top_parameters = tuner.get_best_hyperparameters()[0]
print("Best hyperparameters:", top_parameters.values)

Trial 6 Complete [00h 00m 48s]
val_accuracy: 0.28502023220062256

Best val_accuracy So Far: 0.28502023220062256
Total elapsed time: 00h 04m 21s

Search: Running Trial #7

Value             |Best Value So Far |Hyperparameter
1                 |2                 |no of layers
64                |32                |no of neurons
0.0030433         |0.00065625        |learning rate
sgd               |adam              |optimizer

Epoch 1/10
[1m464/464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.2515 - loss: 0.2716 - precision: 0.6552 - recall: 0.8517 - val_accuracy: 0.2850 - val_loss: 0.1749 - val_precision: 0.9376 - val_recall: 0.9492
Epoch 2/10
[1m464/464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.2543 - loss: 0.1718 - precision: 0.9508 - recall: 0.9355 - val_accuracy: 0.2850 - val_loss: 0.1564 - val_precision: 0.9405 - val_recall: 0.9492
Epoch 3/10
[1m464/464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/ste

KeyboardInterrupt: 