# Decision-Making Style Classification

## Objective
To classify users into decision-making styles (Risk-Averse, Balanced, Risk-Seeking)
using behavioral and digital indicators aligned with the Streamlit application.

# Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

# Load Datasets

In [2]:
ope_train = pd.read_csv("../data/raw/OPE_train.csv")

print("Dataset shape:", ope_train.shape)
ope_train.head()

Dataset shape: (71579, 24)


Unnamed: 0,user_id,strategy_id,gameId,roundNum,user_points,bot_points,last_didGo,last_last_didGo,last_didWin,last_last_didWin,...,positive_part,negative_part,last_round_positive_part,last_round_negative_part,last_last_round_positive_part,last_last_round_negative_part,review_score,last_review_score,last_last_review_score,didGo
0,0,3,0,1,0,0,False,False,False,False,...,"The hotel was spotless, the staff extremely he...",,,,,,10.0,,,False
1,0,3,0,2,0,0,False,False,False,False,...,silence..,"Not only did not like, I HATE it.. Today I had...","The hotel was spotless, the staff extremely he...",,,,3.0,10.0,,True
2,0,3,0,3,0,1,True,False,False,False,...,,"The staircase was extremely steep, with a very...",silence..,"Not only did not like, I HATE it.. Today I had...","The hotel was spotless, the staff extremely he...",,5.4,3.0,10.0,False
3,0,3,0,4,1,1,False,True,True,False,...,"The hotel was immaculately clean, quiet, and c...",,,"The staircase was extremely steep, with a very...",silence..,"Not only did not like, I HATE it.. Today I had...",10.0,5.4,3.0,True
4,0,3,0,5,2,2,True,False,True,True,...,Close to the station and centrally located. In...,Unfortunately too noisy for me. The rooms and ...,"The hotel was immaculately clean, quiet, and c...",,,"The staircase was extremely steep, with a very...",6.0,10.0,5.4,False


# Normalize Column Names

In [3]:
ope_train.columns = (
    ope_train.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
)

ope_train.columns

Index(['user_id', 'strategy_id', 'gameid', 'roundnum', 'user_points',
       'bot_points', 'last_didgo', 'last_last_didgo', 'last_didwin',
       'last_last_didwin', 'last_reaction_time', 'hotelgood', 'last_hotelgood',
       'last_last_hotelgood', 'positive_part', 'negative_part',
       'last_round_positive_part', 'last_round_negative_part',
       'last_last_round_positive_part', 'last_last_round_negative_part',
       'review_score', 'last_review_score', 'last_last_review_score', 'didgo'],
      dtype='object')

# Select NUMERIC Columns ONLY

In [6]:
numeric_cols = ope_train.select_dtypes(include=["int64", "float64"]).columns
numeric_cols

Index(['user_id', 'strategy_id', 'gameid', 'roundnum', 'user_points',
       'bot_points', 'last_reaction_time', 'review_score', 'last_review_score',
       'last_last_review_score'],
      dtype='object')

# Create Decision Feature DataFrame

In [7]:
decision_df = ope_train[numeric_cols].copy()
decision_df.head()

Unnamed: 0,user_id,strategy_id,gameid,roundnum,user_points,bot_points,last_reaction_time,review_score,last_review_score,last_last_review_score
0,0,3,0,1,0,0,-1,10.0,,
1,0,3,0,2,0,0,58821,3.0,10.0,
2,0,3,0,3,0,1,66297,5.4,3.0,10.0
3,0,3,0,4,1,1,34276,10.0,5.4,3.0
4,0,3,0,5,2,2,16300,6.0,10.0,5.4


# Handle Missing Values

In [8]:
decision_df = decision_df.fillna(decision_df.median())
decision_df.isnull().sum()

user_id                   0
strategy_id               0
gameid                    0
roundnum                  0
user_points               0
bot_points                0
last_reaction_time        0
review_score              0
last_review_score         0
last_last_review_score    0
dtype: int64

# Create Decision Style Label

In [9]:
target_col = decision_df.var().sort_values(ascending=False).index[0]
target_col

'last_reaction_time'

In [10]:
def decision_style(value):
    if value <= decision_df[target_col].quantile(0.33):
        return "Risk-Averse"
    elif value <= decision_df[target_col].quantile(0.66):
        return "Balanced"
    else:
        return "Risk-Seeking"

decision_df["decision_style"] = decision_df[target_col].apply(decision_style)
decision_df.head()


Unnamed: 0,user_id,strategy_id,gameid,roundnum,user_points,bot_points,last_reaction_time,review_score,last_review_score,last_last_review_score,decision_style
0,0,3,0,1,0,0,-1,10.0,9.0,9.0,Risk-Averse
1,0,3,0,2,0,0,58821,3.0,10.0,9.0,Risk-Seeking
2,0,3,0,3,0,1,66297,5.4,3.0,10.0,Risk-Seeking
3,0,3,0,4,1,1,34276,10.0,5.4,3.0,Risk-Seeking
4,0,3,0,5,2,2,16300,6.0,10.0,5.4,Risk-Seeking


# Prepare X and y

In [11]:
X = decision_df.drop("decision_style", axis=1)
y = decision_df["decision_style"]

print(X.shape, y.shape)

(71579, 10) (71579,)


# Train–Test Split

In [12]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Random Forest Model

In [13]:
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)

rf_model.fit(X_train, y_train)


# Evaluate Model

In [14]:
y_pred = rf_model.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

    Balanced       1.00      1.00      1.00      4723
 Risk-Averse       1.00      1.00      1.00      4726
Risk-Seeking       1.00      1.00      1.00      4867

    accuracy                           1.00     14316
   macro avg       1.00      1.00      1.00     14316
weighted avg       1.00      1.00      1.00     14316



In [15]:
joblib.dump(rf_model, "../models/decision_rf.pkl")
joblib.dump(X.shape[1], "../models/decision_feature_count.pkl")

['../models/decision_feature_count.pkl']