In [1]:
!pip install --upgrade protobuf==4.25.*
!pip install -U bitsandbytes accelerate
!pip install --upgrade polars
!pip install scikit-multilearn

Collecting protobuf==4.25.*
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 6.33.0
    Uninstalling protobuf-6.33.0:
      Successfully uninstalled protobuf-6.33.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
opentelemetry-proto 1.37.0 requires protobuf<7.0,>=5.0, but you have protobuf 4.25.8 which is incompatible.
a2a-sdk 0.3.10 requires protobuf>=5.29.5, but you have protobuf 4.25.8 which is incompati

In [2]:
import polars as pl
import os
import re
import torch
import json 
import gc
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, mean_absolute_error, confusion_matrix
from skmultilearn.model_selection import IterativeStratification
from scipy.stats import pearsonr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from sklearn.model_selection import train_test_split
from collections import Counter

torch._dynamo.config.cache_size_limit = 64

In [3]:
# ==========================================
# 1. SETUP & MODEL LOADING (QUANTIZED)
# ==========================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model_name = "/kaggle/input/qwen-3/transformers/8b/1" 

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
).eval()

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

Using device: cuda


2025-12-07 17:35:48.931732: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765128949.296166      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765128949.409046      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
# ==========================================
# 2. DATA LOADING
# ==========================================

# --- 1. Essay Big5 (OCEAN) ---
try:
    essay_paths = [
        "/kaggle/input/essays-big5/essays-big5/test-00000-of-00001.parquet",
        "/kaggle/input/essays-big5/essays-big5/train-00000-of-00001.parquet",
        "/kaggle/input/essays-big5/essays-big5/validation-00000-of-00001.parquet"
    ]
    # Фильтруем существующие пути
    valid_paths = [p for p in essay_paths if os.path.exists(p)]
    
    if valid_paths:
        essay_big5_df = pl.concat([pl.read_parquet(p) for p in valid_paths])
        essay_big5_df = essay_big5_df.drop("ptype")
        
        # Cast labels
        essay_big5_df = essay_big5_df.with_columns([
            pl.col("O").cast(pl.Int64), pl.col("C").cast(pl.Int64), pl.col("E").cast(pl.Int64),
            pl.col("A").cast(pl.Int64), pl.col("N").cast(pl.Int64),
        ])

        # Stratification logic
        X = np.zeros((len(essay_big5_df), 1))
        y = essay_big5_df[["O", "C", "E", "A", "N"]].to_numpy()

        stratifier = IterativeStratification(n_splits=2, order=1, sample_distribution_per_fold=[0.324, 0.676])
        train_idx, sample_idx = next(stratifier.split(X, y))
        
        eb5_sample = essay_big5_df[sample_idx]
        print(f"Big5 Sample shape: {eb5_sample.shape}")
    else:
        print("Warning: Big5 files not found.")
        eb5_sample = pl.DataFrame()
except Exception as e:
    print(f"Error loading Big5: {e}")
    eb5_sample = pl.DataFrame()


# --- 2. MBTI ---
try:
    mbti_path = "/kaggle/input/mbti-type/mbti_1.csv"
    if os.path.exists(mbti_path):
        mbti_df = pl.read_csv(mbti_path)
        y_mbti = mbti_df["type"].to_list()
        
        mbti_train_pd, mbti_sample_pd = train_test_split(
            mbti_df.to_pandas(),
            test_size=0.0922,
            stratify=y_mbti,
            random_state=42
        )
        mbti_sample = pl.from_pandas(mbti_sample_pd)
        print(f"MBTI Sample shape: {mbti_sample.shape}")
    else:
        print("Warning: MBTI file not found.")
        mbti_sample = pl.DataFrame()
except Exception as e:
    print(f"Error loading MBTI: {e}")
    mbti_sample = pl.DataFrame()


# --- 3. Personae ---
records = []
folder = "/kaggle/input/personae-corpus/PersonaeCorpus/data"
if os.path.exists(folder):
    for f in os.listdir(folder):
        parts = f.split(".")
        if len(parts) >= 3:
            try:
                with open(os.path.join(folder, f), "r", encoding="utf-8", errors='ignore') as ft:
                    records.append({
                        "id": parts[0], "gender": parts[1], "mbti": parts[2], "text": ft.read()
                    })
            except Exception:
                continue
    personae_df = pl.DataFrame(records)
    print(f"Personae loaded rows: {len(personae_df)}")
else:
    print("Warning: Personae folder not found.")
    personae_df = pl.DataFrame({"id": [], "gender": [], "mbti": [], "text": []})

# Очистка памяти (удаляем только то, что загружали)
# essay_big5_df удаляем, т.к. создали eb5_sample
if 'essay_big5_df' in locals(): del essay_big5_df
if 'mbti_df' in locals(): del mbti_df
gc.collect()

Big5 Sample shape: (800, 7)
MBTI Sample shape: (800, 2)
Personae loaded rows: 145


152

In [5]:
# ==========================================
# 3. INFERENCE ENGINE
# ==========================================
def run_inference(df, prompt_fn, parse_fn, batch_size=8, max_rows=None):
    if df.is_empty():
        print("Dataframe is empty, skipping inference.")
        return pl.DataFrame()

    if max_rows:
        df = df.head(max_rows)

    results = []
    prompts = []

    print(f"Building prompts for {len(df)} rows...")
    for row in df.iter_rows(named=True):
        user_content = prompt_fn(row)
        messages = [{"role": "user", "content": user_content}]
        rendered = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        prompts.append(rendered)

    print("Starting generation...")
    for i in range(0, len(prompts), batch_size):
        batch_prompts = prompts[i:i+batch_size]
        batch_rows = df[i:i+batch_size].iter_rows(named=True)

        inputs = tokenizer(
            batch_prompts, return_tensors="pt", padding=True, truncation=True, max_length=2048
        ).to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs, max_new_tokens=60, pad_token_id=tokenizer.eos_token_id, do_sample=False, use_cache=True
            )

        input_len = inputs.input_ids.shape[1]
        generated = outputs[:, input_len:]
        replies = tokenizer.batch_decode(generated, skip_special_tokens=True)

        for row, reply in zip(batch_rows, replies):
            parsed_data = parse_fn(reply, row)
            results.append(parsed_data)

    return pl.DataFrame(results)

# Глобальный список для метрик
all_metrics_data = []

def calculate_metrics(y_true, y_pred, task_name, dataset_name):
    valid_data = [
        (t, p) for t, p in zip(y_true, y_pred) 
        if p is not None and p != "XXXX" and p != -1 and p != "unknown"
    ]

    if not valid_data:
        print(f"[{dataset_name} - {task_name}] No valid predictions.")
        return

    y_true_clean = [x[0] for x in valid_data]
    y_pred_clean = [x[1] for x in valid_data]
    n = len(y_true_clean)

    print(f"\n--- Metrics for {dataset_name}: {task_name} (N={n}) ---")

    # Проверяем MBTI (строка 4 буквы)
    is_mbti_task = isinstance(y_true_clean[0], str) and len(y_true_clean[0]) == 4 and any(c in "IE" for c in y_true_clean[0])

    if is_mbti_task:
        acc = accuracy_score(y_true_clean, y_pred_clean)
        _, _, f1_macro, _ = precision_recall_fscore_support(y_true_clean, y_pred_clean, average='macro', zero_division=0)

        print(f"Exact Match Accuracy: {acc:.2%}")
        print(f"Macro F1-Score:       {f1_macro:.2%}")

        axes = ["(I)E", "(N)S", "(T)F", "(J)P"]
        axis_scores = []
        total_letters_correct = 0
        
        for i in range(4):
            correct_count = sum(1 for t, p in zip(y_true_clean, y_pred_clean) if t[i] == p[i])
            axis_acc = correct_count / n
            axis_scores.append(axis_acc)
            total_letters_correct += correct_count
            print(f"Axis {axes[i]} Accuracy:       {axis_acc:.2%}")

        avg_letters = total_letters_correct / n
        print(f"Avg Letters Correct:  {avg_letters:.2f} / 4.00")

        all_metrics_data.append({
            "Dataset": dataset_name, "Task": task_name,
            "Accuracy": acc, "F1_Macro": f1_macro,
            "Axis_IE": axis_scores[0], "Axis_NS": axis_scores[1],
            "Axis_TF": axis_scores[2], "Axis_JP": axis_scores[3],
            "Avg_Letters": avg_letters
        })
    else:
        # Big 5 / Gender
        acc = accuracy_score(y_true_clean, y_pred_clean)
        avg_method = 'binary' if len(set(y_true_clean)) <= 2 and isinstance(y_true_clean[0], int) else 'weighted'
        prec, rec, f1, _ = precision_recall_fscore_support(y_true_clean, y_pred_clean, average=avg_method, zero_division=0)
        
        print(f"Accuracy:  {acc:.2%}")
        print(f"F1-Score:  {f1:.2%} ({avg_method})")

        mae, pearson_corr = None, None
        
        if isinstance(y_true_clean[0], (int, float, np.number)):
            mae = mean_absolute_error(y_true_clean, y_pred_clean)
            if len(set(y_true_clean)) > 1 and len(set(y_pred_clean)) > 1:
                pearson_corr, _ = pearsonr(y_true_clean, y_pred_clean)
                print(f"Pearson Corr: {pearson_corr:.4f}")
            print(f"MAE:       {mae:.4f}")

        all_metrics_data.append({
            "Dataset": dataset_name, "Task": task_name,
            "Accuracy": acc, "F1": f1, "Precision": prec, "Recall": rec,
            "MAE": mae, "Pearson": pearson_corr
        })

    print("-" * 30)

In [6]:
# ==========================================
# 4. PROMPTS & PARSERS DEFINITIONS
# ==========================================

# --- Big5 ---
def big5_prompt(row):
    return f"""Analyze the text and predict the Big Five personality traits.
Traits: O (Openness), C (Conscientiousness), E (Extraversion), A (Agreeableness), N (Neuroticism).
Return a valid JSON object with keys "O", "C", "E", "A", "N" and values 0 (Low) or 1 (High).
Example: {{ "O": 1, "C": 0, "E": 1, "A": 1, "N": 0 }}
Text: {row['text'][:2000]}"""

def big5_parse(reply, row):
    preds = {"O": -1, "C": -1, "E": -1, "A": -1, "N": -1}
    try:
        clean_json = re.sub(r"```json|```", "", reply).strip()
        match_json = re.search(r"\{.*\}", clean_json, re.DOTALL)
        if match_json:
            data = json.loads(match_json.group(0))
            for key in preds.keys():
                if key in data: preds[key] = int(data[key])
            return {
                "O_true": int(row["O"]), "C_true": int(row["C"]), "E_true": int(row["E"]),
                "A_true": int(row["A"]), "N_true": int(row["N"]),
                "O_pred": preds["O"], "C_pred": preds["C"], "E_pred": preds["E"],
                "A_pred": preds["A"], "N_pred": preds["N"],
            }
    except:
        pass
    
    # Fallback regex
    pattern = re.compile(r"([OCEAN])[a-z]*\s*[:=\-]\s*(0|1|high|low)", re.IGNORECASE)
    matches = pattern.findall(reply)
    for char, val_str in matches:
        val = 1 if ('1' in val_str or 'high' in val_str.lower()) else 0
        preds[char.upper()] = val

    for k in preds:
        if preds[k] == -1: preds[k] = 0 # Default safe value

    return {
        "O_true": int(row["O"]), "C_true": int(row["C"]), "E_true": int(row["E"]),
        "A_true": int(row["A"]), "N_true": int(row["N"]),
        "O_pred": preds["O"], "C_pred": preds["C"], "E_pred": preds["E"],
        "A_pred": preds["A"], "N_pred": preds["N"],
    }

# --- MBTI ---
def mbti_prompt(row):
    text_snippet = row['posts'][:3000]
    return f"""Analyze the text data provided in the previous message to determine the author's MBTI type.

Evaluate based on these 4 dimensions:
1. (E) Extraversion vs (I) Introversion
2. (S) Sensing vs (N) Intuition
3. (T) Thinking vs (F) Feeling
4. (J) Judging vs (P) Perceiving

Return a JSON object: {{ "predicted_type": "INTJ" }}
Text: "{text_snippet}" """

def mbti_parse(reply, row):
    try:
        clean_reply = reply.replace("```json", "").replace("```", "").strip()
        json_match = re.search(r"\{.*\}", clean_reply, re.DOTALL)
        if json_match:
            data = json.loads(json_match.group(0))
            pred = data.get("predicted_type", "XXXX").upper()
            return {"type_true": row["type"], "type_pred": pred}
        
        strict_match = re.search(r"Type:?\s*\*?\*?([IE][NS][TF][JP])", reply, re.IGNORECASE)
        if strict_match:
            return {"type_true": row["type"], "type_pred": strict_match.group(1).upper()}
            
        simple_match = re.search(r"\b([IE][NS][TF][JP])\b", reply.upper())
        if simple_match:
             return {"type_true": row["type"], "type_pred": simple_match.group(1)}
    except:
        pass
    return {"type_true": row["type"], "type_pred": "XXXX"}

# --- Personae ---
def personae_prompt(row):
    return f"""Analyze the text to predict Gender and MBTI.
Return JSON: {{ "gender": "Female", "mbti": "INFP" }}
Text: {row['text'][:2000]}"""

def personae_parse(reply, row):
    pred_gender, pred_mbti = "unknown", "XXXX"
    try:
        clean_json = re.sub(r"```json|```", "", reply).strip()
        match_json = re.search(r"\{.*\}", clean_json, re.DOTALL)
        if match_json:
            data = json.loads(match_json.group(0))
            if "gender" in data:
                g = str(data["gender"]).strip().lower()
                if "fem" in g: pred_gender = "female"
                elif "mal" in g: pred_gender = "male"
            if "mbti" in data:
                m = str(data["mbti"]).strip().upper()
                if re.match(r"^[IE][NS][TF][JP]$", m): pred_mbti = m
    except:
        pass

    # Regex fallback
    if pred_gender == "unknown":
        g_match = re.search(r"Gender\s*[:=\-]\s*(Male|Female)", reply, re.IGNORECASE)
        if g_match: pred_gender = g_match.group(1).lower()

    if pred_mbti == "XXXX":
        m_match = re.search(r"\b([IE][NS][TF][JP])\b", reply.upper())
        if m_match: pred_mbti = m_match.group(1)

    return {
        "gender_true": row["gender"].lower().strip(), "gender_pred": pred_gender,
        "mbti_true": row["mbti"], "mbti_pred": pred_mbti
    }

In [None]:
# ==========================================
# 5. EXECUTION PIPELINE
# ==========================================

# 1. Run Big5
print("\n=== RUNNING BIG5 INFERENCE ===")
big5_res = run_inference(eb5_sample, big5_prompt, big5_parse)
if not big5_res.is_empty():
    big5_res.write_csv("results_essay_big5.csv")
    for trait in ["O", "C", "E", "A", "N"]:
        calculate_metrics(big5_res[f"{trait}_true"].to_list(), big5_res[f"{trait}_pred"].to_list(), f"Big5_{trait}", "Big5_Essays")

# 2. Run MBTI
print("\n=== RUNNING MBTI INFERENCE ===")
mbti_res = run_inference(mbti_sample, mbti_prompt, mbti_parse)
if not mbti_res.is_empty():
    mbti_res.write_csv("results_mbti.csv")
    calculate_metrics(mbti_res["type_true"].to_list(), mbti_res["type_pred"].to_list(), "MBTI Type", "MBTI_Dataset")

# 3. Run Personae
print("\n=== RUNNING PERSONAE INFERENCE ===")
personae_sample = personae_df.head(50) if len(personae_df) > 50 else personae_df
personae_res = run_inference(personae_sample, personae_prompt, personae_parse)

if not personae_res.is_empty():
    personae_res.write_csv("results_personae.csv")
    calculate_metrics(personae_res["gender_true"].to_list(), personae_res["gender_pred"].to_list(), "Gender", "Personae")
    calculate_metrics(personae_res["mbti_true"].to_list(), personae_res["mbti_pred"].to_list(), "MBTI", "Personae")

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



=== RUNNING BIG5 INFERENCE ===
Building prompts for 800 rows...
Starting generation...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p'


--- Metrics for Big5_Essays: Big5_O (N=800) ---
Accuracy:  48.38%
F1-Score:  9.23% (binary)
Pearson Corr: -0.0127
MAE:       0.5162
------------------------------

--- Metrics for Big5_Essays: Big5_C (N=800) ---
Accuracy:  49.50%
F1-Score:  2.42% (binary)
Pearson Corr: 0.0236
MAE:       0.5050
------------------------------

--- Metrics for Big5_Essays: Big5_E (N=800) ---
Accuracy:  49.12%
F1-Score:  4.68% (binary)
Pearson Corr: 0.0529
MAE:       0.5088
------------------------------

--- Metrics for Big5_Essays: Big5_A (N=800) ---
Accuracy:  47.88%
F1-Score:  7.54% (binary)
Pearson Corr: 0.0373
MAE:       0.5212
------------------------------

--- Metrics for Big5_Essays: Big5_N (N=800) ---
Accuracy:  49.50%
F1-Score:  9.42% (binary)
Pearson Corr: -0.0260
MAE:       0.5050
------------------------------

=== RUNNING MBTI INFERENCE ===
Building prompts for 800 rows...
Starting generation...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p'


--- Metrics for MBTI_Dataset: MBTI Type (N=138) ---
Exact Match Accuracy: 41.30%
Macro F1-Score:       26.36%
Axis (I)E Accuracy:       85.51%
Axis (N)S Accuracy:       84.78%
Axis (T)F Accuracy:       71.74%
Axis (J)P Accuracy:       67.39%
Avg Letters Correct:  3.09 / 4.00
------------------------------

=== RUNNING PERSONAE INFERENCE ===
Building prompts for 50 rows...
Starting generation...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



--- Metrics for Personae: Gender (N=20) ---
Accuracy:  60.00%
F1-Score:  45.00% (weighted)
------------------------------

--- Metrics for Personae: MBTI (N=20) ---
Exact Match Accuracy: 0.00%
Macro F1-Score:       0.00%
Axis (I)E Accuracy:       50.00%
Axis (N)S Accuracy:       55.00%
Axis (T)F Accuracy:       55.00%
Axis (J)P Accuracy:       20.00%
Avg Letters Correct:  1.80 / 4.00
------------------------------


In [8]:
# ==========================================
# 6. SAVE FINAL METRICS
# ==========================================
print("\n=== SAVING METRICS SUMMARY ===")
if all_metrics_data:
    metrics_df = pl.DataFrame(all_metrics_data)
    
    # Округляем числовые колонки
    numeric_cols = [c for c in metrics_df.columns if metrics_df[c].dtype in [pl.Float64, pl.Float32]]
    metrics_df = metrics_df.with_columns([pl.col(c).round(4) for c in numeric_cols])

    print(metrics_df)
    metrics_df.write_csv("all_metrics_summary.csv")
    print("Metrics saved successfully.")
else:
    print("No metrics collected.")


=== SAVING METRICS SUMMARY ===
shape: (8, 14)
┌──────────────┬───────────┬──────────┬────────┬───┬─────────┬─────────┬─────────┬─────────────┐
│ Dataset      ┆ Task      ┆ Accuracy ┆ F1     ┆ … ┆ Axis_NS ┆ Axis_TF ┆ Axis_JP ┆ Avg_Letters │
│ ---          ┆ ---       ┆ ---      ┆ ---    ┆   ┆ ---     ┆ ---     ┆ ---     ┆ ---         │
│ str          ┆ str       ┆ f64      ┆ f64    ┆   ┆ f64     ┆ f64     ┆ f64     ┆ f64         │
╞══════════════╪═══════════╪══════════╪════════╪═══╪═════════╪═════════╪═════════╪═════════════╡
│ Big5_Essays  ┆ Big5_O    ┆ 0.4838   ┆ 0.0923 ┆ … ┆ null    ┆ null    ┆ null    ┆ null        │
│ Big5_Essays  ┆ Big5_C    ┆ 0.495    ┆ 0.0242 ┆ … ┆ null    ┆ null    ┆ null    ┆ null        │
│ Big5_Essays  ┆ Big5_E    ┆ 0.4912   ┆ 0.0468 ┆ … ┆ null    ┆ null    ┆ null    ┆ null        │
│ Big5_Essays  ┆ Big5_A    ┆ 0.4788   ┆ 0.0754 ┆ … ┆ null    ┆ null    ┆ null    ┆ null        │
│ Big5_Essays  ┆ Big5_N    ┆ 0.495    ┆ 0.0942 ┆ … ┆ null    ┆ null    ┆ null   