In [None]:
import polars as pl
import os
import re
import asyncio
import json
import numpy as np
import gc
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, mean_absolute_error
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import IterativeStratification
from tenacity import retry, stop_after_attempt, wait_exponential
from openai import AsyncOpenAI

import nest_asyncio
nest_asyncio.apply()

# ==========================================
# 1. DEEPSEEK CONFIGURATION
# ==========================================

# The key is read from the environment only: a key literal stored in the
# notebook would be leaked with every share/export of the file.
API_KEY = os.getenv("NOVITA_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "NOVITA_API_KEY is not set. Set the NOVITA_API_KEY environment variable before running this notebook."
    )
BASE_URL = "https://api.novita.ai/openai"
MODEL_NAME = "deepseek/deepseek-v3.2"

# OpenAI-compatible async client pointed at BASE_URL.
client = AsyncOpenAI(
    api_key=API_KEY,
    base_url=BASE_URL,
)

# Pause between consecutive requests (to stay under the rate limits)
DELAY_SECONDS = 10
print(f"Using Model: {MODEL_NAME}")

Using Model: deepseek/deepseek-v3.2


In [2]:
# ==========================================
# 2. DATA PREPARATION
# ==========================================

# --- 2.1. Essay Big5 (OCEAN) ---
print("\n--- Loading Essay Big5 ---")
try:
    essay_paths = [
        "/kaggle/input/essays-big5/essays-big5/test-00000-of-00001.parquet",
        "/kaggle/input/essays-big5/essays-big5/train-00000-of-00001.parquet",
        "/kaggle/input/essays-big5/essays-big5/validation-00000-of-00001.parquet"
    ]
    valid_paths = [p for p in essay_paths if os.path.exists(p)]

    if valid_paths:
        full_big5 = pl.concat([pl.read_parquet(p) for p in valid_paths])
        # Drop the extra column only when it is really there; an unconditional
        # drop raises on a missing column and would discard the whole dataset
        # through the broad except below.
        if "ptype" in full_big5.columns:
            full_big5 = full_big5.drop("ptype")

        # Cast the trait columns to Int so they can be used for stratification
        trait_cols = ["O", "C", "E", "A", "N"]
        full_big5 = full_big5.with_columns([pl.col(c).cast(pl.Int64) for c in trait_cols])

        # Sample ~800 rows while preserving the joint label distribution
        # (IterativeStratification).
        total_rows = len(full_big5)
        target_rows = 800

        if total_rows <= target_rows:
            # Nothing to subsample: the requested share would be >= 1, which is
            # not a meaningful fold distribution. Use every row.
            big5_sample = full_big5
        else:
            # Share of the data needed to end up with ~800 rows
            test_ratio = target_rows / total_rows

            X = np.zeros((total_rows, 1))  # Dummy feature; only the labels drive the split
            y = full_big5[trait_cols].to_numpy()

            # Split into 2 parts: the unused remainder and the ~800-row sample
            stratifier = IterativeStratification(n_splits=2, order=1, sample_distribution_per_fold=[test_ratio, 1.0 - test_ratio])

            # split() yields a pair of index arrays; the second one is kept as the
            # sample (the recorded run shows it holds 800 of 2467 rows).
            _, sample_idx = next(stratifier.split(X, y))

            big5_sample = full_big5[sample_idx]
        print(f"Big5 Loaded Total: {total_rows}, Sampled: {len(big5_sample)} (Target ~800)")
    else:
        print("Error: Big5 files not found.")
        big5_sample = pl.DataFrame()

except Exception as e:
    # Best effort: any loading/sampling failure leaves an empty frame so that
    # downstream cells can skip this dataset.
    print(f"Error loading Big5: {e}")
    big5_sample = pl.DataFrame()

# --- 2.2. Personae ---
print("\n--- Loading Personae ---")
records = []
folder = "/kaggle/input/personae-corpus/PersonaeCorpus/data"
if os.path.exists(folder):
    # os.listdir returns entries in arbitrary order; sorting keeps the row order
    # (and therefore the result files) reproducible between runs.
    for f in sorted(os.listdir(folder)):
        # File names are expected to look like "<id>.<gender>.<mbti>.<ext>";
        # anything with fewer than three dot-separated parts is ignored.
        parts = f.split(".")
        if len(parts) >= 3:
            try:
                with open(os.path.join(folder, f), "r", encoding="utf-8", errors='ignore') as ft:
                    records.append({
                        "id": parts[0],
                        "gender": parts[1],
                        "mbti": parts[2],
                        "text": ft.read()
                    })
            except Exception as exc:
                # Still best effort, but report what was skipped instead of
                # silently shrinking the dataset.
                print(f"Warning: skipped {f}: {exc}")
                continue
    personae_df = pl.DataFrame(records)
    print(f"Personae Loaded: {len(personae_df)} rows")
else:
    print("Warning: Personae folder not found.")
    personae_df = pl.DataFrame()

gc.collect()


--- Loading Essay Big5 ---
Big5 Loaded Total: 2467, Sampled: 800 (Target ~800)

--- Loading Personae ---
Personae Loaded: 145 rows


0

In [None]:
# ==========================================
# 3. ASYNC INFERENCE ENGINE
# ==========================================

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def get_llm_response(messages_list):
    """Send one chat-completion request and return the text of the first choice.

    The call is wrapped by the ``retry`` decorator above (at most 3 attempts,
    exponential wait bounded to 4-10 seconds). Every failed attempt is printed
    and re-raised so the decorator can decide whether to try again.

    Args:
        messages_list: Chat messages passed as the ``messages`` argument.

    Returns:
        ``message.content`` of the first returned choice.
    """
    request_kwargs = dict(
        model=MODEL_NAME,
        messages=messages_list,
        max_tokens=1024,
        # Temperature 0 for more deterministic classification
        temperature=0.0,
        response_format={"type": "json_object"},
    )
    try:
        completion = await client.chat.completions.create(**request_kwargs)
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as e:
        print(f"Request Failed: {e}")
        raise e

async def process_row(row, instruction_fn, parse_fn, max_chars=3000):
    """Run one dataset row through the LLM and parse the reply.

    Args:
        row: Mapping for one record; the text is taken from ``row['text']`` and,
            when that is missing or empty, from ``row['posts']``.
        instruction_fn: Zero-argument callable returning the instruction prompt.
        parse_fn: Callable ``(reply, row)`` turning the raw reply into a result.
        max_chars: Maximum number of characters of the text sent to the model
            (defaults to the previously hard-coded 3000).

    Returns:
        Whatever ``parse_fn`` returns, or ``None`` when the request or the
        parsing failed or the model returned no content.
    """
    # Fall back to '' so a row whose text fields are None/missing cannot raise
    # on slicing; that error used to escape (it is outside the try below) and
    # abort the whole inference loop.
    text_content = row.get('text') or row.get('posts') or ''
    user_data_text = f"User Text:\n{str(text_content)[:max_chars]}"

    instruction_text = instruction_fn()

    # The data and the instructions are sent as two separate user messages.
    messages = [
        {"role": "user", "content": user_data_text},
        {"role": "user", "content": instruction_text}
    ]

    try:
        reply = await get_llm_response(messages)
        if reply is None:
            return None
        return parse_fn(reply, row)
    except Exception as e:
        # A single failed row must not stop the run; it is reported and skipped.
        print(f"Failed row: {e}")
        return None

async def run_inference_with_delay(df, instruction_fn, parse_fn, dataset_name):
    """Process every row of ``df`` sequentially, pausing between requests.

    Args:
        df: Frame whose rows are iterated as dicts via ``iter_rows(named=True)``.
        instruction_fn: Forwarded to ``process_row``.
        parse_fn: Forwarded to ``process_row``.
        dataset_name: Label used in the progress messages.

    Returns:
        A ``pl.DataFrame`` built from the non-``None`` row results; an empty
        ``pl.DataFrame`` when ``df`` is empty.
    """
    if df.is_empty():
        return pl.DataFrame()

    n_rows = len(df)
    print(f"\nStarting inference for {dataset_name} ({n_rows} rows)...")

    kept = []
    for done, record in enumerate(df.iter_rows(named=True), start=1):
        outcome = await process_row(record, instruction_fn, parse_fn)
        # Failed rows come back as None and are left out of the result.
        if outcome is not None:
            kept.append(outcome)

        # Progress line every 10 rows and after the final one
        if done % 10 == 0 or done == n_rows:
            print(f"[{dataset_name}] Processed {done}/{n_rows}")

        # No need to wait after the last request.
        if done < n_rows:
            await asyncio.sleep(DELAY_SECONDS)

    return pl.DataFrame(kept)

In [None]:
# ==========================================
# 4. PROMPTS AND PARSERS
# ==========================================

# --- Prompts for Big5 ---
def get_big5_instructions():
    """Return the instruction prompt requesting binary Big Five labels as JSON.

    The prompt refers to the text sent in the preceding message and asks for a
    JSON object with a "reasoning" string and 0/1 values for O, C, E, A and N.
    """
    prompt_lines = (
        "Analyze the text provided in the previous message and predict the Big Five personality traits.",
        "Traits: ",
        "1. O (Openness)",
        "2. C (Conscientiousness)",
        "3. E (Extraversion)",
        "4. A (Agreeableness)",
        "5. N (Neuroticism)",
        "",
        "Determine if each trait is High (1) or Low (0).",
        "",
        "RETURN FORMAT:",
        "You must return a valid JSON object ONLY:",
        "{",
        '  "reasoning": "Short analysis...",',
        '  "O": 1,',
        '  "C": 0,',
        '  "E": 1,',
        '  "A": 1,',
        '  "N": 0',
        "}",
    )
    # The prompt ends with a newline after the closing brace.
    return "\n".join(prompt_lines) + "\n"

def _coerce_binary_label(value):
    """Convert one model-provided trait value to 1, 0, or -1 when it is unusable."""
    if isinstance(value, bool):
        return int(value)
    if isinstance(value, (int, float)):
        # Only exact 0/1 are labels; 2, 0.7, NaN, ... are treated as invalid.
        return int(value) if value in (0, 1) else -1
    if isinstance(value, str):
        token = value.strip().lower()
        if token in ("1", "1.0", "high", "true"):
            return 1
        if token in ("0", "0.0", "low", "false"):
            return 0
    return -1


def big5_parse(reply, row):
    """Parse the model's Big Five JSON reply into a flat result record.

    Args:
        reply: Raw reply text; may be wrapped in a Markdown ``json`` code fence.
        row: Source row providing the true labels ``O``, ``C``, ``E``, ``A``,
            ``N`` and optionally ``id``.

    Returns:
        Dict with ``text_id`` plus ``<trait>_true`` and ``<trait>_pred`` for each
        trait. A prediction is 1 or 0, or -1 when the reply could not be parsed
        or the value for that trait is missing/invalid. -1 is the marker that
        ``calculate_metrics`` filters out, so unparseable replies no longer count
        as "Low" predictions.
    """
    traits = ("O", "C", "E", "A", "N")
    preds = {key: -1 for key in traits}

    data = None
    try:
        clean_reply = reply.replace("```json", "").replace("```", "").strip()
        # Greedy match from the first "{" to the last "}" to tolerate extra text
        # around the JSON object.
        json_match = re.search(r"\{.*\}", clean_reply, re.DOTALL)
        if json_match:
            data = json.loads(json_match.group(0))
    except (AttributeError, TypeError, ValueError):
        # Non-string reply or malformed JSON: keep every prediction at -1.
        data = None

    if isinstance(data, dict):
        # Each trait is coerced independently, so one bad value does not discard
        # the remaining traits.
        for key in traits:
            preds[key] = _coerce_binary_label(data.get(key))

    result = {
        "text_id": str(row.get("id", "")),  # Empty string when the row has no id
    }
    # Attach the true value next to each prediction
    for k in traits:
        result[f"{k}_true"] = int(row[k])
        result[f"{k}_pred"] = preds[k]

    return result

# --- Prompts for Personae ---
def get_personae_instructions():
    """Return the instruction prompt requesting gender and MBTI type as JSON.

    The prompt lists the allowed gender values and the four MBTI dimensions and
    asks for a JSON object with "reasoning", "gender" and "mbti" keys.
    """
    prompt_lines = (
        "Analyze the text to predict the author's Gender and MBTI type.",
        "",
        "Gender options: Male, Female",
        "MBTI options: 4-letter code (e.g., INFP, ESTJ)",
        "Evaluate MBTI type based on these 4 dimensions:",
        "1. (E) Extraversion vs (I) Introversion",
        "2. (S) Sensing vs (N) Intuition",
        "3. (T) Thinking vs (F) Feeling",
        "4. (J) Judging vs (P) Perceiving",
        "",
        "RETURN FORMAT:",
        "You must return a valid JSON object ONLY:",
        "{",
        '  "reasoning": "Short analysis...",',
        '  "gender": "Female",',
        '  "mbti": "INFP"',
        "}",
    )
    # The prompt ends with a newline after the closing brace.
    return "\n".join(prompt_lines) + "\n"

def _normalize_gender_label(value):
    """Map a free-form gender string to 'female'/'male', or None if unrecognised."""
    token = str(value).strip().lower()
    # "female" contains "mal", so the female check has to come first.
    if "fem" in token or token in ("f", "woman"):
        return "female"
    if "mal" in token or token in ("m", "man"):
        return "male"
    return None


def personae_parse(reply, row):
    """Parse the model's gender/MBTI JSON reply into a flat result record.

    Args:
        reply: Raw reply text; may be wrapped in a Markdown ``json`` code fence.
        row: Source row with ``id``, ``gender`` and ``mbti``.

    Returns:
        Dict with ``id``, ``gender_true``, ``gender_pred``, ``mbti_true`` and
        ``mbti_pred``. ``gender_pred`` is ``"unknown"`` and ``mbti_pred`` is
        ``"XXXX"`` when the reply could not be parsed or the value is invalid.
        True labels are normalised to the same vocabulary as the predictions
        ('male'/'female', upper-case MBTI); without this, labels stored as e.g.
        'M'/'F' or in lower case could never match a prediction.
    """
    pred_gender = "unknown"
    pred_mbti = "XXXX"

    data = None
    try:
        clean_reply = reply.replace("```json", "").replace("```", "").strip()
        json_match = re.search(r"\{.*\}", clean_reply, re.DOTALL)
        if json_match:
            data = json.loads(json_match.group(0))
    except (AttributeError, TypeError, ValueError):
        # Non-string reply or malformed JSON: keep the "invalid" markers.
        data = None

    if isinstance(data, dict):
        # Gender parsing
        if "gender" in data:
            pred_gender = _normalize_gender_label(data["gender"]) or "unknown"

        # MBTI parsing: accept exactly four letters, one per axis
        if "mbti" in data:
            m = str(data["mbti"]).strip().upper()
            if re.fullmatch(r"[IE][NS][TF][JP]", m):
                pred_mbti = m

    raw_gender = str(row["gender"]).lower().strip()
    return {
        "id": row["id"],
        # Unrecognised true values are kept as before (lower-cased, stripped).
        "gender_true": _normalize_gender_label(raw_gender) or raw_gender,
        "gender_pred": pred_gender,
        "mbti_true": str(row["mbti"]).strip().upper(),
        "mbti_pred": pred_mbti
    }

In [5]:
# ==========================================
# 5. METRICS CALCULATION
# ==========================================

# Accumulates one summary record per (dataset, task) for the final report.
all_metrics_data = []

def calculate_metrics(y_true, y_pred, task_name, dataset_name):
    """Print classification metrics for one task and record them.

    Pairs whose prediction is ``None``, ``"XXXX"``, ``"unknown"`` or ``-1`` are
    excluded before scoring, so the reported numbers cover parseable
    predictions only (``N`` in the header line).

    Args:
        y_true: True labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        task_name: ``"MBTI"`` selects the 4-letter-code metrics; any other name
            is scored as a plain classification task.
        dataset_name: Label used in the printed output and the record.

    Returns:
        The record appended to ``all_metrics_data`` (keys ``Dataset``, ``Task``,
        ``Accuracy``, ``F1``, ``F1_Macro``), or ``None`` when there is no valid
        prediction. Every record carries the same keys (the metric that does not
        apply is ``None``) so the summary table has a stable schema.
    """
    # Keep only pairs with a usable prediction
    invalid_markers = ("XXXX", "unknown")
    valid_data = [
        (t, p) for t, p in zip(y_true, y_pred)
        if p is not None and str(p) not in invalid_markers and p != -1
    ]

    if not valid_data:
        print(f"[{dataset_name} - {task_name}] No valid predictions to calculate metrics.")
        return None

    y_true_clean = [x[0] for x in valid_data]
    y_pred_clean = [x[1] for x in valid_data]
    n = len(y_true_clean)

    print(f"\n--- Metrics for {dataset_name}: {task_name} (N={n}) ---")

    # 1. MBTI metrics (4 axes)
    is_mbti_task = (task_name == "MBTI")

    if is_mbti_task:
        # Compare codes case-insensitively; predictions are upper-case, true
        # labels are whatever the corpus provides.
        y_true_clean = [str(t).strip().upper() for t in y_true_clean]
        y_pred_clean = [str(p).strip().upper() for p in y_pred_clean]

        acc = accuracy_score(y_true_clean, y_pred_clean)
        _, _, f1_macro, _ = precision_recall_fscore_support(y_true_clean, y_pred_clean, average='macro', zero_division=0)

        print(f"Exact Match Accuracy: {acc:.2%}")
        print(f"Macro F1-Score:       {f1_macro:.2%}")

        # Per-axis accuracy
        axes = ["(I)E", "(N)S", "(T)F", "(J)P"]
        total_letters = 0
        for i in range(4):
            # The length checks make a malformed (too short) label count as a
            # miss on that axis instead of raising IndexError.
            correct = sum(
                1 for t, p in zip(y_true_clean, y_pred_clean)
                if len(t) > i and len(p) > i and t[i] == p[i]
            )
            print(f"Axis {axes[i]}: {correct/n:.2%}")
            total_letters += correct
        print(f"Avg Letters: {total_letters/n:.2f} / 4.0")

        record = {
            "Dataset": dataset_name, "Task": task_name,
            "Accuracy": acc, "F1": None, "F1_Macro": f1_macro
        }

    # 2. Binary / simple classification (Big5 traits, Gender)
    else:
        acc = accuracy_score(y_true_clean, y_pred_clean)
        # Averaging: 'binary' for at most two numeric classes, otherwise
        # 'weighted' (e.g. string labels such as gender, or noisy label sets).
        labels = list(set(y_true_clean) | set(y_pred_clean))
        avg_method = 'binary' if len(labels) <= 2 and all(isinstance(x, (int, float, np.number)) for x in labels) else 'weighted'

        _, _, f1, _ = precision_recall_fscore_support(y_true_clean, y_pred_clean, average=avg_method, zero_division=0)

        print(f"Accuracy: {acc:.2%}")
        print(f"F1-Score: {f1:.2%} ({avg_method})")

        record = {
            "Dataset": dataset_name, "Task": task_name,
            "Accuracy": acc, "F1": f1, "F1_Macro": None
        }

    all_metrics_data.append(record)
    return record



In [6]:
# ==========================================
# 6. MAIN
# ==========================================

async def main():
    # --- A. Process Big5 ---
    if not big5_sample.is_empty():
        res_big5 = await run_inference_with_delay(
            big5_sample, 
            get_big5_instructions, 
            big5_parse, 
            "Big5_Essays"
        )
        
        if not res_big5.is_empty():
            res_big5.write_csv("results_deepseek_big5.csv")
            # Считаем метрики для каждой из 5 черт
            for trait in ["O", "C", "E", "A", "N"]:
                calculate_metrics(
                    res_big5[f"{trait}_true"].to_list(), 
                    res_big5[f"{trait}_pred"].to_list(), 
                    f"Big5_{trait}", 
                    "Big5_Essays"
                )
    
    # --- B. Process Personae ---
    if not personae_df.is_empty():
        # Берем весь датасет Personae (там мало данных)
        res_personae = await run_inference_with_delay(
            personae_df, 
            get_personae_instructions, 
            personae_parse, 
            "Personae_Corpus"
        )
        
        if not res_personae.is_empty():
            res_personae.write_csv("results_deepseek_personae.csv")
            # Метрики для Gender
            calculate_metrics(
                res_personae["gender_true"].to_list(),
                res_personae["gender_pred"].to_list(),
                "Gender",
                "Personae"
            )
            # Метрики для MBTI
            calculate_metrics(
                res_personae["mbti_true"].to_list(),
                res_personae["mbti_pred"].to_list(),
                "MBTI",
                "Personae"
            )

    # --- Summary ---
    if all_metrics_data:
        print("\n=== FINAL METRICS SUMMARY ===")
        metrics_df = pl.DataFrame(all_metrics_data)
        print(metrics_df)
        metrics_df.write_csv("all_metrics_summary_deepseek.csv")

# Запуск
await main()


Starting inference for Big5_Essays (800 rows)...
[Big5_Essays] Processed 10/800
[Big5_Essays] Processed 20/800
[Big5_Essays] Processed 30/800
[Big5_Essays] Processed 40/800
[Big5_Essays] Processed 50/800
[Big5_Essays] Processed 60/800
[Big5_Essays] Processed 70/800
[Big5_Essays] Processed 80/800
[Big5_Essays] Processed 90/800
[Big5_Essays] Processed 100/800
[Big5_Essays] Processed 110/800
[Big5_Essays] Processed 120/800
[Big5_Essays] Processed 130/800
[Big5_Essays] Processed 140/800
[Big5_Essays] Processed 150/800
[Big5_Essays] Processed 160/800
[Big5_Essays] Processed 170/800
[Big5_Essays] Processed 180/800
[Big5_Essays] Processed 190/800
[Big5_Essays] Processed 200/800
[Big5_Essays] Processed 210/800
[Big5_Essays] Processed 220/800
[Big5_Essays] Processed 230/800
[Big5_Essays] Processed 240/800
[Big5_Essays] Processed 250/800
[Big5_Essays] Processed 260/800
[Big5_Essays] Processed 270/800
[Big5_Essays] Processed 280/800
[Big5_Essays] Processed 290/800
[Big5_Essays] Processed 300/800