In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from pathlib import Path

# -----------------------
# 1) –ù–∞–ª–∞—à—Ç—É–≤–∞–Ω–Ω—è
# -----------------------
INPUT_CSV = "TrainingData.csv"
OUTPUT_CSV = "Output.csv"

# –í–ê–® —Å–ø–∏—Å–æ–∫ –∑–º—ñ–Ω–Ω–∏—Ö-–º—ñ—Ç–µ–Ω—ñ–≤ (—è–∫ —É R)
list_variables = [
    "THREAT_up",
    "THREAT_down",
    "citizen_impact",
    "PF_score",
    "PF_US",
    "PF_neg",
]

# –°–∫—ñ–ª—å–∫–∏ –ø–µ—Ä—à–∏—Ö —Ä—è–¥–∫—ñ–≤ –≤–≤–∞–∂–∞—î–º–æ train (—è–∫ —É R)
training_row = 300

# –ö—ñ–ª—å–∫—ñ—Å—Ç—å —Å–ª—ñ–≤ —É —Å–ª–æ–≤–Ω–∏–∫—É (—è–∫ words.to.keep —É R)
TOP_N = 200

# -----------------------
# 2) –ó–∞–≤–∞–Ω—Ç–∞–∂–µ–Ω–Ω—è —Ç–∞ –ø—ñ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–∏—Ö
# -----------------------
# –ß–∏—Ç–∞—î–º–æ CSV
kdata = pd.read_csv(INPUT_CSV)

# –û–±–∏—Ä–∞—î–º–æ –ø–æ—Ç—Ä—ñ–±–Ω—ñ –∫–æ–ª–æ–Ω–∫–∏ (—è–∫ —É R)
base_cols = ["ID", "month", "year", "three_sentences"]
missing = [c for c in base_cols + list_variables if c not in kdata.columns]
if missing:
    raise ValueError(f"–£ —Ñ–∞–π–ª—ñ –≤—ñ–¥—Å—É—Ç–Ω—ñ –∫–æ–ª–æ–Ω–∫–∏: {missing}")

kdata = kdata[base_cols + list_variables].copy()

# –ß–∏—Å—Ç–∫–∞ —Ç–µ–∫—Å—Ç—É (–Ω–∏–∂–Ω—ñ–π —Ä–µ–≥—ñ—Å—Ç—Ä; –ø—É–Ω–∫—Ç—É–∞—Ü—ñ—é –¥–∞–ª—ñ ¬´–∑'—ó—Å—Ç—å¬ª —Ç–æ–∫–µ–Ω—ñ–∑–∞—Ç–æ—Ä)
kdata["cleaned.body"] = (
    kdata["three_sentences"]
    .fillna("")
    .astype(str)
    .str.lower()
)

# –û–±—á–∏—Å–ª—é—î–º–æ —Ñ–∞–∫—Ç–∏—á–Ω–∏–π —Ä–æ–∑–º—ñ—Ä train/test –∑ —É—Ä–∞—Ö—É–≤–∞–Ω–Ω—è–º –¥–æ–≤–∂–∏–Ω–∏ –¥–∞—Ç–∞—Å–µ—Ç—É
n_docs = len(kdata)
if n_docs <= training_row:
    raise ValueError(
        f"–£ –¥–∞—Ç–∞—Å–µ—Ç—ñ –ª–∏—à–µ {n_docs} —Ä—è–¥–∫—ñ–≤ ‚Äî –ø–æ–≤–∏–Ω–Ω–æ –±—É—Ç–∏ > {training_row} –¥–ª—è –Ω–∞—è–≤–Ω–æ—Å—Ç—ñ —Ç–µ—Å—Ç–æ–≤–æ—ó —á–∞—Å—Ç–∏–Ω–∏."
    )

train_idx = slice(0, training_row)
test_idx = slice(training_row, n_docs)

docs = kdata["cleaned.body"].tolist()

# -----------------------
# 3) –°–ª–æ–≤–Ω–∏–∫ –∑–∞ –¥–æ–∫—É–º–µ–Ω—Ç–Ω–æ—é —á–∞—Å—Ç–æ—Ç–æ—é (–∞–Ω–∞–ª–æ–≥ docfreq + top-200)
# -----------------------
# –°–ø–æ—á–∞—Ç–∫—É –ª—ñ—á–∏–ª—å–Ω–∏–∫ –ø—Ä–∏—Å—É—Ç–Ω–æ—Å—Ç—ñ —Ç–µ—Ä–º—ñ–Ω—É –≤ –¥–æ–∫—É–º–µ–Ω—Ç—ñ (binary=True)
count_vec = CountVectorizer(stop_words="english", binary=True)
X_bin = count_vec.fit_transform(docs)
vocab = np.array(count_vec.get_feature_names_out())

# –î–æ–∫—É–º–µ–Ω—Ç–Ω–∞ —á–∞—Å—Ç–æ—Ç–∞ = —Å—É–º–∞ –ø–æ —Å—Ç–æ–≤–ø—Ü—é (–æ—Å–∫—ñ–ª—å–∫–∏ binary=True)
doc_freq = np.asarray(X_bin.sum(axis=0)).ravel()

# –¢–æ–ø-N —Å–ª—ñ–≤ –∑–∞ —Å–ø–∞–¥–∞–Ω–Ω—è–º doc_freq
order = np.argsort(-doc_freq)
top_terms = vocab[order[: min(TOP_N, len(vocab))]]
top_terms_set = set(top_terms)

# -----------------------
# 4) TF-IDF –∑–∞ —Ñ—ñ–∫—Å–æ–≤–∞–Ω–∏–º —Å–ª–æ–≤–Ω–∏–∫–æ–º top_terms (—è–∫ dfm_tfidf —É R –Ω–∞ kdtm2)
# -----------------------
tfidf_vec = TfidfVectorizer(stop_words="english", vocabulary=sorted(top_terms_set))
X_tfidf = tfidf_vec.fit_transform(docs)  # fit –Ω–∞ —Å–ª–æ–≤–Ω–∏–∫—É, –Ω–µ –Ω–∞ —Ç–µ–∫—Å—Ç–∞—Ö (OK, —Å–ª–æ–≤–Ω–∏–∫ —Ñ—ñ–∫—Å–æ–≤–∞–Ω–∏–π)

# –ú–∞—Ç—Ä–∏—Ü—ñ –¥–ª—è train/test
X_train = X_tfidf[train_idx]
X_test = X_tfidf[test_idx]

# -----------------------
# 5) –ù–∞–≤—á–∞–Ω–Ω—è SVM —ñ –ø—Ä–æ–≥–Ω–æ–∑ —ñ–º–æ–≤—ñ—Ä–Ω–æ—Å—Ç–µ–π –¥–ª—è –∫–æ–∂–Ω–æ—ó —Ü—ñ–ª—ñ
# -----------------------
# –Ü–Ω—ñ—Ü—ñ–∞–ª—ñ–∑—É—î–º–æ –∫–æ–ª–æ–Ω–∫–∏ –¥–ª—è –ø—Ä–æ–≥–Ω–æ–∑—ñ–≤ —è–∫ NaN
for target in list_variables:
    col_name = f"predicted.values_{target}"
    kdata[col_name] = np.nan

# –î–æ–ø–æ–º—ñ–∂–Ω–∞ —Ñ—É–Ω–∫—Ü—ñ—è: –ø—Ä–∏–≤–µ—Å—Ç–∏ y –¥–æ –±—ñ–Ω–∞—Ä–Ω–∏—Ö –º—ñ—Ç–æ–∫ {0,1} —è–∫ —É R
def prepare_labels(y_raw):
    """
    - –ü–æ—Ä–æ–∂–Ω—ñ/NaN -> 0
    - –Ø–∫—â–æ –∑–Ω–∞—á–µ–Ω–Ω—è –≤–∏–≥–ª—è–¥–∞—é—Ç—å —è–∫ —á–∏—Å–ª–∞ -> 0/1
    - –Ü–Ω–∞–∫—à–µ —Ä–æ–±–∏–º–æ –ª–µ–π–±–ª-–µ–Ω–∫–æ–¥–∏–Ω–≥ —ñ –ø–æ–ø–µ—Ä–µ–¥–∂–∞—î–º–æ, —è–∫—â–æ –±—ñ–ª—å—à–µ 2 –∫–ª–∞—Å—ñ–≤
    –ü–æ–≤–µ—Ä—Ç–∞—î: y (np.array), name_of_positive_class (–¥–ª—è –≤–∏–±–æ—Ä—É proba[:,1])
    """
    y = pd.Series(y_raw).copy()
    y = y.fillna(0)

    # —è–∫—â–æ –∑–Ω–∞—á–µ–Ω–Ω—è —Ä—è–¥–∫–æ–≤—ñ, —Å–ø—Ä–æ–±—É—î–º–æ –ø—Ä–∏–≤–µ—Å—Ç–∏ "0"/"1" –¥–æ int
    def to_num(v):
        try:
            # –ø—Ä–∏–±–∏—Ä–∞—î–º–æ –ø—Ä–æ–±—ñ–ª–∏
            s = str(v).strip()
            # —ñ–Ω–∫–æ–ª–∏ –º–æ–∂—É—Ç—å –±—É—Ç–∏ "0.0"/"1.0"
            return int(float(s))
        except:
            return v

    y = y.map(to_num)

    # –Ø–∫—â–æ –≤–∂–µ {0,1}
    unique_vals = sorted(pd.Series(y).dropna().unique().tolist())
    if set(unique_vals).issubset({0,1}):
        return np.array(y, dtype=int), 1  # –ø–æ–∑–∏—Ç–∏–≤–Ω–∏–π –∫–ª–∞—Å = 1

    # –Ü–Ω–∞–∫—à–µ —Ä–æ–±–∏–º–æ –ª–µ–π–±–ª-–µ–Ω–∫–æ–¥–∏–Ω–≥
    le = LabelEncoder()
    y_enc = le.fit_transform(y.astype(str))
    classes = list(le.classes_)
    if len(classes) != 2:
        print(
            f"[–£–í–ê–ì–ê] –¶—ñ–ª—å –º–∞—î {len(classes)} –∫–ª–∞—Å(–∏): {classes}. "
            f"SVM –∑ proba –∫–æ—Ä–µ–∫—Ç–Ω–æ –ø—Ä–∞—Ü—é–≤–∞—Ç–∏–º–µ, –∞–ª–µ —ñ–Ω—Ç–µ—Ä–ø—Ä–µ—Ç–∞—Ü—ñ—è '–ø–æ–∑–∏—Ç–∏–≤–Ω–æ–≥–æ' –∫–ª–∞—Å—É —É–º–æ–≤–Ω–∞."
        )
    # –ø–æ–∑–∏—Ç–∏–≤–Ω–∏–º –≤–≤–∞–∂–∞—î–º–æ –∫–ª–∞—Å –∑ —ñ–Ω–¥–µ–∫—Å–æ–º 1 (–¥—Ä—É–≥–∏–π —É –ª–µ–∫—Å–∏–∫–æ–≥—Ä–∞—Ñ—ñ—á–Ω–æ–º—É –ø–æ—Ä—è–¥–∫—É)
    positive_label_name = classes[1] if len(classes) >= 2 else classes[0]
    return y_enc, positive_label_name

# –¢—Ä–µ–Ω—É—î–º–æ –º–æ–¥–µ–ª—ñ
for target in list_variables:
    # y –¥–ª—è train
    y_raw = kdata.loc[train_idx, target]
    y_train, positive_ref = prepare_labels(y_raw)

    # –ú–æ–¥–µ–ª—å SVM (–∞–Ω–∞–ª–æ–≥ ksvm(..., kernel="rbfdot", C=50, prob.model=TRUE))
    clf = SVC(kernel="rbf", C=50, probability=True, random_state=42)

    # –ó–∞ –∞–Ω–∞–ª–æ–≥—ñ—î—é –∑ cross=10 —É ksvm: –æ–±—á–∏—Å–ª–∏–º–æ 10-fold CV-accuracy (–æ–ø—Ü—ñ–π–Ω–æ)
    try:
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
        cv_scores = cross_val_score(clf, X_train, y_train, cv=skf, scoring="accuracy")
        print(f"[{target}] CV (10-fold) accuracy: {cv_scores.mean():.3f} ¬± {cv_scores.std():.3f}")
    except Exception as e:
        print(f"[{target}] –ù–µ–º–æ–∂–ª–∏–≤–æ –ø–æ—Ä–∞—Ö—É–≤–∞—Ç–∏ CV: {e}")

    # –ù–∞–≤—á–∞–Ω–Ω—è –Ω–∞ –≤—Å—å–æ–º—É train —ñ –ø—Ä–æ–≥–Ω–æ–∑ –¥–ª—è test
    clf.fit(X_train, y_train)
    proba = clf.predict_proba(X_test)  # shape: (n_test, n_classes)

    # –í–∏–±–∏—Ä–∞—î–º–æ "–¥—Ä—É–≥—É" –∫–æ–ª–æ–Ω–∫—É —è–∫ —É R: predict(..., type="probabilities")[,2]
    # –£ sklearn –¥—Ä—É–≥–∏–π —Å—Ç–æ–≤–ø—á–∏–∫ –≤—ñ–¥–ø–æ–≤—ñ–¥–∞—î –∫–ª–∞—Å—É clf.classes_[1]
    # –Ø–∫—â–æ –º–∏ –∫–æ–¥—É–≤–∞–ª–∏ —É {0,1} ‚Äî —Ü–µ –±—É–¥–µ proba –∫–ª–∞—Å—É 1 (—Ç–æ–±—Ç–æ "–ø–æ–∑–∏—Ç–∏–≤–Ω–∏–π")
    positive_class_index = 1 if proba.shape[1] > 1 else 0
    pos_proba = proba[:, positive_class_index]

    # –ó–∞–ø–∏—Å—É—î–º–æ —É –∫–æ–ª–æ–Ω–∫—É predicted.values_<target> —Ç—ñ–ª—å–∫–∏ –¥–ª—è —Ç–µ—Å—Ç–æ–≤–æ—ó —á–∞—Å—Ç–∏–Ω–∏
    kdata.loc[test_idx, f"predicted.values_{target}"] = pos_proba

# -----------------------
# 6) –ü—Ä–∏–±–∏—Ä–∞–Ω–Ω—è —Ç–µ—Ö–Ω—ñ—á–Ω–æ—ó –∫–æ–ª–æ–Ω–∫–∏ —ñ –∑–±–µ—Ä–µ–∂–µ–Ω–Ω—è
# -----------------------
kdata = kdata.drop(columns=["cleaned.body"])
kdata.to_csv(OUTPUT_CSV, index=False)
print(f"–ì–æ—Ç–æ–≤–æ! –†–µ–∑—É–ª—å—Ç–∞—Ç –∑–±–µ—Ä–µ–∂–µ–Ω–æ —É {Path(OUTPUT_CSV).resolve()}")


FileNotFoundError: [Errno 2] No such file or directory: 'TrainingData.csv'

In [2]:
!pip install scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.2-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     ------------ ------------------------- 20.5/60.8 kB 330.3 kB/s eta 0:00:01
     ------------------------- ------------ 41.0/60.8 kB 495.5 kB/s eta 0:00:01
     -------------------------------------- 60.8/60.8 kB 543.8 kB/s eta 0:00:00
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.2-cp311-cp311-win_amd64.whl (8.9 MB)
   ---------------------------------------- 0.0/8.9 MB ? eta -:--:--
    --------------------------------------- 0.1/8.9 MB 4.3 MB/s eta 0:00:03
   - -------------------------------


[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: C:\Users\5103_6\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [4]:
import os

# 1. –°—Ç–≤–æ—Ä—é—î–º–æ –ø–∞–ø–∫—É data
os.makedirs("data", exist_ok=True)

# 2. –í–º—ñ—Å—Ç CSV
csv_content = """ID,month,year,three_sentences,THREAT_up,THREAT_down,citizen_impact,PF_score,PF_US,PF_neg
1,Jan,2025,"Threat is increasing in the north sector.",1,0,0,0,0,0
2,Jan,2025,"No threat detected in this area.",0,0,0,0,0,0
3,Jan,2025,"Citizens reported some impact on infrastructure.",0,0,1,0,0,0
4,Jan,2025,"Power failure reported in southern district.",0,0,0,0,0,0
5,Jan,2025,"No negative sentiment detected in reports.",0,0,0,0,0,1
6,Jan,2025,"US related issues are increasing according to data.",1,0,0,0,1,0
7,Jan,2025,"Threat level down in eastern front.",0,1,0,0,0,0
8,Jan,2025,"Citizen impact is minimal at this stage.",0,0,1,0,0,0
9,Jan,2025,"PF score shows moderate risk.",1,0,0,1,0,0
10,Jan,2025,"Everything seems calm, no events.",0,0,0,0,0,0
"""

# 3. –°—Ç–≤–æ—Ä—é—î–º–æ —Ñ–∞–π–ª —É –ø–∞–ø—Ü—ñ data
with open("data/TrainingData.csv", "w", encoding="utf-8") as f:
    f.write(csv_content)

print("‚úÖ –§–∞–π–ª TrainingData.csv —Å—Ç–≤–æ—Ä–µ–Ω–æ –≤ –ø–∞–ø—Ü—ñ data")


‚úÖ –§–∞–π–ª TrainingData.csv —Å—Ç–≤–æ—Ä–µ–Ω–æ –≤ –ø–∞–ø—Ü—ñ data


In [6]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from pathlib import Path

# -----------------------
# 1) –ù–∞–ª–∞—à—Ç—É–≤–∞–Ω–Ω—è
# -----------------------
INPUT_CSV = "data/TrainingData.csv"
OUTPUT_CSV = "Output.csv"

# –°–ø–∏—Å–æ–∫ —Ü—ñ–ª—å–æ–≤–∏—Ö –∑–º—ñ–Ω–Ω–∏—Ö
list_variables = ["THREAT_up", "THREAT_down", "citizen_impact", "PF_score", "PF_US", "PF_neg"]

# –°–∫—ñ–ª—å–∫–∏ –ø–µ—Ä—à–∏—Ö —Ä—è–¥–∫—ñ–≤ –±–µ—Ä–µ–º–æ –Ω–∞ —Ç—Ä–µ–Ω—É–≤–∞–Ω–Ω—è
training_row = 300
TOP_N = 200

# -----------------------
# 0) –Ø–∫—â–æ —Ñ–∞–π–ª—É –Ω–µ–º–∞—î ‚Äì —Å—Ç–≤–æ—Ä—é—î–º–æ —Ç–µ—Å—Ç–æ–≤–∏–π
# -----------------------
if not os.path.exists(INPUT_CSV):
    os.makedirs("data", exist_ok=True)
    csv_content = """ID,month,year,three_sentences,THREAT_up,THREAT_down,citizen_impact,PF_score,PF_US,PF_neg
1,Jan,2025,"Threat is increasing in the north sector.",1,0,0,0,0,0
2,Jan,2025,"No threat detected in this area.",0,0,0,0,0,0
3,Jan,2025,"Citizens reported some impact on infrastructure.",0,0,1,0,0,0
4,Jan,2025,"Power failure reported in southern district.",0,0,0,0,0,0
5,Jan,2025,"No negative sentiment detected in reports.",0,0,0,0,0,1
6,Jan,2025,"US related issues are increasing according to data.",1,0,0,0,1,0
7,Jan,2025,"Threat level down in eastern front.",0,1,0,0,0,0
8,Jan,2025,"Citizen impact is minimal at this stage.",0,0,1,0,0,0
9,Jan,2025,"PF score shows moderate risk.",1,0,0,1,0,0
10,Jan,2025,"Everything seems calm, no events.",0,0,0,0,0,0
"""
    with open(INPUT_CSV, "w", encoding="utf-8") as f:
        f.write(csv_content)
    print(f"‚ö†Ô∏è  –§–∞–π–ª {INPUT_CSV} –Ω–µ –∑–Ω–∞–π–¥–µ–Ω–æ ‚Äî —Å—Ç–≤–æ—Ä–µ–Ω–æ —Ç–µ—Å—Ç–æ–≤–∏–π –ø—Ä–∏–∫–ª–∞–¥ —ñ–∑ 10 —Ä—è–¥–∫–∞–º–∏.")

# -----------------------
# 2) –ó–∞–≤–∞–Ω—Ç–∞–∂–µ–Ω–Ω—è –¥–∞–Ω–∏—Ö
# -----------------------
kdata = pd.read_csv(INPUT_CSV)

# –ê–≤—Ç–æ–º–∞—Ç–∏—á–Ω–æ –ø—ñ–¥–ª–∞—à—Ç–æ–≤—É—î–º–æ training_row —è–∫—â–æ —Ä—è–¥–∫—ñ–≤ –º–µ–Ω—à–µ 300
if len(kdata) < training_row:
    training_row = len(kdata) // 2
    print(f"‚ö†Ô∏è  training_row –∑–º–µ–Ω—à–µ–Ω–æ –¥–æ {training_row}, –±–æ —Ä—è–¥–∫—ñ–≤ —É —Ñ–∞–π–ª—ñ –ª–∏—à–µ {len(kdata)}.")

base_cols = ["ID", "month", "year", "three_sentences"]
kdata = kdata[base_cols + list_variables].copy()

# –ß–∏—Å—Ç–∏–º–æ —Ç–µ–∫—Å—Ç
kdata["cleaned.body"] = kdata["three_sentences"].fillna("").astype(str).str.lower()

docs = kdata["cleaned.body"].tolist()
n_docs = len(docs)
train_idx = slice(0, training_row)
test_idx = slice(training_row, n_docs)

# -----------------------
# 3) –¢–æ–ø-200 —Ç–µ—Ä–º—ñ–Ω—ñ–≤ –∑–∞ document frequency
# -----------------------
count_vec = CountVectorizer(stop_words="english", binary=True)
X_bin = count_vec.fit_transform(docs)
vocab = np.array(count_vec.get_feature_names_out())
doc_freq = np.asarray(X_bin.sum(axis=0)).ravel()

order = np.argsort(-doc_freq)
top_terms = vocab[order[:min(TOP_N, len(vocab))]]
tfidf_vec = TfidfVectorizer(stop_words="english", vocabulary=sorted(set(top_terms)))
X_tfidf = tfidf_vec.fit_transform(docs)

X_train = X_tfidf[train_idx]
X_test = X_tfidf[test_idx]

# -----------------------
# 4) –ù–∞–≤—á–∞–Ω–Ω—è –º–æ–¥–µ–ª–µ–π
# -----------------------
for target in list_variables:
    y = kdata.loc[train_idx, target].fillna(0).astype(int)

    clf = SVC(kernel="rbf", C=50, probability=True, random_state=42)
    if len(set(y)) > 1:
        skf = StratifiedKFold(n_splits=min(5, len(set(y))*2), shuffle=True, random_state=42)
        scores = cross_val_score(clf, X_train, y, cv=skf, scoring="accuracy")
        print(f"[{target}] CV accuracy: {scores.mean():.3f} ¬± {scores.std():.3f}")

    clf.fit(X_train, y)
    if X_test.shape[0] > 0:
        probs = clf.predict_proba(X_test)[:, 1]  # –π–º–æ–≤—ñ—Ä–Ω—ñ—Å—Ç—å –∫–ª–∞—Å—É 1
        kdata.loc[test_idx, f"predicted.values_{target}"] = probs
    else:
        kdata[f"predicted.values_{target}"] = np.nan

kdata = kdata.drop(columns=["cleaned.body"])
kdata.to_csv(OUTPUT_CSV, index=False)
print(f"‚úÖ –ì–æ—Ç–æ–≤–æ! –†–µ–∑—É–ª—å—Ç–∞—Ç –∑–±–µ—Ä–µ–∂–µ–Ω–æ —É {Path(OUTPUT_CSV).resolve()}")


‚ö†Ô∏è  training_row –∑–º–µ–Ω—à–µ–Ω–æ –¥–æ 5, –±–æ —Ä—è–¥–∫—ñ–≤ —É —Ñ–∞–π–ª—ñ –ª–∏—à–µ 10.


ValueError: Found input variables with inconsistent numbers of samples: [5, 6]

In [2]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import SVC

# -----------------------
# 1) –ù–∞–ª–∞—à—Ç—É–≤–∞–Ω–Ω—è
# -----------------------
INPUT_CSV = "data/TrainingData.csv"
OUTPUT_CSV = "Output.csv"
LIST_VARIABLES = ["THREAT_up", "THREAT_down", "citizen_impact", "PF_score", "PF_US", "PF_neg"]
TOP_N = 200          # —Å–∫—ñ–ª—å–∫–∏ —Ç–µ—Ä–º—ñ–Ω—ñ–≤ –ª–∏—à–∏—Ç–∏ –∑–∞ docfreq
DEFAULT_TRAIN = 300  # —è–∫ —É R-—Å–∫—Ä–∏–ø—Ç—ñ

# -----------------------
# 0) –ê–≤—Ç–æ–≥–µ–Ω–µ—Ä–∞—Ü—ñ—è —Ç–µ—Å—Ç–æ–≤–∏—Ö –¥–∞–Ω–∏—Ö, —è–∫—â–æ —Ñ–∞–π–ª—É –Ω–µ–º–∞—î
# -----------------------
def ensure_sample_csv(path: str):
    if os.path.exists(path):
        return
    os.makedirs(os.path.dirname(path), exist_ok=True)
    csv_content = """ID,month,year,three_sentences,THREAT_up,THREAT_down,citizen_impact,PF_score,PF_US,PF_neg
1,Jan,2025,"Threat is increasing in the north sector.",1,0,0,0,0,0
2,Jan,2025,"No threat detected in this area.",0,0,0,0,0,0
3,Jan,2025,"Citizens reported some impact on infrastructure.",0,0,1,0,0,0
4,Jan,2025,"Power failure reported in southern district.",0,0,0,0,0,0
5,Jan,2025,"No negative sentiment detected in reports.",0,0,0,0,0,1
6,Jan,2025,"US related issues are increasing according to data.",1,0,0,0,1,0
7,Jan,2025,"Threat level down in eastern front.",0,1,0,0,0,0
8,Jan,2025,"Citizen impact is minimal at this stage.",0,0,1,0,0,0
9,Jan,2025,"PF score shows moderate risk.",1,0,0,1,0,0
10,Jan,2025,"Everything seems calm, no events.",0,0,0,0,0,0
"""
    with open(path, "w", encoding="utf-8") as f:
        f.write(csv_content)
    print(f"‚ö†Ô∏è  {path} –Ω–µ –∑–Ω–∞–π–¥–µ–Ω–æ ‚Äî —Å—Ç–≤–æ—Ä–µ–Ω–æ —Ç–µ—Å—Ç–æ–≤–∏–π —Ñ–∞–π–ª —ñ–∑ 10 —Ä—è–¥–∫–∞–º–∏.")

ensure_sample_csv(INPUT_CSV)

# -----------------------
# 2) –ó–∞–≤–∞–Ω—Ç–∞–∂–µ–Ω–Ω—è —Ç–∞ –±–∞–∑–æ–≤–∞ –ø—ñ–¥–≥–æ—Ç–æ–≤–∫–∞
# -----------------------
df = pd.read_csv(INPUT_CSV)
base_cols = ["ID", "month", "year", "three_sentences"]

missing_cols = [c for c in base_cols + LIST_VARIABLES if c not in df.columns]
if missing_cols:
    raise ValueError(f"–£ —Ñ–∞–π–ª—ñ –≤—ñ–¥—Å—É—Ç–Ω—ñ –Ω–µ–æ–±—Ö—ñ–¥–Ω—ñ –∫–æ–ª–æ–Ω–∫–∏: {missing_cols}")

df = df[base_cols + LIST_VARIABLES].copy()
df["cleaned.body"] = df["three_sentences"].fillna("").astype(str).str.lower()

n_docs = len(df)
if n_docs < 2:
    raise ValueError("–ü–æ—Ç—Ä—ñ–±–Ω–æ —â–æ–Ω–∞–π–º–µ–Ω—à–µ 2 —Ä—è–¥–∫–∏ (–æ–¥–∏–Ω –¥–ª—è train, –æ–¥–∏–Ω –¥–ª—è test).")

# training_row: —è–∫—â–æ –º–∞–ª–æ —Ä—è–¥–∫—ñ–≤ ‚Äî –±–µ—Ä–µ–º–æ –ø–æ–ª–æ–≤–∏–Ω—É (–º—ñ–Ω—ñ–º—É–º 1)
training_row = min(DEFAULT_TRAIN, max(1, n_docs // 2))
train_idx = slice(0, training_row)
test_idx = slice(training_row, n_docs)

docs = df["cleaned.body"].tolist()

# -----------------------
# 3) –°–ª–æ–≤–Ω–∏–∫ –∑–∞ document frequency ‚Üí TF-IDF
# -----------------------
count_vec = CountVectorizer(stop_words="english", binary=True)
X_bin = count_vec.fit_transform(docs)
vocab = np.array(count_vec.get_feature_names_out())

if vocab.size == 0:
    raise ValueError("–ü—ñ—Å–ª—è —Ç–æ–∫–µ–Ω—ñ–∑–∞—Ü—ñ—ó —Å–ª–æ–≤–Ω–∏–∫ –ø–æ—Ä–æ–∂–Ω—ñ–π. –ü–µ—Ä–µ–≤—ñ—Ä—Ç–µ —Ç–µ–∫—Å—Ç–∏ –∞–±–æ –≤–∏–º–∫–Ω—ñ—Ç—å —Å—Ç–æ–ø-—Å–ª–æ–≤–∞.")

doc_freq = np.asarray(X_bin.sum(axis=0)).ravel()
order = np.argsort(-doc_freq)
top_terms = vocab[order[: min(TOP_N, vocab.size)]]
vocab_fixed = sorted(set(top_terms))

tfidf_vec = TfidfVectorizer(stop_words="english", vocabulary=vocab_fixed)
X_tfidf = tfidf_vec.fit_transform(docs)

X_train = X_tfidf[train_idx]
X_test = X_tfidf[test_idx]

print(f"üìä Docs: {n_docs} | Train: {X_train.shape[0]} | Test: {X_test.shape[0]} | Vocab: {len(vocab_fixed)}")

# -----------------------
# 4) –î–æ–ø–æ–º—ñ–∂–Ω—ñ —Ñ—É–Ω–∫—Ü—ñ—ó
# -----------------------
def to_binary_labels(series: pd.Series) -> np.ndarray:
    """
    –ü–µ—Ä–µ—Ç–≤–æ—Ä—é—î —Ü—ñ–ª—å —É –±—ñ–Ω–∞—Ä–Ω—ñ –º—ñ—Ç–∫–∏ {0,1}.
    –ù–ß/–ø–æ—Ä–æ–∂–Ω—ñ ‚Üí 0. –ë—É–¥—å-—â–æ –Ω–µ 0 ‚Üí 1.
    """
    y = pd.to_numeric(series, errors="coerce").fillna(0)
    y = (y != 0).astype(int)
    return y.to_numpy()

def safe_cv(clf, X, y) -> None:
    """
    –û–±—á–∏—Å–ª—é—î k-fold CV —Ç—ñ–ª—å–∫–∏ —è–∫—â–æ –≤–∏—Å—Ç–∞—á–∞—î –¥–∞–Ω–∏—Ö —É –Ω–∞–π–º–µ–Ω—à–æ–º—É –∫–ª–∞—Å—ñ.
    """
    unique, counts = np.unique(y, return_counts=True)
    class_counts = dict(zip(unique.tolist(), counts.tolist()))
    if len(unique) < 2:
        print(f"‚Ä¢ CV –ø—Ä–æ–ø—É—â–µ–Ω–æ: –ª–∏—à–µ –æ–¥–∏–Ω –∫–ª–∞—Å —É train. –ö–ª–∞—Å–∏: {class_counts}")
        return
    min_class = counts.min()
    n_train = X.shape[0]
    # –ú–∞—î–º–æ –º–∞—Ç–∏ —â–æ–Ω–∞–π–º–µ–Ω—à–µ 2 –∑—Ä–∞–∑–∫–∏ –≤ –º—ñ–Ω—ñ–º–∞–ª—å–Ω–æ–º—É –∫–ª–∞—Å—ñ —ñ ‚â•4 –∑–∞–≥–∞–ª–æ–º
    if min_class < 2 or n_train < 4:
        print(f"‚Ä¢ CV –ø—Ä–æ–ø—É—â–µ–Ω–æ: –∑–∞–º–∞–ª–æ –¥–∞–Ω–∏—Ö (min_class={min_class}, n_train={n_train}). –ö–ª–∞—Å–∏: {class_counts}")
        return
    n_splits = min(5, int(min_class))
    if n_splits < 2:
        print(f"‚Ä¢ CV –ø—Ä–æ–ø—É—â–µ–Ω–æ: n_splits={n_splits} < 2.")
        return
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = cross_val_score(clf, X, y, cv=skf, scoring="accuracy")
    print(f"‚Ä¢ CV ({n_splits}-fold) accuracy: {scores.mean():.3f} ¬± {scores.std():.3f}")

# -----------------------
# 5) –ù–∞–≤—á–∞–Ω–Ω—è –º–æ–¥–µ–ª–µ–π —ñ –ø—Ä–æ–≥–Ω–æ–∑–∏
# -----------------------
for target in LIST_VARIABLES:
    print(f"\nüß† –ú–æ–¥–µ–ª—å –¥–ª—è —Ü—ñ–ª—ñ: {target}")
    y_train = to_binary_labels(df.loc[train_idx, target])

    # –ü–µ—Ä–µ–≤—ñ—Ä–∫–∞ —É–∑–≥–æ–¥–∂–µ–Ω–æ—Å—Ç—ñ
    if X_train.shape[0] != len(y_train):
        raise ValueError(
            f"–ù–µ—Å—É–º—ñ—Å–Ω—ñ —Ä–æ–∑–º—ñ—Ä–∏: X_train={X_train.shape[0]} vs y_train={len(y_train)}. "
            f"–ü–µ—Ä–µ–≤—ñ—Ä—Ç–µ —Ñ–æ—Ä–º—É–≤–∞–Ω–Ω—è train_idx/test_idx."
        )

    clf = SVC(kernel="rbf", C=50, probability=True, random_state=42)

    # –ë–µ–∑–ø–µ—á–Ω–∏–π CV (–∑–∞ –Ω–∞—è–≤–Ω–æ—Å—Ç—ñ –¥–æ—Å—Ç–∞—Ç–Ω—ñ—Ö –¥–∞–Ω–∏—Ö)
    safe_cv(clf, X_train, y_train)

    # –ù–∞–≤—á–∞–Ω–Ω—è –Ω–∞ –≤—Å—å–æ–º—É train
    clf.fit(X_train, y_train)

    # –ü—Ä–æ–≥–Ω–æ–∑–∏ –¥–ª—è —Ç–µ—Å—Ç—É (–π–º–æ–≤—ñ—Ä–Ω—ñ—Å—Ç—å –∫–ª–∞—Å—É 1)
    col_name = f"predicted.values_{target}"
    df[col_name] = np.nan
    if X_test.shape[0] > 0:
        proba = clf.predict_proba(X_test)
        # —ñ–Ω–¥–µ–∫—Å –∫–ª–∞—Å—É "1" (—è–∫—â–æ —á–æ–º—É—Å—å –π–æ–≥–æ –Ω–µ–º–∞—î ‚Äî –±–µ—Ä–µ–º–æ –æ—Å—Ç–∞–Ω–Ω—é –∫–æ–ª–æ–Ω–∫—É)
        classes = clf.classes_
        pos_index = int(np.where(classes == 1)[0][0]) if 1 in classes else (proba.shape[1] - 1)
        df.loc[test_idx, col_name] = proba[:, pos_index]
    else:
        print("‚Ä¢ –¢–µ—Å—Ç–æ–≤–∏—Ö –ø—Ä–∏–∫–ª–∞–¥—ñ–≤ –Ω–µ–º–∞—î ‚Äî –ø—Ä–æ–≥–Ω–æ–∑ –ø—Ä–æ–ø—É—â–µ–Ω–æ.")

# -----------------------
# 6) –ó–±–µ—Ä–µ–∂–µ–Ω–Ω—è —Ä–µ–∑—É–ª—å—Ç–∞—Ç—É
# -----------------------
out_df = df.drop(columns=["cleaned.body"])
out_path = Path(OUTPUT_CSV).resolve()
out_df.to_csv(out_path, index=False, encoding="utf-8")
print(f"\n‚úÖ –ì–æ—Ç–æ–≤–æ! –†–µ–∑—É–ª—å—Ç–∞—Ç –∑–±–µ—Ä–µ–∂–µ–Ω–æ —É {out_path}")


üìä Docs: 10 | Train: 5 | Test: 5 | Vocab: 33

üß† –ú–æ–¥–µ–ª—å –¥–ª—è —Ü—ñ–ª—ñ: THREAT_up


ValueError: –ù–µ—Å—É–º—ñ—Å–Ω—ñ —Ä–æ–∑–º—ñ—Ä–∏: X_train=5 vs y_train=6. –ü–µ—Ä–µ–≤—ñ—Ä—Ç–µ —Ñ–æ—Ä–º—É–≤–∞–Ω–Ω—è train_idx/test_idx.