In [1]:
!pip install names ftfy

Collecting names
  Downloading names-0.3.0.tar.gz (789 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m789.1/789.1 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: names
  Building wheel for names (setup.py) ... [?25l[?25hdone
  Created wheel for names: filename=names-0.3.0-py3-none-any.whl size=803681 sha256=7292fdfc1722a2815db0ca1d01d68fb67bc40c98bc084648146456d4bc28fe74
  Stored in directory: /root/.cache/pip/wheels/c7/f0/8f/de9f15941cd988c39b82703fa04cb2d550ba5867f13c6da052
Successfully built names
Installing collected packages: names, ftfy
Successfully installed ftfy-6.3.1 names-0.3.0


In [2]:
# --- FIX 1: FORCE LEGACY KERAS (Run this first!) ---
!pip install -U tf-keras
import os
os.environ["TF_USE_LEGACY_KERAS"] = "1"  # This restores .load_model() support for folders

# --- NOW IMPORT TENSORFLOW ---
import json
import numpy as np
import tensorflow as tf
import transformers
import tqdm
import math
import names  # ensure pip install names
import ftfy
import sklearn.metrics

# ==========================================
# CONFIGURATION
# ==========================================

# Directory of the trained classifier; main() passes it to tf.keras.models.load_model().
# It must match the dataset path shown in the notebook's 'Input' section exactly.
MODEL_PATH = "/kaggle/input/roberta22/tensorflow2/default/1/retrained_new_tf/model~model=roberta-large-mnli~lr=5e-06~bs=128~dropout=0.10"

# --- FIX 2: CORRECT MODEL TYPE FOR TOKENIZER ---
# Name handed to transformers.AutoTokenizer.from_pretrained() in main().
# This is the HuggingFace model name, not the path of the saved model folder.
MODEL_TYPE = "roberta-large-mnli"

# JSONL file to score and filter; main() parses it one JSON object per line.
TO_FILTER_PATH = "/kaggle/input/abcdddd/unique_dataset (1).jsonl"

# Optional JSON file with 'val_preds' and 'val_labels', used by main() to derive
# per-recall cutoffs. Leave as None to fall back to a fixed 0.5 cutoff.
MODEL_RESULTS_PATH = None 

# Number of examples per batch produced by TextIterator during prediction.
BATCH_SIZE = 128
# Target recall levels; main() computes one cutoff (and one output file) per entry
# when validation results are available.
RECALLS = [.5, .6, .7, .8, .9]

# ==========================================
# HELPER CLASSES
# ==========================================

_RELATIONS = {
    'HinderedBy': 'can be hindered by',
    'xNeed': 'but before, PersonX needed',
    'xWant': 'as a result, PersonX wants',
    'xIntent': 'because PersonX wanted',
    'xReact': 'as a result, PersonX feels',
    'xAttr': 'so, PersonX is seen as',
    'xEffect': 'as a result, PersonX'
}

class TextIterator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that yields batches of token ids for record dicts.

    Each record is verbalized as a (head, "<relation phrase> <tail>") text pair,
    the PersonX / PersonY placeholders are replaced with random first names, and
    the pair is tokenized. Because the names are drawn on every access, reading
    the same batch twice generally produces different token ids.

    Args:
        texts: Indexable, sliceable collection of dicts. Each dict needs a
            'head' and a 'relation' key, plus the tail text under either
            'inference' or 'tail' (missing tail falls back to '').
        tokenizer: Callable tokenizer; called with a list of two-string pairs
            and ``return_tensors='np', padding=True``.
        batch_size: Number of records per batch.
        shuffle: Stored on the instance; not used by this class.
    """

    # Marker inserted between the two segments of a pair before name substitution.
    _SEP = '**SEP**'

    def __init__(self, texts, tokenizer, batch_size=32, shuffle=False):
        # No-op on the legacy Sequence base; newer Keras dataset bases expect the
        # base constructor to be called by subclasses.
        super().__init__()
        self.texts = texts
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.tokenizer = tokenizer
        # Indirection point so a different record-to-text function can be swapped in.
        self._to_string = self._to_string_main

    def __len__(self):
        """Return the number of batches (the last one may be partial)."""
        return math.ceil(len(self.texts) / self.batch_size)

    def _sep_pair_with_name(self, cur):
        """Substitute names into ``cur`` and split it into its two segments.

        Returns a list ``[first_segment, second_segment]``.
        """
        p1 = names.get_first_name()
        p2 = names.get_first_name()
        # Redraw until the two people have different names.
        while p2 == p1:
            p2 = names.get_first_name()
        cur = cur.replace('PersonX', p1)
        cur = cur.replace('PersonY', p2)
        cur = ftfy.fix_text(cur)
        # maxsplit=1: if the marker also occurs inside the tail text, the result
        # is still exactly two segments, which is what pair tokenization needs.
        return cur.split(self._SEP, 1)

    def _to_string_main(self, x):
        """Verbalize one record dict as a [head, "relation-phrase tail"] pair."""
        # Unknown relation codes are used verbatim as the connective.
        rel_text = _RELATIONS.get(x['relation'], x['relation'])
        # Check for 'inference' vs 'tail' key
        tail_text = x.get('inference', x.get('tail', ''))
        cur = '{}{}{} {}'.format(x['head'], self._SEP, rel_text, tail_text)
        return self._sep_pair_with_name(cur)

    def __getitem__(self, idx):
        """Return the padded ``input_ids`` array for batch ``idx``.

        Only the token ids are returned; no attention mask is passed on.
        """
        batch = self.texts[idx * self.batch_size:(idx + 1) * self.batch_size]
        texts = [self._to_string(b) for b in batch]
        text_X = self.tokenizer(texts, return_tensors='np', padding=True)['input_ids']
        return text_X

# ==========================================
# MAIN PREDICTION LOGIC
# ==========================================

def _load_cutoffs(results_path, recalls):
    """Return ``(cutoffs, from_validation)`` with one probability cutoff per recall.

    When ``results_path`` points to an existing JSON file holding 'val_preds' and
    'val_labels', cutoffs are read off the validation precision-recall curve and
    ``from_validation`` is True. Otherwise every cutoff is 0.5 and it is False.
    """
    if not (results_path and os.path.exists(results_path)):
        if results_path:
            print(f"Results file not found: {results_path}")
        print("Using default cutoff 0.5")
        return [0.5] * len(recalls), False

    print(f"Loading thresholds from {results_path}...")
    with open(results_path) as f:
        data = json.load(f)

    # Positional arguments on purpose: the keyword for the scores was renamed
    # (probas_pred -> y_score) across scikit-learn releases.
    _, val_rs, val_thresh = sklearn.metrics.precision_recall_curve(
        data['val_labels'], data['val_preds'])

    cutoffs = []
    for r in recalls:
        # Advance to the first curve point whose recall is no longer above r.
        idx = 0
        while idx < len(val_rs) and val_rs[idx] > r:
            idx += 1
        # The threshold array is shorter than the recall array; fall back to 0.5
        # when the index runs past it.
        cutoffs.append(val_thresh[idx] if idx < len(val_thresh) else 0.5)
    return cutoffs, True


def _read_jsonl(path):
    """Read JSON-object lines from ``path``, tagging each record with valid=-1.

    Blank lines are ignored. Lines that are not valid JSON objects are skipped
    (best effort) and their count is reported instead of being dropped silently.
    """
    records, skipped = [], 0
    with open(path, encoding='utf-8') as f:
        for line in tqdm.tqdm(f):
            if not line.strip():
                continue
            try:
                rec = json.loads(line)
            except json.JSONDecodeError:
                skipped += 1
                continue
            if not isinstance(rec, dict):
                skipped += 1
                continue
            rec['valid'] = -1
            records.append(rec)
    if skipped:
        print(f"Skipped {skipped} malformed line(s) in {path}")
    return records


def _write_jsonl(path, records):
    """Write ``records`` to ``path`` as one JSON document per line."""
    with open(path, 'w') as f:
        for rec in records:
            f.write(json.dumps(rec) + '\n')


def main():
    """Score every record in TO_FILTER_PATH with the model and write filtered files.

    Writes ``<base>_with_prob_est.jsonl`` (all records with 'p_valid_model') and
    either one ``<base>_filtered_threshold_0.5.jsonl`` file, or one
    ``<base>_filtered_recall_<r>.jsonl`` file per entry in RECALLS when cutoffs
    were derived from validation results. All output goes to /kaggle/working.
    """
    import random

    np.random.seed(1)
    # NOTE(review): the `names` package appears to draw from the stdlib `random`
    # generator, which np.random.seed does not touch; seed it too so the name
    # substitution is repeatable. Confirm against the installed package.
    random.seed(1)

    # 1. Load Thresholds
    mean_cutoffs, have_thresholds = _load_cutoffs(MODEL_RESULTS_PATH, RECALLS)

    # 2. Load Data
    print(f"Loading data from {TO_FILTER_PATH}...")
    to_filter = _read_jsonl(TO_FILTER_PATH)
    if not to_filter:
        print("No records loaded; nothing to filter.")
        return

    # 3. Load Model
    print(f"Loading tokenizer: {MODEL_TYPE}...")
    try:
        tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_TYPE)
    except Exception as e:
        print(f"Error loading tokenizer. Did you fix MODEL_TYPE? Error: {e}")
        return

    pred_iter = TextIterator(to_filter, tokenizer, batch_size=BATCH_SIZE)

    print(f"Loading model from {MODEL_PATH}...")
    # Loading a model folder relies on the legacy Keras setup done before import.
    keras_model = tf.keras.models.load_model(MODEL_PATH)

    # 4. Predict
    # Each pass re-draws the substituted names, so the passes are averaged.
    n_passes = 5
    print(f"Running prediction ({n_passes} passes)...")
    preds = []
    for idx in range(n_passes):
        print(f"Pass {idx+1}/{n_passes}...")
        p = keras_model.predict(pred_iter, verbose=1).flatten()
        # assumes the model emits one score per record — TODO confirm
        p = p[:len(to_filter)]
        preds.append(p)

    preds = np.mean(np.array(preds), axis=0)

    # 5. Save Results
    for idx, p in enumerate(preds):
        to_filter[idx]['p_valid_model'] = float(p)

    base_name = TO_FILTER_PATH.split('/')[-1].split('.')[0]

    # Save probabilistic file
    _write_jsonl(f'/kaggle/working/{base_name}_with_prob_est.jsonl', to_filter)

    if not have_thresholds:
        # No validation-derived cutoffs (path unset or file missing): write a single
        # file named for the fixed 0.5 cutoff rather than per-recall files.
        valid_idxs = np.where(preds > 0.5)[0]
        fname = f'/kaggle/working/{base_name}_filtered_threshold_0.5.jsonl'
        print(f"Saving {len(valid_idxs)} valid items to {fname}")
        _write_jsonl(fname, (to_filter[idx] for idx in valid_idxs))
    else:
        for recall, cutoff in zip(RECALLS, mean_cutoffs):
            valid_idxs = np.where(preds > cutoff)[0]
            fname = f'/kaggle/working/{base_name}_filtered_recall_{recall}.jsonl'
            _write_jsonl(fname, (to_filter[idx] for idx in valid_idxs))

# Run the pipeline when this code is executed as the top-level program.
if __name__ == '__main__':
    main()

Collecting tf-keras
  Downloading tf_keras-2.20.1-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow<2.21,>=2.20 (from tf-keras)
  Downloading tensorflow-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting tensorboard~=2.20.0 (from tensorflow<2.21,>=2.20->tf-keras)
  Downloading tensorboard-2.20.0-py3-none-any.whl.metadata (1.8 kB)
Downloading tf_keras-2.20.1-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading tensorflow-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (620.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m620.7/620.7 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading tensorboard-2.20.0-py3-none-any.whl (5.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m98.7 MB/s[0m eta [36m0:00:00[0m:00:

39144it [00:00, 320522.41it/s]

Loading tokenizer: roberta-large-mnli...





tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Loading model from /kaggle/input/roberta22/tensorflow2/default/1/retrained_new_tf/model~model=roberta-large-mnli~lr=5e-06~bs=128~dropout=0.10...


I0000 00:00:1770124641.781480      55 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15511 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Running prediction (5 passes)...
Pass 1/5...
Pass 2/5...
Pass 3/5...
Pass 4/5...
Pass 5/5...
Saving 32131 valid items to /kaggle/working/unique_dataset (1)_filtered_threshold_0.5.jsonl
