# Various Fold strategies for the ride

A collection of fold-splitting strategies to use for the competition.

- Version 8: text is lowercased by default; if the original casing is needed, it can be handled in the dataset class.
- Version 9/10: added a fixed version of the CPC texts, as per https://www.kaggle.com/competitions/us-patent-phrase-to-phrase-matching/discussion/324928#1790476 . It is stored in the new column **context_text_fix** and the file **./cpc_texts_fixed.pth**.



In [None]:
!pip install -q iterative-stratification

In [None]:
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
import torch
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold, GroupKFold, KFold
OUTPUT_DIR=""

In [None]:
# Show which SciPy version is installed in this environment.
sp.__version__  

In [None]:
# ====================================================
# Data Loading
# ====================================================
# Read the three competition CSVs, then report each frame's shape and
# preview its first rows.
train = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/train.csv')
test = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")
submission = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/sample_submission.csv')

for _name, _frame in (("train", train), ("test", test), ("submission", submission)):
    print(f"{_name}.shape: {_frame.shape}")

for _frame in (train, test, submission):
    display(_frame.head())

In [None]:
# ====================================================
# CPC Data
# ====================================================
def get_cpc_texts(scheme_dir='../input/cpc-data/CPCSchemeXML202105',
                  title_dir='../input/cpc-data/CPCTitleList202202'):
    """Build the legacy mapping from CPC context code to description text.

    Context codes (one capital letter followed by digits, e.g. ``A01``) are
    collected from the file names found in ``scheme_dir``. Every code is then
    described as ``"<section title>. <code title>"``, where both titles are
    looked up in the section's title list
    ``cpc-section-<X>_20220201.txt`` inside ``title_dir``.

    NOTE: the leading "<code><tab><tab>" of a matched line is removed with
    ``str.lstrip``, which strips a *set of characters* rather than a prefix.
    Titles that begin with a character occurring in the code therefore lose
    their first letters (``A01 ... AGRICULTURE`` becomes ``GRICULTURE``).
    That behavior is kept on purpose: this function produces the legacy
    texts, while ``get_cpc_texts_fix`` produces the corrected ones.

    Args:
        scheme_dir: Directory whose file names contain the context codes.
        title_dir: Directory holding the per-section title list files.

    Returns:
        dict mapping each context code to its description string, ordered
        by section and then by code.

    Raises:
        IndexError: if a section or context code has no line in the
            corresponding title list.
    """
    code_pattern = re.compile(r'[A-Z]\d+')
    # A set comprehension de-duplicates while flattening in a single pass.
    contexts = sorted({
        code
        for file_name in os.listdir(scheme_dir)
        for code in code_pattern.findall(file_name)
    })

    results = {}
    for cpc in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'Y']:
        title_path = os.path.join(title_dir, f'cpc-section-{cpc}_20220201.txt')
        with open(title_path) as f:
            s = f.read()

        pattern = f'{cpc}\t\t.+'
        # Legacy character-set strip (see NOTE in the docstring).
        cpc_result = re.findall(pattern, s)[0].lstrip(pattern)

        for context in (c for c in contexts if c[0] == cpc):
            pattern = f'{context}\t\t.+'
            context_title = re.findall(pattern, s)[0].lstrip(pattern)
            results[context] = cpc_result + ". " + context_title
    return results


def get_cpc_texts_fix(scheme_dir='../input/cpc-data/CPCSchemeXML202105',
                      title_dir='../input/cpc-data/CPCTitleList202202'):
    """Build the corrected mapping from CPC context code to description text.

    Fix as provided by Nicholas Broad
    https://www.kaggle.com/competitions/us-patent-phrase-to-phrase-matching/discussion/324928#1790476

    Context codes (one capital letter followed by digits, e.g. ``A01``) are
    collected from the file names found in ``scheme_dir``. Every code is then
    described as ``"<section title>. <code title>"``, where both titles are
    looked up in the section's title list
    ``cpc-section-<X>_20220201.txt`` inside ``title_dir``. Only the exact
    "<code><tab><tab>" prefix is removed from a matched line, so titles are
    kept intact.

    Args:
        scheme_dir: Directory whose file names contain the context codes.
        title_dir: Directory holding the per-section title list files.

    Returns:
        dict mapping each context code to its description string, ordered
        by section and then by code.

    Raises:
        IndexError: if a section or context code has no line in the
            corresponding title list.
    """
    code_pattern = re.compile(r'[A-Z]\d+')
    # A set comprehension de-duplicates while flattening in a single pass.
    contexts = sorted({
        code
        for file_name in os.listdir(scheme_dir)
        for code in code_pattern.findall(file_name)
    })

    results = {}
    for cpc in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'Y']:
        title_path = os.path.join(title_dir, f'cpc-section-{cpc}_20220201.txt')
        with open(title_path) as f:
            s = f.read()

        # The capture group returns just the title that follows the
        # "<code><tab><tab>" prefix of the first matching line.
        cpc_result = re.findall(f'{cpc}\t\t(.+)', s)[0]

        for context in (c for c in contexts if c[0] == cpc):
            context_title = re.findall(f'{context}\t\t(.+)', s)[0]
            results[context] = cpc_result + ". " + context_title
    return results

# Build both context-code -> text mappings: the legacy one and the fixed one.
cpc_texts = get_cpc_texts()
cpc_texts_fix = get_cpc_texts_fix()

# Persist both mappings next to the other outputs.
torch.save(cpc_texts, OUTPUT_DIR+"cpc_texts.pth")
torch.save(cpc_texts_fix, OUTPUT_DIR+"cpc_texts_fixed.pth")

# Attach the CPC description (legacy and fixed) to every row of both frames.
for _frame in (train, test):
    _frame['context_text'] = _frame['context'].map(cpc_texts)
    _frame['context_text_fix'] = _frame['context'].map(cpc_texts_fix)
display(train.head(2))
display(test.head(2))

# Model input is "anchor[SEP]target[SEP]context text"; only the context text
# is lowercased here .. the same logic can be handled in the dataset instead.
for _frame in (train, test):
    _prefix = _frame['anchor'] + '[SEP]' + _frame['target'] + '[SEP]'
    _frame['text'] = _prefix + _frame['context_text'].apply(str.lower)
    _frame['text_fix'] = _prefix + _frame['context_text_fix'].apply(str.lower)
display(train.head(4))
display(test.head(4))

# Strategy 1 : MultilabelStratifiedKFold on anchor

In [None]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
def create_msrat_folds(n_fold=5, train=train, random_state=42):
    """Assign anchor-level folds with MultilabelStratifiedKFold and save them.

    Rows are aggregated per ``anchor`` into counts of each distinct ``score``
    value; those count columns are the labels used for stratification. Folds
    are assigned per anchor and merged back, so every row sharing an anchor
    gets the same fold. The result is written to
    ``train_folds_mstrat_<n_fold>.csv`` and the fold sizes are displayed.

    Args:
        n_fold: Number of folds.
        train: Frame with at least ``anchor`` and ``score`` columns. Defaults
            to the module-level ``train`` as bound when this function is
            defined. The passed frame is not modified.
        random_state: Seed for the shuffled split.

    Returns:
        A new DataFrame: ``train`` with a ``fold`` column added.
    """
    # Only anchor/score are needed for the labels. Restricting the frame
    # avoids summing unrelated (text) columns and keeps any pre-existing
    # "score_*" column, such as "score_map", out of the stratification labels.
    dfx = (
        pd.get_dummies(train[["anchor", "score"]], columns=["score"])
        .groupby("anchor", as_index=False)
        .sum()
    )
    labels = [c for c in dfx.columns if c != "anchor"]

    mskf = MultilabelStratifiedKFold(n_splits=n_fold, shuffle=True, random_state=random_state)
    dfx["fold"] = -1
    # dfx has a fresh 0..n-1 index, so the positional split indices are valid labels.
    for fold, (trn_, val_) in enumerate(mskf.split(dfx, dfx[labels])):
        print(len(trn_), len(val_))
        dfx.loc[val_, "fold"] = fold

    # Drop a stale "fold" column (e.g. from another strategy) so the merge
    # does not produce fold_x / fold_y.
    train = train.drop(columns="fold", errors="ignore")
    train = train.merge(dfx[["anchor", "fold"]], on="anchor", how="left")
    train.to_csv(f"train_folds_mstrat_{n_fold}.csv", index=False)
    display(train.groupby('fold').size())
    # Return the folded frame, matching the other fold helpers in this file.
    return train

In [None]:
# Write the 4-fold multilabel-stratified split (train_folds_mstrat_4.csv).
create_msrat_folds(4)

In [None]:
# Write the 5-fold multilabel-stratified split (train_folds_mstrat_5.csv).
create_msrat_folds(5)

# Strategy 0: Simple, based on nakama's notebook, via StratifiedKFold

In [None]:
# https://www.kaggle.com/code/abhishek/phrase-matching-folds
def create_folds_simple(num_fold=5, train=train, random_state=42):
    """Assign row-level folds stratified on the score and save them.

    The ``score`` column is mapped to the integer classes 0-4 (stored as
    ``score_map``) and rows are split with a shuffled ``StratifiedKFold`` on
    that class. The result is written to ``train_folds_<num_fold>.csv`` and
    the fold sizes are displayed.

    Args:
        num_fold: Number of folds.
        train: Frame with a ``score`` column. Defaults to the module-level
            ``train`` as bound when this function is defined. The passed
            frame is not modified.
        random_state: Seed for the shuffled split.

    Returns:
        A new DataFrame: ``train`` with ``score_map`` and ``fold`` columns.
    """
    # Work on a copy so repeated calls do not keep adding columns to the
    # shared default frame (or to a caller's frame).
    train = train.copy()
    train['score_map'] = train['score'].map({0.00: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.00: 4})
    Fold = StratifiedKFold(n_splits=num_fold, shuffle=True, random_state=random_state)

    # split() yields positional indices, so fill a positional array instead
    # of going through label-based .loc (which is wrong for a non-default index).
    fold_ids = np.full(len(train), -1, dtype=int)
    for n, (_, val_index) in enumerate(Fold.split(train, train['score_map'])):
        fold_ids[val_index] = n
    train['fold'] = fold_ids

    train.to_csv(f"train_folds_{num_fold}.csv", index=False)
    display(train.groupby('fold').size())
    return train

In [None]:
# Write the simple stratified splits: 5 folds (the default), then 4 folds.
create_folds_simple()
create_folds_simple(4)

# Strategy 2: StratifiedGroupKFold with the CPC code titles dataset

In [None]:
# https://www.kaggle.com/code/ksork6s4/uspppm-bert-for-patents-baseline-train#Preproc
# Reload the raw training data and attach the CPC title of each context code.
train_df = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/train.csv')
titles = pd.read_csv('../input/cpc-codes/titles.csv')
train_df = pd.merge(train_df, titles, left_on='context', right_on='code')
# https://www.kaggle.com/code/abhishek/phrase-matching-folds
def create_folds(num_fold=5, train_df=train_df, random_state=42):
    """Assign anchor-grouped folds with StratifiedGroupKFold and save them.

    ``anchor`` is used both as the stratification target and as the group, so
    all rows sharing an anchor fall in the same fold. A histogram of the fold
    column is plotted, a ``text`` column is built as
    ``anchor + '[SEP]' + lowercased title`` (the target is not part of it),
    and the selected columns are written to
    ``train_folds_strat_<num_fold>.csv``.

    Args:
        num_fold: Number of folds.
        train_df: Frame with ``id``, ``anchor``, ``target``, ``context``,
            ``score`` and ``title`` columns. Defaults to the module-level
            ``train_df`` as bound when this function is defined. The passed
            frame is not modified.
        random_state: Seed for the shuffled split.

    Returns:
        A new DataFrame with the columns ``id, anchor, target, context,
        score, title, fold, text``.
    """
    # Work on a copy so repeated calls do not keep adding columns to the
    # shared default frame (or to a caller's frame).
    train_df = train_df.copy()
    kf = StratifiedGroupKFold(n_splits=num_fold, shuffle=True, random_state=random_state)

    # split() yields positional indices, so fill a positional array instead
    # of going through label-based .loc (which is wrong for a non-default index).
    fold_ids = np.full(len(train_df), -1, dtype=int)
    for f, (_, v_) in enumerate(kf.split(X=train_df, y=train_df['anchor'], groups=train_df['anchor'])):
        fold_ids[v_] = f
    train_df['fold'] = fold_ids

    train_df['fold'].hist()
    train_df['text'] = train_df['anchor'] + '[SEP]' + train_df['title'].apply(str.lower)
    train_df = train_df[['id', 'anchor', 'target', 'context', 'score', 'title', 'fold', 'text']]
    train_df.to_csv(f"train_folds_strat_{num_fold}.csv", index=False)
    return train_df

In [None]:
# Write the grouped split with the default 5 folds (train_folds_strat_5.csv).
create_folds()

In [None]:
# Write the grouped split with 4 folds (train_folds_strat_4.csv).
create_folds(4)

# That's all, folks 🤗 for now