## Installing the Python LanguageTool wrapper (language_tool_python)

In [1]:
!pip install -q /kaggle/input/language-tool-python-2-7-1/language_tool_python-2.7.1-py3-none-any.whl

In [2]:
# %% Directory settings

# ====================================================
# Directory settings
# ====================================================
from pathlib import Path
import re

# Root of the input datasets, relative to the working directory.
# NOTE(review): not referenced by any of the cells shown here, which use
# literal paths instead — confirm whether it is still needed.
INPUT_DIR = Path("../input/")

In [3]:
import os
import zipfile
from zipfile import ZipFile
import shutil

# create download path
def get_language_tool_cache_path():
    """Return the LanguageTool cache directory, creating it when missing.

    The location comes from the ``LTP_PATH`` environment variable; when that
    is unset it falls back to ``~/.cache/language_tool_python``.
    """
    fallback = os.path.join(os.path.expanduser("~"), ".cache", "language_tool_python")
    cache_dir = os.environ.get('LTP_PATH', fallback)
    # exist_ok lets the call succeed when the folder is already there.
    os.makedirs(cache_dir, exist_ok=True)
    return cache_dir

# Resolve (and create) the cache folder; the bare name on the last line makes
# the notebook display the resolved path.
lt_path = get_language_tool_cache_path()
lt_path

'/root/.cache/language_tool_python'

In [4]:
# Files can't be moved directly from input to the cache, so we zip them to the output directory and unzip them again.


def get_all_file_paths(directory):
    """Collect the path of every file found under ``directory``, recursively.

    Each path is the walked folder joined with the file name, so the results
    are relative or absolute exactly as ``directory`` is.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
    ]
  
def main():
    """Zip every file under the bundled LanguageTool-5.7 folder into ``./lt.zip``."""
    # folder whose contents get archived
    source_dir = '../input/language-tool-python-2-7-1/LanguageTool-5.7/LanguageTool-5.7'

    # gather the full file list before the archive is opened
    paths_to_pack = get_all_file_paths(source_dir)

    with ZipFile('./lt.zip','w') as archive:
        for path in paths_to_pack:
            # no arcname is passed, so the entry name is derived from the path
            archive.write(path)

    print('All files zipped successfully!')
    
main()


 
zip_file = "./lt.zip"
 
try:
    with zipfile.ZipFile(zip_file) as z:
        z.extractall()
        print("Extracted all")
except:
    print("Invalid file")
    
#move to cache
!mv {'./input/language-tool-python-2-7-1/LanguageTool-5.7/LanguageTool-5.7'} {lt_path} 
print(os.listdir('/root/.cache/language_tool_python/'))

#remove files from output

shutil.rmtree('./input')
os.remove("./lt.zip")

All files zipped successfully!
Extracted all
['LanguageTool-5.7']


In [5]:
!pip install textstat -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.4/176.4 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h

In [6]:
import pandas as pd

# Samples to be scored; later cells read its 'text' and 'label' columns.
df = pd.read_csv('/kaggle/input/clean-samples-from-new-llms-300/new_data.csv')

In [7]:
import polars as pl
import re
import string
from collections import Counter
from textblob import TextBlob
import textstat
import spacy
from scipy.stats import entropy
import gzip
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from textblob import TextBlob
import numpy as np
# from nltk.corpus import stopwords

# Load data
# train_df = pl.read_csv('/kaggle/input/pan-cief/train.csv')
# test_df = pl.read_csv('/kaggle/input/pan-cief/val.csv')

# Initialize spacy and stopwords
# Both are module-level so the feature functions below can share them instead
# of rebuilding them per call.
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words('english'))

# Define all feature functions
def vocabSize(sentence):
    """Count the distinct non-punctuation token texts in the lower-cased input."""
    parsed = nlp(sentence.lower())
    distinct = {tok.text for tok in parsed if not tok.is_punct}
    return len(distinct)

def sentence_complexity(sentence):
    """Blend four textstat readability metrics into a single weighted score.

    Weights: Flesch reading ease 0.2, Flesch-Kincaid grade 0.3,
    Gunning fog 0.3, SMOG index 0.2.
    """
    ease = textstat.flesch_reading_ease(sentence)
    grade = textstat.flesch_kincaid_grade(sentence)
    fog = textstat.gunning_fog(sentence)
    smog = textstat.smog_index(sentence)
    return (ease * 0.2 + grade * 0.3 + fog * 0.3 + smog * 0.2)

def punctuation_count(paragraph):
    """Return how many characters of ``paragraph`` appear in ``string.punctuation``."""
    marks = [ch for ch in paragraph if ch in string.punctuation]
    return len(marks)

def sentence_length_difference(paragraph):
    """Word-count gap between the longest and the shortest sentence.

    Sentences are the non-blank pieces obtained by splitting on ``.``, ``!``
    and ``?``. Returns 0 when there is no such piece.
    """
    word_counts = []
    for piece in re.split(r'[.!?]', paragraph):
        piece = piece.strip()
        if piece:
            word_counts.append(len(piece.split()))
    if len(word_counts) == 0:
        return 0
    return max(word_counts) - min(word_counts)

def type_token_ratio(text):
    """Share of whitespace-separated tokens that are distinct (0 for empty text)."""
    tokens = text.split()
    if not tokens:
        return 0
    return len(set(tokens)) / len(tokens)

def pos_counts(text):
    """Map each coarse POS tag (``token.pos_``) in ``text`` to its number of tokens."""
    tally = {}
    for tok in nlp(text):
        tag = tok.pos_
        if tag in tally:
            tally[tag] += 1
        else:
            tally[tag] = 1
    return tally

def count_discourse_markers(text):
    """Total occurrences of five discourse markers in the lower-cased text.

    Matching is by substring (``str.count``), so a marker embedded in a longer
    word is counted as well.
    """
    lowered = text.lower()
    total = 0
    for marker in ("however", "therefore", "although", "nevertheless", "hence"):
        total += lowered.count(marker)
    return total

def word_entropy(text):
    """Entropy of the lemma frequency distribution over alphabetic tokens.

    Returns 0 when the text has no alphabetic token.
    """
    lemmas = [tok.lemma_.lower() for tok in nlp(text) if tok.is_alpha]
    if len(lemmas) == 0:
        return 0
    lemma_counts = Counter(lemmas)
    return entropy(list(lemma_counts.values()))

def flesch_reading_ease(text):
    """Return textstat's Flesch reading-ease score for ``text``, or 0 on failure.

    Any ordinary exception raised while scoring is mapped to 0.
    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not caught, so a
    long feature-extraction run can still be interrupted.
    """
    try:
        return textstat.flesch_reading_ease(text)
    except Exception:
        return 0

def gzip_ratio(text):
    """Gzip-compressed size of the UTF-8 text divided by its character count.

    Returns 0 for empty text.
    """
    n_chars = len(text)
    if n_chars == 0:
        return 0
    payload = text.encode('utf-8')
    return len(gzip.compress(payload)) / n_chars

def negation_frequency(text):
    """Negation tokens per alphabetic token.

    A token counts as a negation when its dependency label is ``neg`` or its
    lower-cased lemma is one of a fixed list. The denominator is padded with
    1e-5 so an input without alphabetic tokens does not divide by zero.
    """
    negation_lemmas = ["not", "no", "never", "none", "n't"]
    hits = 0
    alpha_tokens = 0
    for tok in nlp(text):
        if tok.dep_ == "neg" or tok.lemma_.lower() in negation_lemmas:
            hits += 1
        if tok.is_alpha:
            alpha_tokens += 1
    return hits / (alpha_tokens + 1e-5)

def question_statement_ratio(text):
    """Sentences ending in ``?`` divided by sentences ending in ``.``.

    Returns 0 when no sentence is found; the denominator is padded with 1e-5.
    """
    sents = list(nlp(text).sents)
    if len(sents) == 0:
        return 0
    trimmed = [s.text.strip() for s in sents]
    questions = sum(1 for t in trimmed if t.endswith("?"))
    statements = sum(1 for t in trimmed if t.endswith("."))
    return questions / (statements + 1e-5)

def clause_to_sentence_ratio(text):
    """Tokens carrying a clause-like dependency label, per sentence.

    Returns 0 when no sentence is found.
    """
    doc = nlp(text)
    n_sentences = len(list(doc.sents))
    if n_sentences == 0:
        return 0
    clause_deps = ("mark", "advcl", "ccomp", "xcomp", "acl", "relcl", "conj")
    n_clauses = len([tok for tok in doc if tok.dep_ in clause_deps])
    return n_clauses / n_sentences

def modal_verb_frequency(text):
    """Tokens whose lemma is a modal verb, per alphabetic token.

    The denominator is padded with 1e-5 to avoid dividing by zero.
    """
    doc = nlp(text)
    modal_lemmas = {"can", "could", "may", "might", "shall", "should", "will", "would", "must"}
    modal_hits = len([tok for tok in doc if tok.lemma_.lower() in modal_lemmas])
    alpha_tokens = sum(1 for tok in doc if tok.is_alpha)
    return modal_hits / (alpha_tokens + 1e-5)

def pronoun_ratio(text):
    """Tokens tagged ``PRON`` per alphabetic token (denominator padded by 1e-5)."""
    doc = nlp(text)
    n_pronouns = sum(1 for tok in doc if tok.pos_ == "PRON")
    n_alpha = sum(1 for tok in doc if tok.is_alpha)
    return n_pronouns / (n_alpha + 1e-5)

def pos_diversity(text):
    """Entropy of the POS-tag distribution over alphabetic tokens (0 if none)."""
    tag_counts = Counter(tok.pos_ for tok in nlp(text) if tok.is_alpha)
    if len(tag_counts) == 0:
        return 0
    return entropy(list(tag_counts.values()))

def hapax_legomena_ratio(text):
    """Share of distinct lemmas that occur exactly once.

    Only alphabetic tokens are considered; returns 0 when there are none.
    """
    lemma_counts = Counter(tok.lemma_.lower() for tok in nlp(text) if tok.is_alpha)
    if len(lemma_counts) == 0:
        return 0
    singletons = [lemma for lemma, seen in lemma_counts.items() if seen == 1]
    # the denominator is the number of distinct lemmas, not of tokens
    return len(singletons) / len(lemma_counts)

def get_sentiment_polarity(text):
    """Return the ``polarity`` field of TextBlob's sentiment for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

def get_sentiment_subjectivity(text):
    """Return the ``subjectivity`` field of TextBlob's sentiment for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

def count_stopwords(text):
    """Count whitespace-separated tokens whose lower-cased form is in ``stop_words``."""
    return sum(1 for token in text.split() if token.lower() in stop_words)


In [8]:
from nltk.tokenize import sent_tokenize
from collections import Counter
import spacy
import re
from textblob import Word
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import math

# Loads the same "en_core_web_sm" pipeline that an earlier cell already bound
# to `nlp`; this rebinds the name to a fresh instance.
nlp = spacy.load("en_core_web_sm")

def sentence_length_variation(text):
    """Standard deviation of per-sentence word counts.

    Sentences come from ``sent_tokenize``; those with no words are ignored.
    Returns 0.0 when fewer than two sentences remain.
    """
    word_counts = []
    for sentence in sent_tokenize(text):
        n_words = len(sentence.split())
        if n_words > 0:
            word_counts.append(n_words)

    if len(word_counts) < 2:
        return 0.0

    return np.std(word_counts)

# def vocabulary_diversity(text): ## same as type token ratio
#     words = [w.lower() for w in text.split() if w.isalpha()]
#     if len(words) == 0:
#         return 0
#     return len(set(words)) / len(words)

def repetition_rate(text):
    """Distinct word bigrams that occur more than once, per bigram position.

    Words are whitespace-separated and lower-cased. Returns 0 when the text
    has fewer than two words.
    """
    tokens = [w.lower() for w in text.split()]
    pairs = list(zip(tokens, tokens[1:]))
    if not pairs:
        return 0

    pair_counts = Counter(pairs)
    repeated_pairs = [pair for pair, seen in pair_counts.items() if seen > 1]

    return len(repeated_pairs) / len(pairs)

def personal_voice_score(text):
    """Share of whitespace-separated tokens that are first-person pronouns.

    Tokens are lower-cased but not stripped of punctuation, so "we," does not
    match. Returns 0 for empty text.
    """
    first_person = {"i", "me", "my", "mine", "we", "us", "our", "ours"}
    tokens = [w.lower() for w in text.split()]
    if not tokens:
        return 0
    hits = len([tok for tok in tokens if tok in first_person])
    return hits / len(tokens)

def emotion_variation(text):
    """Mean absolute change in TextBlob polarity between consecutive sentences.

    Returns 0 when the text has fewer than two sentences.
    """
    sents = sent_tokenize(text)
    if len(sents) < 2:
        return 0

    polarity = [TextBlob(s).sentiment.polarity for s in sents]
    jumps = [abs(current - following) for current, following in zip(polarity, polarity[1:])]

    return np.mean(jumps)


def specificity_score(text):
    """Share of tokens tagged NOUN, PROPN or NUM (0 for an empty document)."""
    doc = nlp(text)
    n_tokens = len(doc)
    if n_tokens == 0:
        return 0
    concrete_tags = {"NOUN", "PROPN", "NUM"}
    n_concrete = len([tok for tok in doc if tok.pos_ in concrete_tags])
    return n_concrete / n_tokens


def imperfection_score(text):
    """Share of words that TextBlob's ``Word.correct`` would change.

    Words are ``\\w+`` runs; the comparison ignores case. Returns 0 when the
    text has no word.
    """
    tokens = re.findall(r"\b\w+\b", text)
    if not tokens:
        return 0

    changed = 0
    for tok in tokens:
        if Word(tok).correct().lower() != tok.lower():
            changed += 1
    return changed / len(tokens)

# Phrases treated as signals of figurative language.
figurative_markers = [
    "like", "as if", "as though", "metaphor", "symbolic",
    "resembles", "reminds me of", "figurative"
]

def figurative_language_score(text):
    """Total substring occurrences of ``figurative_markers`` in the lower-cased text."""
    lowered = text.lower()
    hits = 0
    for marker in figurative_markers:
        hits += lowered.count(marker)
    return hits

def paragraph_coherence_consistency(text):
    """Mean TF-IDF cosine similarity between consecutive paragraphs.

    Paragraphs are the non-blank lines of ``text``. Returns 0 when there are
    fewer than two of them.
    """
    blocks = []
    for line in text.split("\n"):
        stripped = line.strip()
        if len(stripped) > 0:
            blocks.append(stripped)

    if len(blocks) < 2:
        return 0

    tfidf = TfidfVectorizer().fit_transform(blocks)
    neighbour_sims = [
        cosine_similarity(tfidf[i], tfidf[i+1])[0][0]
        for i in range(len(blocks)-1)
    ]

    return np.mean(neighbour_sims)


def predictability_score(text):
    """Mean surprisal (``-log p``) of the words under their in-text frequencies.

    Words are whitespace-separated and lower-cased; ``p`` is a word's count
    divided by the total number of words. Returns 0 for empty text.
    """
    tokens = [w.lower() for w in text.split()]
    n_tokens = len(tokens)
    if n_tokens == 0:
        return 0

    freq = Counter(tokens)
    surprisal = [-math.log(freq[tok]/n_tokens) for tok in tokens]

    return np.mean(surprisal)

# Words and phrases treated as hedging / uncertainty markers.
hedge_words = {
    "maybe", "perhaps", "sort of", "kind of", "i guess",
    "probably", "possibly", "apparently", "roughly"
}

def hedge_uncertainty_score(text):
    """Total substring occurrences of ``hedge_words`` in the lower-cased text."""
    lowered = text.lower()
    hits = 0
    for phrase in hedge_words:
        hits += lowered.count(phrase)
    return hits

# Transition words and phrases counted by transition_variety_score.
transitions = [
    "however", "therefore", "meanwhile", "moreover", "furthermore",
    "in contrast", "on the other hand", "overall", "in summary"
]

def transition_variety_score(text):
    """Total substring occurrences of ``transitions`` in the lower-cased text."""
    lowered = text.lower()
    hits = 0
    for phrase in transitions:
        hits += lowered.count(phrase)
    return hits

In [9]:
import language_tool_python

# LanguageTool checker for US English; grammatical_mistakes() calls tool.check.
tool = language_tool_python.LanguageTool('en-US')
# nlp = spacy.load("en_core_web_sm")

In [10]:
def grammatical_mistakes(sentence):
    """Number of matches that ``tool.check`` reports for ``sentence``."""
    return len(tool.check(sentence))

In [11]:
def pos_tag_ngrams(text, n=2):
    """Count the n-grams of POS tags over the alphabetic tokens of ``text``.

    Returns an empty dict when there are fewer than ``n`` tags, otherwise a
    ``Counter`` keyed by tag tuples.
    """
    tags = [tok.pos_ for tok in nlp(text) if tok.is_alpha]

    if len(tags) < n:
        return {}

    # n copies of the tag list, each shifted one step further; zipping them
    # yields every window of n consecutive tags
    shifted = [tags[offset:] for offset in range(n)]
    return Counter(zip(*shifted))

def pos_ngram_variety(text, n=2):
    """Number of distinct POS-tag n-grams found by ``pos_tag_ngrams``."""
    return len(pos_tag_ngrams(text, n))

In [12]:
import pandas as pd
from textblob import TextBlob

def get_features_from_text(text: str):
    """Compute every hand-crafted feature for one text.

    Parameters
    ----------
    text : str
        The document to describe.

    Returns
    -------
    pandas.Series
        One entry per feature, in the order listed below, so that applying
        this function over a column yields one feature column per key.

    Notes
    -----
    The stop-word count reuses ``count_stopwords`` (backed by the module-level
    ``stop_words`` set) instead of rebuilding the NLTK stop-word set for every
    row, and the TextBlob sentiment is analysed once and read twice.
    """
    words = text.split()
    sentiment = TextBlob(text).sentiment

    features = {
        'character_count': len(text),
        'word_count': len(words),
        'sentence_count': len(sent_tokenize(text)),
        # counts every newline-separated piece, blank ones included
        'paragraph_count': len(text.split("\n")),
        'stopword_count': count_stopwords(text),
        'unique_word_count': len(set(words)),
        'sentiment_polarity': sentiment.polarity,
        'sentiment_subjectivity': sentiment.subjectivity,
        'discourse_marker_count': count_discourse_markers(text),
        'vocab_size': vocabSize(text),
        'sentence_complexity': sentence_complexity(text),
        'punctuation_count': punctuation_count(text),
        'sentence_length_difference': sentence_length_difference(text),
        'type_token_ratio': type_token_ratio(text),
        'word_entropy': word_entropy(text),
        'flesch_reading_ease': flesch_reading_ease(text),
        'gzip_ratio': gzip_ratio(text),
        'negation_freq': negation_frequency(text),
        'question_stmt_ratio': question_statement_ratio(text),
        'clause_sentence_ratio': clause_to_sentence_ratio(text),
        'modal_freq': modal_verb_frequency(text),
        'pronoun_ratio': pronoun_ratio(text),
        'pos_diversity': pos_diversity(text),
        'hapax_ratio': hapax_legomena_ratio(text),
        'sentence_length_variation': sentence_length_variation(text),
        'repetition_rate': repetition_rate(text),
        'personal_voice_score': personal_voice_score(text),
        'emotion_variation': emotion_variation(text),
        'specificity_score': specificity_score(text),
        'figurative_language_score': figurative_language_score(text),
        'paragraph_coherence_consistency': paragraph_coherence_consistency(text),
        'predictability_score': predictability_score(text),
        'hedge_uncertainty_score': hedge_uncertainty_score(text),
        'transition_variety_score': transition_variety_score(text),
        'grammatical_mistakes': grammatical_mistakes(text),
        'pos_2gram_variety': pos_ngram_variety(text),
        'pos_3gram_variety': pos_ngram_variety(text,n=3),
        'pos_4gram_variety': pos_ngram_variety(text,n=4),
        # 'perplexity': perplexity(text),
    }

    return pd.Series(features)

# Apply the function to create new columns
new_features = df['text'].apply(get_features_from_text)
# Drop feature columns left over from an earlier run of this cell first, so
# re-running it replaces them instead of appending duplicate columns.
df = pd.concat([df.drop(columns=new_features.columns, errors='ignore'), new_features], axis=1)

In [13]:
# Columns fed to the classifiers. This is a subset of what
# get_features_from_text produces: sentence_count, sentiment_polarity,
# vocab_size, negation_freq, personal_voice_score, emotion_variation,
# predictability_score and hedge_uncertainty_score are computed but not
# listed here.
FEATURES = ['character_count', 'word_count', 'paragraph_count',
       'stopword_count', 'unique_word_count', 'sentiment_subjectivity',
       'discourse_marker_count', 'sentence_complexity', 'punctuation_count',
       'sentence_length_difference', 'type_token_ratio', 'word_entropy',
       'flesch_reading_ease', 'gzip_ratio', 'question_stmt_ratio',
       'clause_sentence_ratio', 'modal_freq', 'pronoun_ratio', 'pos_diversity',
       'hapax_ratio', 'sentence_length_variation', 'repetition_rate',
       'specificity_score', 'figurative_language_score',
       'paragraph_coherence_consistency', 'transition_variety_score',
       'grammatical_mistakes', 'pos_2gram_variety', 'pos_3gram_variety',
       'pos_4gram_variety']

# Name of the label column appended to FEATURES when selecting training data.
TARGET = 'label'

In [14]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
import numpy as np
from sklearn.base import clone
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re
import joblib
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Candidate classifiers, keyed by display name. Each gets a fixed random_state;
# SVC enables probability=True so it exposes predict_proba like the others.
# Only 'XGB' and 'Random Forest' are used by the training cells shown below.
models = {
    'XGB': XGBClassifier(eval_metric='logloss', random_state=47),
    'Random Forest': RandomForestClassifier(random_state=47),
    'Logistic Regression': LogisticRegression(random_state=47),
    'SVM': SVC(probability=True,random_state=47),    
}

def train_and_get_preds(model, train, test, target_col='label', n_splits=5, random_state=42):
    """Cross-validate ``model`` on ``train`` and average fold predictions on ``test``.

    A fresh clone of ``model`` is fitted on each shuffled K-fold split. Every
    fold model predicts class probabilities for its validation rows (stored as
    out-of-fold predictions) and for ``test`` (averaged over the folds).
    Per-fold and overall F1 scores, computed from the arg-max class, are
    printed.

    Parameters
    ----------
    model : estimator
        Must support ``sklearn.base.clone``, ``fit`` and ``predict_proba``.
    train : pandas.DataFrame
        Feature columns plus the ``target_col`` column.
    test : pandas.DataFrame
        Feature columns only. NOTE(review): assumed to have the same columns,
        in the same order, as ``train`` without the target — confirm at call
        sites.
    target_col : str, default 'label'
        Name of the label column in ``train``.
    n_splits : int, default 5
        Number of cross-validation folds.
    random_state : int, default 42
        Seed for the fold shuffling.

    Returns
    -------
    tuple of numpy.ndarray
        ``(test_preds, oof_preds)``, each with two probability columns (the
        function is written for binary classification).
    """
    # Separate features and target
    X_train = train.drop(target_col, axis=1)
    y_train = train[target_col]

    kf = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)

    # Two columns: one probability per class of the binary problem.
    test_preds = np.zeros((test.shape[0], 2))
    oof_preds = np.zeros((train.shape[0], 2))

    fold_f1_scores = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
        print(f'Fold {fold+1}')

        X_tr = X_train.iloc[train_idx]
        y_tr = y_train.iloc[train_idx]
        X_val = X_train.iloc[val_idx]
        y_val = y_train.iloc[val_idx]

        # Clone so that no fitted state leaks from one fold into the next.
        fold_model = clone(model)
        fold_model.fit(X_tr, y_tr)

        # Each fold contributes an equal share to the averaged test prediction.
        fold_test_preds = fold_model.predict_proba(test)
        test_preds += fold_test_preds / kf.n_splits

        # Every training row is predicted exactly once, by the fold that held it out.
        fold_oof_preds = fold_model.predict_proba(X_val)
        oof_preds[val_idx] = fold_oof_preds

        # Turn probabilities into class predictions before scoring.
        fold_val_preds = np.argmax(fold_oof_preds, axis=1)
        fold_f1 = f1_score(y_val, fold_val_preds)
        fold_f1_scores.append(fold_f1)

        print(f'  Fold {fold+1} OOF F1 Score: {fold_f1:.4f}')

    print('\n' + '='*50)
    print('Cross-Validation Results:')
    print('='*50)

    # F1 over all out-of-fold predictions at once.
    all_oof_preds = np.argmax(oof_preds, axis=1)
    overall_f1 = f1_score(y_train, all_oof_preds)

    for fold, score in enumerate(fold_f1_scores):
        print(f'Fold {fold+1}: F1 = {score:.4f}')

    mean_f1 = np.mean(fold_f1_scores)
    std_f1 = np.std(fold_f1_scores)
    print(f'\nMean F1: {mean_f1:.4f} (+/- {std_f1:.4f})')
    print(f'Overall OOF F1: {overall_f1:.4f}')
    print('='*50)

    return test_preds, oof_preds

In [15]:
# PAN-CLEF train/test tables. They presumably already contain the FEATURES
# columns and 'label', since later cells index them that way — verify against
# the notebook that produced these files.
pan_train = pd.read_csv('/kaggle/input/creating-additional-features-for-pan-clef/train.csv')
pan_test = pd.read_csv('/kaggle/input/creating-additional-features-for-pan-clef/test.csv')

In [16]:
# COLING train/test tables, down-sampled with a fixed seed.
# NOTE(review): the sample sizes 23707 and 3586 are not explained here —
# presumably chosen to match the PAN-CLEF split sizes; confirm.
coling_train = pd.read_csv('/kaggle/input/coling-ghostbuster/coling_train_with_features.csv').sample(n=23707,random_state=42)
coling_test = pd.read_csv('/kaggle/input/creating-with-new-features-coling/coling_test3000.csv').sample(n=3586,random_state=42)

In [40]:
# Train on the PAN-CLEF features and score both held-out sets; the OOF
# predictions (second return value) are discarded.
# NOTE(review): the PAN test set is scored with the XGB model while the COLING
# test set is scored with Random Forest — confirm the difference is intended.
pan_test_preds_training_PAN_CLEF,_ = train_and_get_preds(models['XGB'],
                                                pan_train[FEATURES+[TARGET]],
                                                pan_test[FEATURES])

coling_test_preds_training_PAN_CLEF,_ = train_and_get_preds(models['Random Forest'],
                                                pan_train[FEATURES+[TARGET]],
                                                coling_test[FEATURES])

Fold 1
  Fold 1 OOF F1 Score: 0.9782
Fold 2
  Fold 2 OOF F1 Score: 0.9762
Fold 3
  Fold 3 OOF F1 Score: 0.9789
Fold 4
  Fold 4 OOF F1 Score: 0.9740
Fold 5
  Fold 5 OOF F1 Score: 0.9781

Cross-Validation Results:
Fold 1: F1 = 0.9782
Fold 2: F1 = 0.9762
Fold 3: F1 = 0.9789
Fold 4: F1 = 0.9740
Fold 5: F1 = 0.9781

Mean F1: 0.9771 (+/- 0.0018)
Overall OOF F1: 0.9771
Fold 1
  Fold 1 OOF F1 Score: 0.9653
Fold 2
  Fold 2 OOF F1 Score: 0.9645
Fold 3
  Fold 3 OOF F1 Score: 0.9649
Fold 4
  Fold 4 OOF F1 Score: 0.9606
Fold 5
  Fold 5 OOF F1 Score: 0.9666

Cross-Validation Results:
Fold 1: F1 = 0.9653
Fold 2: F1 = 0.9645
Fold 3: F1 = 0.9649
Fold 4: F1 = 0.9606
Fold 5: F1 = 0.9666

Mean F1: 0.9644 (+/- 0.0020)
Overall OOF F1: 0.9644


In [41]:
# Train Random Forest on the COLING features and score both held-out sets.
# NOTE(review): both calls fit the same model on the same training data (the
# printed fold scores are identical) and both bind their OOF result to
# gb_essay_oof_preds_training_COLING, so the second assignment overwrites the
# first.
pan_test_preds_training_COLING, gb_essay_oof_preds_training_COLING = train_and_get_preds(models['Random Forest'],
                                                coling_train[FEATURES+[TARGET]],
                                                pan_test[FEATURES])

coling_test_preds_training_COLING, gb_essay_oof_preds_training_COLING = train_and_get_preds(models['Random Forest'],
                                                coling_train[FEATURES+[TARGET]],
                                                coling_test[FEATURES])


Fold 1
  Fold 1 OOF F1 Score: 0.8688
Fold 2
  Fold 2 OOF F1 Score: 0.8685
Fold 3
  Fold 3 OOF F1 Score: 0.8750
Fold 4
  Fold 4 OOF F1 Score: 0.8638
Fold 5
  Fold 5 OOF F1 Score: 0.8703

Cross-Validation Results:
Fold 1: F1 = 0.8688
Fold 2: F1 = 0.8685
Fold 3: F1 = 0.8750
Fold 4: F1 = 0.8638
Fold 5: F1 = 0.8703

Mean F1: 0.8693 (+/- 0.0036)
Overall OOF F1: 0.8693
Fold 1
  Fold 1 OOF F1 Score: 0.8688
Fold 2
  Fold 2 OOF F1 Score: 0.8685
Fold 3
  Fold 3 OOF F1 Score: 0.8750
Fold 4
  Fold 4 OOF F1 Score: 0.8638
Fold 5
  Fold 5 OOF F1 Score: 0.8703

Cross-Validation Results:
Fold 1: F1 = 0.8688
Fold 2: F1 = 0.8685
Fold 3: F1 = 0.8750
Fold 4: F1 = 0.8638
Fold 5: F1 = 0.8703

Mean F1: 0.8693 (+/- 0.0036)
Overall OOF F1: 0.8693


In [42]:
# Side-by-side class probabilities for the PAN test rows: the COLING-trained
# model's two columns followed by the PAN-trained model's two columns.
pan_test_probs = pd.concat([pd.DataFrame(data=pan_test_preds_training_COLING,columns=["prob0_COLING","prob1_COLING"]),
          pd.DataFrame(data=pan_test_preds_training_PAN_CLEF,columns=["prob0_PAN","prob1_PAN"])],axis=1)

In [43]:
# Same layout as pan_test_probs, but for the COLING test rows.
coling_test_probs = pd.concat([pd.DataFrame(data=coling_test_preds_training_COLING,columns=["prob0_COLING","prob1_COLING"]),
          pd.DataFrame(data=coling_test_preds_training_PAN_CLEF,columns=["prob0_PAN","prob1_PAN"])],axis=1)

## Training on PAN and predicting on new samples

In [17]:
# XGB trained on PAN-CLEF; fold-averaged probabilities for the new samples in df.
new_samples_test_preds_training_PAN,new_samples_oof_preds_training_PAN = train_and_get_preds(models['XGB'],
                                                pan_train[FEATURES+[TARGET]],
                                                df[FEATURES])

Fold 1
  Fold 1 OOF F1 Score: 0.9782
Fold 2
  Fold 2 OOF F1 Score: 0.9762
Fold 3
  Fold 3 OOF F1 Score: 0.9789
Fold 4
  Fold 4 OOF F1 Score: 0.9740
Fold 5
  Fold 5 OOF F1 Score: 0.9781

Cross-Validation Results:
Fold 1: F1 = 0.9782
Fold 2: F1 = 0.9762
Fold 3: F1 = 0.9789
Fold 4: F1 = 0.9740
Fold 5: F1 = 0.9781

Mean F1: 0.9771 (+/- 0.0018)
Overall OOF F1: 0.9771


## Training on COLING and predicting on new samples

In [18]:
# XGB trained on COLING; fold-averaged probabilities for the new samples in df.
new_samples_test_preds_training_COLING,new_samples_oof_preds_training_COLING = train_and_get_preds(models['XGB'],
                                                coling_train[FEATURES+[TARGET]],
                                                df[FEATURES])

Fold 1
  Fold 1 OOF F1 Score: 0.8677
Fold 2
  Fold 2 OOF F1 Score: 0.8710
Fold 3
  Fold 3 OOF F1 Score: 0.8765
Fold 4
  Fold 4 OOF F1 Score: 0.8624
Fold 5
  Fold 5 OOF F1 Score: 0.8737

Cross-Validation Results:
Fold 1: F1 = 0.8677
Fold 2: F1 = 0.8710
Fold 3: F1 = 0.8765
Fold 4: F1 = 0.8624
Fold 5: F1 = 0.8737

Mean F1: 0.8703 (+/- 0.0049)
Overall OOF F1: 0.8703


In [47]:
# Probabilities of both models for the new samples. The column order here is
# PAN first, then COLING; downstream cells select columns by name.
total_preds = pd.concat([pd.DataFrame(data=new_samples_test_preds_training_PAN,columns=["prob0_PAN","prob1_PAN"]),
                        pd.DataFrame(data=new_samples_test_preds_training_COLING,columns=["prob0_COLING","prob1_COLING"])],axis=1)

In [20]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

def calculate_comprehensive_metrics(predictions, true_labels):
    """
    Calculate and print per-generator metrics for the 300 new samples.

    The inputs are split positionally into three groups of 100 rows:
    'Gemini' (0-99), 'DeepSeek 3.1' (100-199) and 'GPT-5' (200-299).

    Parameters:
    predictions: sequence of 300 predicted class labels (list, array or Series)
    true_labels: sequence of 300 true class labels (list, array or Series)

    Returns:
    dict: Dictionary containing F1, precision, recall, accuracy and the
          counts of predicted / actual 1s for each model

    Raises:
    ValueError: if either input does not have exactly 300 elements
    """

    if len(predictions) != 300 or len(true_labels) != 300:
        raise ValueError("Both predictions and true_labels must have exactly 300 elements")

    # Work on plain arrays so that slicing is positional and ``== 1`` compares
    # element-wise even when a Python list is passed in (a list compared with
    # 1 is a single False, which made the counts silently 0).
    predictions = np.asarray(predictions)
    true_labels = np.asarray(true_labels)

    slices = {
        'Gemini': slice(0, 100),
        'DeepSeek 3.1': slice(100, 200),
        'GPT-5': slice(200, 300)
    }

    metrics = {}

    for model_name, slice_obj in slices.items():
        model_preds = predictions[slice_obj]
        model_true = true_labels[slice_obj]

        metrics[model_name] = {
            'f1': f1_score(model_true, model_preds, pos_label=1),
            'precision': precision_score(model_true, model_preds, pos_label=1),
            'recall': recall_score(model_true, model_preds, pos_label=1),
            'accuracy': accuracy_score(model_true, model_preds),
            'predicted_ones': int(np.sum(model_preds == 1)),
            'actual_ones': int(np.sum(model_true == 1))
        }

    # Print results in a nice format
    print("COMPREHENSIVE MODEL PERFORMANCE")
    print("=" * 80)
    for model, scores in metrics.items():
        print(f"\n{model}:")
        print(f"  F1 Score:       {scores['f1']:.4f}")
        print(f"  Precision:      {scores['precision']:.4f}")
        print(f"  Recall:         {scores['recall']:.4f}")
        print(f"  Accuracy:       {scores['accuracy']:.4f}")
        print(f"  Predicted '1's: {scores['predicted_ones']}/100")
        print(f"  Actual '1's:    {scores['actual_ones']}/100")

    return metrics

In [56]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge

def get_weighted_score(X_train,y_train,X_test,y_test):
    """Blend model probabilities with a Ridge regressor and report the scores.

    A default ``Ridge`` is fitted on ``X_train``/``y_train``; its predictions
    for ``X_test`` are thresholded at 0.5 into 0/1 labels. The per-generator
    metrics and the overall F1 score are printed; nothing is returned.
    """
    blender = Ridge()
    blender.fit(X_train,y_train)
    raw_scores = blender.predict(X_test)

    # threshold once and reuse the labels for both reports
    hard_labels = np.where(raw_scores>0.5,1,0)

    metric_f1 = f1_score(y_test,hard_labels)
    calculate_comprehensive_metrics(hard_labels,y_test)
    print(f"The f1 score is {metric_f1}.")

In [57]:
# Probability columns used as inputs to the Ridge blender, in a fixed order so
# the fit and predict frames line up.
pron_feats = ['prob0_COLING', 'prob1_COLING', 'prob0_PAN', 'prob1_PAN']

In [58]:
## weighted ensemble from PAN CLEF
# The blender is fitted on the PAN test-set probabilities and labels, then
# evaluated on the new samples.
get_weighted_score(pan_test_probs[pron_feats],pan_test['label'],total_preds[pron_feats],df['label'])

COMPREHENSIVE MODEL PERFORMANCE

Gemini:
  F1 Score:       0.9418
  Precision:      1.0000
  Recall:         0.8900
  Accuracy:       0.8900
  Predicted '1's: 89/100
  Actual '1's:    100/100

DeepSeek 3.1:
  F1 Score:       0.9189
  Precision:      1.0000
  Recall:         0.8500
  Accuracy:       0.8500
  Predicted '1's: 85/100
  Actual '1's:    100/100

GPT-5:
  F1 Score:       0.9247
  Precision:      1.0000
  Recall:         0.8600
  Accuracy:       0.8600
  Predicted '1's: 86/100
  Actual '1's:    100/100
The f1 score is 0.9285714285714286.


In [59]:
## weighted ensemble from COLING
# The blender is fitted on the COLING test-set probabilities and labels, then
# evaluated on the new samples.
get_weighted_score(coling_test_probs[pron_feats],coling_test['label'],total_preds[pron_feats],df['label'])

COMPREHENSIVE MODEL PERFORMANCE

Gemini:
  F1 Score:       0.9637
  Precision:      1.0000
  Recall:         0.9300
  Accuracy:       0.9300
  Predicted '1's: 93/100
  Actual '1's:    100/100

DeepSeek 3.1:
  F1 Score:       0.9848
  Precision:      1.0000
  Recall:         0.9700
  Accuracy:       0.9700
  Predicted '1's: 97/100
  Actual '1's:    100/100

GPT-5:
  F1 Score:       0.9744
  Precision:      1.0000
  Recall:         0.9500
  Accuracy:       0.9500
  Predicted '1's: 95/100
  Actual '1's:    100/100
The f1 score is 0.9743589743589743.


In [63]:
# Baseline without blending: threshold the PAN-trained model's class-1
# probability at 0.5.
calculate_comprehensive_metrics(np.where(new_samples_test_preds_training_PAN[:,1]>0.5,1,0),df['label'])

COMPREHENSIVE MODEL PERFORMANCE

Gemini:
  F1 Score:       0.9418
  Precision:      1.0000
  Recall:         0.8900
  Accuracy:       0.8900
  Predicted '1's: 89/100
  Actual '1's:    100/100

DeepSeek 3.1:
  F1 Score:       0.9189
  Precision:      1.0000
  Recall:         0.8500
  Accuracy:       0.8500
  Predicted '1's: 85/100
  Actual '1's:    100/100

GPT-5:
  F1 Score:       0.9247
  Precision:      1.0000
  Recall:         0.8600
  Accuracy:       0.8600
  Predicted '1's: 86/100
  Actual '1's:    100/100


{'Gemini': {'f1': 0.9417989417989417,
  'precision': 1.0,
  'recall': 0.89,
  'accuracy': 0.89,
  'predicted_ones': 89,
  'actual_ones': 100},
 'DeepSeek 3.1': {'f1': 0.9189189189189189,
  'precision': 1.0,
  'recall': 0.85,
  'accuracy': 0.85,
  'predicted_ones': 85,
  'actual_ones': 100},
 'GPT-5': {'f1': 0.924731182795699,
  'precision': 1.0,
  'recall': 0.86,
  'accuracy': 0.86,
  'predicted_ones': 86,
  'actual_ones': 100}}

In [64]:
# Baseline without blending: threshold the COLING-trained model's class-1
# probability at 0.5.
calculate_comprehensive_metrics(np.where(new_samples_test_preds_training_COLING[:,1]>0.5,1,0),df['label'])

COMPREHENSIVE MODEL PERFORMANCE

Gemini:
  F1 Score:       0.9796
  Precision:      1.0000
  Recall:         0.9600
  Accuracy:       0.9600
  Predicted '1's: 96/100
  Actual '1's:    100/100

DeepSeek 3.1:
  F1 Score:       0.9899
  Precision:      1.0000
  Recall:         0.9800
  Accuracy:       0.9800
  Predicted '1's: 98/100
  Actual '1's:    100/100

GPT-5:
  F1 Score:       0.9796
  Precision:      1.0000
  Recall:         0.9600
  Accuracy:       0.9600
  Predicted '1's: 96/100
  Actual '1's:    100/100


{'Gemini': {'f1': 0.9795918367346939,
  'precision': 1.0,
  'recall': 0.96,
  'accuracy': 0.96,
  'predicted_ones': 96,
  'actual_ones': 100},
 'DeepSeek 3.1': {'f1': 0.98989898989899,
  'precision': 1.0,
  'recall': 0.98,
  'accuracy': 0.98,
  'predicted_ones': 98,
  'actual_ones': 100},
 'GPT-5': {'f1': 0.9795918367346939,
  'precision': 1.0,
  'recall': 0.96,
  'accuracy': 0.96,
  'predicted_ones': 96,
  'actual_ones': 100}}