In [1]:
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re
import joblib
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

## Info

* When training on GB, the best model is SVM<br>
* When training on PAN CLEF, the best model is XGB<br>
* When training on COLING, the best model is Random Forest

## Ghostbuster data

In [2]:
# Read the three Ghostbuster feature CSVs (wp, reuters, essay) in a single pass.
gb_wp, gb_reuters, gb_essay = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/coling-ghostbuster/ghostbuster_wp_with_features.csv',
        '/kaggle/input/coling-ghostbuster/ghostbuster_reuter_data_with_features.csv',
        '/kaggle/input/coling-ghostbuster/ghostbuster_essay_data_with_features.csv',
    )
)

## PAN CLEF data

In [3]:
# Read the PAN CLEF train and test feature CSVs.
pan_train, pan_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/creating-additional-features-for-pan-clef/train.csv',
        '/kaggle/input/creating-additional-features-for-pan-clef/test.csv',
    )
)

## COLING 2025

In [4]:
# Read each COLING split and keep a fixed-size, seeded random subset of its rows.
# The sampled frames keep their original (now shuffled) index labels.
coling_train = (
    pd.read_csv('/kaggle/input/coling-ghostbuster/coling_train_with_features.csv')
    .sample(n=23707, random_state=42)
)
coling_test = (
    pd.read_csv('/kaggle/input/creating-with-new-features-coling/coling_test3000.csv')
    .sample(n=3586, random_state=42)
)

## Models

In [5]:
# Candidate classifiers keyed by display name; each one is seeded with 47.
models = dict([
    ('XGB', XGBClassifier(eval_metric='logloss', random_state=47)),
    ('Random Forest', RandomForestClassifier(random_state=47)),
    ('Logistic Regression', LogisticRegression(random_state=47)),
    # probability=True is what makes predict_proba available on SVC
    ('SVM', SVC(probability=True, random_state=47)),
])

In [6]:
# Feature columns selected from every training and test frame.
FEATURES = [
    'character_count',
    'word_count',
    'paragraph_count',
    'stopword_count',
    'unique_word_count',
    'sentiment_subjectivity',
    'discourse_marker_count',
    'sentence_complexity',
    'punctuation_count',
    'sentence_length_difference',
    'type_token_ratio',
    'word_entropy',
    'flesch_reading_ease',
    'gzip_ratio',
    'question_stmt_ratio',
    'clause_sentence_ratio',
    'modal_freq',
    'pronoun_ratio',
    'pos_diversity',
    'hapax_ratio',
    'sentence_length_variation',
    'repetition_rate',
    'specificity_score',
    'figurative_language_score',
    'paragraph_coherence_consistency',
    'transition_variety_score',
    'grammatical_mistakes',
    'pos_2gram_variety',
    'pos_3gram_variety',
    'pos_4gram_variety',
]

# Smaller alternative feature set, kept for reference:
# FEATURES = ['pos_diversity','sentence_length_difference','sentence_complexity','character_count',
#            'gzip_ratio']

# Name of the label column in the training frames.
TARGET = 'label'

## Function to get OOF preds

In [7]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
import numpy as np
from sklearn.base import clone

def train_and_get_preds(model, train, test, target='label', n_splits=5, random_state=42):
    """Cross-validate ``model`` on ``train`` and average its predictions on ``test``.

    A fresh clone of ``model`` is fitted on every K-fold training split. Each
    clone supplies the out-of-fold (OOF) class probabilities for its validation
    rows and contributes an equal share to the averaged class probabilities
    for ``test``. Per-fold, mean/std and overall OOF F1 scores are printed.

    Parameters
    ----------
    model : estimator
        Unfitted classifier that supports ``clone``, ``fit``, ``predict_proba``
        and exposes ``classes_`` after fitting.
    train : pandas.DataFrame
        Feature columns plus the ``target`` column.
    test : pandas.DataFrame
        Feature columns only; passed unchanged to ``predict_proba``.
    target : str, default 'label'
        Name of the label column in ``train``.
    n_splits : int, default 5
        Number of shuffled K-fold splits.
    random_state : int, default 42
        Seed for the K-fold shuffle.

    Returns
    -------
    test_preds : numpy.ndarray, shape (len(test), n_classes)
        Class probabilities for ``test`` averaged over the fold models.
    oof_preds : numpy.ndarray, shape (len(train), n_classes)
        Out-of-fold class probabilities for the rows of ``train`` (not ``test``).

    Notes
    -----
    F1 is computed with ``f1_score`` defaults, so the printed scores are only
    meaningful for a two-class target. The function assumes every fold's
    training split contains all classes, so each ``predict_proba`` output has
    ``n_classes`` columns.
    """
    # Separate features and target
    X_train = train.drop(target, axis=1)
    y_train = train[target]

    # Sorted unique labels; used to size the probability arrays and to turn
    # argmax column positions back into real labels for the overall score.
    classes = np.unique(y_train)
    n_classes = len(classes)

    test_preds = np.zeros((test.shape[0], n_classes))
    oof_preds = np.zeros((train.shape[0], n_classes))

    kf = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)

    # Store F1 scores for each fold
    fold_f1_scores = []

    # K-fold training and prediction
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
        print(f'Fold {fold+1}')

        # Split data
        X_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

        # Clone model to avoid contamination between folds
        fold_model = clone(model)
        fold_model.fit(X_tr, y_tr)

        # Each fold model contributes 1/n_splits of the final test probabilities
        test_preds += fold_model.predict_proba(test) / kf.n_splits

        # OOF probabilities for this fold's validation rows
        fold_oof_preds = fold_model.predict_proba(X_val)
        oof_preds[val_idx] = fold_oof_preds

        # predict_proba columns follow classes_, so map the argmax position to
        # the actual label instead of assuming labels are exactly 0 and 1.
        fold_classes = np.asarray(fold_model.classes_)
        fold_val_preds = fold_classes[np.argmax(fold_oof_preds, axis=1)]
        fold_f1 = f1_score(y_val, fold_val_preds)
        fold_f1_scores.append(fold_f1)

        # Print fold F1 score
        print(f'  Fold {fold+1} OOF F1 Score: {fold_f1:.4f}')

    # Calculate and print overall OOF metrics
    print('\n' + '='*50)
    print('Cross-Validation Results:')
    print('='*50)

    # Convert all OOF probabilities to class labels
    all_oof_preds = classes[np.argmax(oof_preds, axis=1)]
    overall_f1 = f1_score(y_train, all_oof_preds)

    # Print individual fold F1 scores
    for fold, score in enumerate(fold_f1_scores):
        print(f'Fold {fold+1}: F1 = {score:.4f}')

    # Print mean and std of fold F1 scores
    mean_f1 = np.mean(fold_f1_scores)
    std_f1 = np.std(fold_f1_scores)
    print(f'\nMean F1: {mean_f1:.4f} (+/- {std_f1:.4f})')
    print(f'Overall OOF F1: {overall_f1:.4f}')
    print('='*50)

    return test_preds, oof_preds

## Training on COLING

In [8]:
# Random Forest fitted on the COLING training sample, scored on the Ghostbuster WP frame.
# The second output holds out-of-fold probabilities for coling_train rows, not for gb_wp.
(
    gb_wp_test_preds_training_COLING,
    gb_wp_oof_preds_training_COLING,
) = train_and_get_preds(
    model=models['Random Forest'],
    train=coling_train[FEATURES + [TARGET]],
    test=gb_wp[FEATURES],
)

Fold 1
  Fold 1 OOF F1 Score: 0.8688
Fold 2
  Fold 2 OOF F1 Score: 0.8685
Fold 3
  Fold 3 OOF F1 Score: 0.8750
Fold 4
  Fold 4 OOF F1 Score: 0.8638
Fold 5
  Fold 5 OOF F1 Score: 0.8703

Cross-Validation Results:
Fold 1: F1 = 0.8688
Fold 2: F1 = 0.8685
Fold 3: F1 = 0.8750
Fold 4: F1 = 0.8638
Fold 5: F1 = 0.8703

Mean F1: 0.8693 (+/- 0.0036)
Overall OOF F1: 0.8693


In [9]:
# Random Forest fitted on the COLING training sample, scored on the Ghostbuster Reuters frame.
# The second output holds out-of-fold probabilities for coling_train rows, not for gb_reuters.
(
    gb_reuters_test_preds_training_COLING,
    gb_reuters_oof_preds_training_COLING,
) = train_and_get_preds(
    model=models['Random Forest'],
    train=coling_train[FEATURES + [TARGET]],
    test=gb_reuters[FEATURES],
)

Fold 1
  Fold 1 OOF F1 Score: 0.8688
Fold 2
  Fold 2 OOF F1 Score: 0.8685
Fold 3
  Fold 3 OOF F1 Score: 0.8750
Fold 4
  Fold 4 OOF F1 Score: 0.8638
Fold 5
  Fold 5 OOF F1 Score: 0.8703

Cross-Validation Results:
Fold 1: F1 = 0.8688
Fold 2: F1 = 0.8685
Fold 3: F1 = 0.8750
Fold 4: F1 = 0.8638
Fold 5: F1 = 0.8703

Mean F1: 0.8693 (+/- 0.0036)
Overall OOF F1: 0.8693


In [10]:
# Random Forest fitted on the COLING training sample, scored on the Ghostbuster essay frame.
# The second output holds out-of-fold probabilities for coling_train rows, not for gb_essay.
(
    gb_essay_test_preds_training_COLING,
    gb_essay_oof_preds_training_COLING,
) = train_and_get_preds(
    model=models['Random Forest'],
    train=coling_train[FEATURES + [TARGET]],
    test=gb_essay[FEATURES],
)

Fold 1
  Fold 1 OOF F1 Score: 0.8688
Fold 2
  Fold 2 OOF F1 Score: 0.8685
Fold 3
  Fold 3 OOF F1 Score: 0.8750
Fold 4
  Fold 4 OOF F1 Score: 0.8638
Fold 5
  Fold 5 OOF F1 Score: 0.8703

Cross-Validation Results:
Fold 1: F1 = 0.8688
Fold 2: F1 = 0.8685
Fold 3: F1 = 0.8750
Fold 4: F1 = 0.8638
Fold 5: F1 = 0.8703

Mean F1: 0.8693 (+/- 0.0036)
Overall OOF F1: 0.8693


## Training On PAN CLEF

In [11]:
# XGB fitted on the PAN CLEF training frame, scored on the Ghostbuster WP frame.
# The second output holds out-of-fold probabilities for pan_train rows, not for gb_wp.
(
    gb_wp_test_preds_training_PAN_CLEF,
    gb_wp_oof_preds_training_PAN_CLEF,
) = train_and_get_preds(
    model=models['XGB'],
    train=pan_train[FEATURES + [TARGET]],
    test=gb_wp[FEATURES],
)

Fold 1
  Fold 1 OOF F1 Score: 0.9782
Fold 2
  Fold 2 OOF F1 Score: 0.9762
Fold 3
  Fold 3 OOF F1 Score: 0.9789
Fold 4
  Fold 4 OOF F1 Score: 0.9740
Fold 5
  Fold 5 OOF F1 Score: 0.9781

Cross-Validation Results:
Fold 1: F1 = 0.9782
Fold 2: F1 = 0.9762
Fold 3: F1 = 0.9789
Fold 4: F1 = 0.9740
Fold 5: F1 = 0.9781

Mean F1: 0.9771 (+/- 0.0018)
Overall OOF F1: 0.9771


In [12]:
# XGB fitted on the PAN CLEF training frame, scored on the Ghostbuster Reuters frame.
# The second output holds out-of-fold probabilities for pan_train rows, not for gb_reuters.
(
    gb_reuters_test_preds_training_PAN_CLEF,
    gb_reuters_oof_preds_training_PAN_CLEF,
) = train_and_get_preds(
    model=models['XGB'],
    train=pan_train[FEATURES + [TARGET]],
    test=gb_reuters[FEATURES],
)

Fold 1
  Fold 1 OOF F1 Score: 0.9782
Fold 2
  Fold 2 OOF F1 Score: 0.9762
Fold 3
  Fold 3 OOF F1 Score: 0.9789
Fold 4
  Fold 4 OOF F1 Score: 0.9740
Fold 5
  Fold 5 OOF F1 Score: 0.9781

Cross-Validation Results:
Fold 1: F1 = 0.9782
Fold 2: F1 = 0.9762
Fold 3: F1 = 0.9789
Fold 4: F1 = 0.9740
Fold 5: F1 = 0.9781

Mean F1: 0.9771 (+/- 0.0018)
Overall OOF F1: 0.9771


In [13]:
# XGB fitted on the PAN CLEF training frame, scored on the Ghostbuster essay frame.
# The second output holds out-of-fold probabilities for pan_train rows, not for gb_essay.
(
    gb_essay_test_preds_training_PAN_CLEF,
    gb_essay_oof_preds_training_PAN_CLEF,
) = train_and_get_preds(
    model=models['XGB'],
    train=pan_train[FEATURES + [TARGET]],
    test=gb_essay[FEATURES],
)

Fold 1
  Fold 1 OOF F1 Score: 0.9782
Fold 2
  Fold 2 OOF F1 Score: 0.9762
Fold 3
  Fold 3 OOF F1 Score: 0.9789
Fold 4
  Fold 4 OOF F1 Score: 0.9740
Fold 5
  Fold 5 OOF F1 Score: 0.9781

Cross-Validation Results:
Fold 1: F1 = 0.9782
Fold 2: F1 = 0.9762
Fold 3: F1 = 0.9789
Fold 4: F1 = 0.9740
Fold 5: F1 = 0.9781

Mean F1: 0.9771 (+/- 0.0018)
Overall OOF F1: 0.9771


## pan train

In [14]:
# XGB fitted on the PAN CLEF training frame, scored on the PAN CLEF test frame.
# The second output holds out-of-fold probabilities for pan_train rows.
(
    pan_test_preds_training_PAN_CLEF,
    pan_oof_preds_training_PAN_CLEF,
) = train_and_get_preds(
    model=models['XGB'],
    train=pan_train[FEATURES + [TARGET]],
    test=pan_test[FEATURES],
)

Fold 1
  Fold 1 OOF F1 Score: 0.9782
Fold 2
  Fold 2 OOF F1 Score: 0.9762
Fold 3
  Fold 3 OOF F1 Score: 0.9789
Fold 4
  Fold 4 OOF F1 Score: 0.9740
Fold 5
  Fold 5 OOF F1 Score: 0.9781

Cross-Validation Results:
Fold 1: F1 = 0.9782
Fold 2: F1 = 0.9762
Fold 3: F1 = 0.9789
Fold 4: F1 = 0.9740
Fold 5: F1 = 0.9781

Mean F1: 0.9771 (+/- 0.0018)
Overall OOF F1: 0.9771


In [15]:
# XGB fitted on the PAN CLEF training frame, scored on the COLING test sample.
# NOTE: this re-binds pan_oof_preds_training_PAN_CLEF, which an earlier cell
# already assigns from the same model and the same training frame.
(
    coling_test_preds_training_PAN_CLEF,
    pan_oof_preds_training_PAN_CLEF,
) = train_and_get_preds(
    model=models['XGB'],
    train=pan_train[FEATURES + [TARGET]],
    test=coling_test[FEATURES],
)

Fold 1
  Fold 1 OOF F1 Score: 0.9782
Fold 2
  Fold 2 OOF F1 Score: 0.9762
Fold 3
  Fold 3 OOF F1 Score: 0.9789
Fold 4
  Fold 4 OOF F1 Score: 0.9740
Fold 5
  Fold 5 OOF F1 Score: 0.9781

Cross-Validation Results:
Fold 1: F1 = 0.9782
Fold 2: F1 = 0.9762
Fold 3: F1 = 0.9789
Fold 4: F1 = 0.9740
Fold 5: F1 = 0.9781

Mean F1: 0.9771 (+/- 0.0018)
Overall OOF F1: 0.9771


## coling train

In [16]:
# Random Forest fitted on the COLING training sample, scored on the COLING test sample.
# The second output holds out-of-fold probabilities for coling_train rows.
(
    coling_test_preds_training_coling,
    coling_oof_preds_training_coling,
) = train_and_get_preds(
    model=models['Random Forest'],
    train=coling_train[FEATURES + [TARGET]],
    test=coling_test[FEATURES],
)

Fold 1
  Fold 1 OOF F1 Score: 0.8688
Fold 2
  Fold 2 OOF F1 Score: 0.8685
Fold 3
  Fold 3 OOF F1 Score: 0.8750
Fold 4
  Fold 4 OOF F1 Score: 0.8638
Fold 5
  Fold 5 OOF F1 Score: 0.8703

Cross-Validation Results:
Fold 1: F1 = 0.8688
Fold 2: F1 = 0.8685
Fold 3: F1 = 0.8750
Fold 4: F1 = 0.8638
Fold 5: F1 = 0.8703

Mean F1: 0.8693 (+/- 0.0036)
Overall OOF F1: 0.8693


In [17]:
# Random Forest fitted on the COLING training sample, scored on the PAN CLEF test frame.
# Despite its name, the second output holds out-of-fold probabilities for
# coling_train rows (the training frame), not for any PAN data.
(
    pan_test_preds_training_coling,
    pan_oof_preds_training_coling,
) = train_and_get_preds(
    model=models['Random Forest'],
    train=coling_train[FEATURES + [TARGET]],
    test=pan_test[FEATURES],
)

Fold 1
  Fold 1 OOF F1 Score: 0.8688
Fold 2
  Fold 2 OOF F1 Score: 0.8685
Fold 3
  Fold 3 OOF F1 Score: 0.8750
Fold 4
  Fold 4 OOF F1 Score: 0.8638
Fold 5
  Fold 5 OOF F1 Score: 0.8703

Cross-Validation Results:
Fold 1: F1 = 0.8688
Fold 2: F1 = 0.8685
Fold 3: F1 = 0.8750
Fold 4: F1 = 0.8638
Fold 5: F1 = 0.8703

Mean F1: 0.8693 (+/- 0.0036)
Overall OOF F1: 0.8693


In [18]:
# Place both models' class probabilities for the PAN test set side by side.
# Columns are named prob{class}_{test set}_{training set}; the earlier "wp"
# tag was a copy-paste leftover, since these rows are PAN test predictions.
pan_test_probs = pd.concat(
    [
        pd.DataFrame(data=pan_test_preds_training_coling,
                     columns=["prob0_pan_coling", "prob1_pan_coling"]),
        pd.DataFrame(data=pan_test_preds_training_PAN_CLEF,
                     columns=["prob0_pan_PAN", "prob1_pan_PAN"]),
    ],
    axis=1,
)

In [19]:
# Place both models' class probabilities for the COLING test sample side by side.
# Columns are named prob{class}_{test set}_{training set}; the earlier "wp"
# tag was a copy-paste leftover, since these rows are COLING test predictions.
coling_test_probs = pd.concat(
    [
        pd.DataFrame(data=coling_test_preds_training_coling,
                     columns=["prob0_coling_coling", "prob1_coling_coling"]),
        pd.DataFrame(data=coling_test_preds_training_PAN_CLEF,
                     columns=["prob0_coling_PAN", "prob1_coling_PAN"]),
    ],
    axis=1,
)

## Weighted Average using Ridge

In [20]:
# get_weighted_score(gb_wp_test_probs,gb_wp['label'])

In [21]:
# get_weighted_score(gb_reuters_test_probs,gb_reuters['label'])

In [22]:
# get_weighted_score(gb_essay_test_probs,gb_essay['label'])

In [23]:
# Ghostbuster WP: class probabilities from the COLING-trained and PAN-trained
# models, joined column-wise.
gb_wp_test_probs = pd.concat(
    [
        pd.DataFrame(gb_wp_test_preds_training_COLING,
                     columns=["prob0_wp_coling", "prob1_wp_coling"]),
        pd.DataFrame(gb_wp_test_preds_training_PAN_CLEF,
                     columns=["prob0_wp_PAN", "prob1_wp_PAN"]),
    ],
    axis="columns",
)

# NOTE(review): the two OOF arrays describe rows of different training frames
# (coling_train vs pan_train). If their row counts differ, this column-wise
# concat pads the shorter one with NaN - confirm this frame is really wanted.
gb_wp_oof_probs = pd.concat(
    [
        pd.DataFrame(gb_wp_oof_preds_training_COLING,
                     columns=["prob0_wp_coling", "prob1_wp_coling"]),
        pd.DataFrame(gb_wp_oof_preds_training_PAN_CLEF,
                     columns=["prob0_wp_PAN", "prob1_wp_PAN"]),
    ],
    axis="columns",
)


In [24]:
# Ghostbuster Reuters: class probabilities from the COLING-trained and
# PAN-trained models, joined column-wise.
gb_reuters_test_probs = pd.concat(
    [
        pd.DataFrame(gb_reuters_test_preds_training_COLING,
                     columns=["prob0_reuters_coling", "prob1_reuters_coling"]),
        pd.DataFrame(gb_reuters_test_preds_training_PAN_CLEF,
                     columns=["prob0_reuters_PAN", "prob1_reuters_PAN"]),
    ],
    axis="columns",
)

# NOTE(review): the two OOF arrays describe rows of different training frames
# (coling_train vs pan_train). If their row counts differ, this column-wise
# concat pads the shorter one with NaN - confirm this frame is really wanted.
gb_reuters_oof_probs = pd.concat(
    [
        pd.DataFrame(gb_reuters_oof_preds_training_COLING,
                     columns=["prob0_reuters_coling", "prob1_reuters_coling"]),
        pd.DataFrame(gb_reuters_oof_preds_training_PAN_CLEF,
                     columns=["prob0_reuters_PAN", "prob1_reuters_PAN"]),
    ],
    axis="columns",
)


In [25]:
# Ghostbuster essay: class probabilities from the COLING-trained and
# PAN-trained models, joined column-wise.
gb_essay_test_probs = pd.concat(
    [
        pd.DataFrame(gb_essay_test_preds_training_COLING,
                     columns=["prob0_essay_coling", "prob1_essay_coling"]),
        pd.DataFrame(gb_essay_test_preds_training_PAN_CLEF,
                     columns=["prob0_essay_PAN", "prob1_essay_PAN"]),
    ],
    axis="columns",
)

# NOTE(review): the two OOF arrays describe rows of different training frames
# (coling_train vs pan_train). If their row counts differ, this column-wise
# concat pads the shorter one with NaN - confirm this frame is really wanted.
gb_essay_oof_probs = pd.concat(
    [
        pd.DataFrame(gb_essay_oof_preds_training_COLING,
                     columns=["prob0_essay_coling", "prob1_essay_coling"]),
        pd.DataFrame(gb_essay_oof_preds_training_PAN_CLEF,
                     columns=["prob0_essay_PAN", "prob1_essay_PAN"]),
    ],
    axis="columns",
)


In [26]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge

def get_weighted_score(data, target, test_size=0.8, random_state=42, threshold=0.5):
    """Fit a Ridge blender on part of ``data`` and report its F1 on the rest.

    ``data`` and ``target`` are split once with ``train_test_split``. A default
    ``Ridge`` regressor is fitted on the training part, its predictions on the
    held-out part are thresholded into 0/1 labels, and the resulting F1 score
    is printed and returned.

    Parameters
    ----------
    data : pandas.DataFrame or array-like
        Blending features, one row per sample (here: stacked class probabilities).
    target : pandas.Series or array-like
        Labels aligned row by row with ``data``. Predictions are mapped to 0/1,
        so the labels are expected to be 0/1 as well.
    test_size : float or int, default 0.8
        Passed to ``train_test_split``. With the default, only 20% of the rows
        fit the blender and the remaining 80% are used for scoring.
    random_state : int, default 42
        Seed for the split.
    threshold : float, default 0.5
        Ridge outputs strictly above this value are labelled 1, the rest 0.

    Returns
    -------
    float
        F1 score on the held-out part.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, test_size=test_size, random_state=random_state
    )

    ridge = Ridge()
    ridge.fit(X_train, y_train)

    # Ridge is a regressor: these are unbounded scores, not calibrated
    # probabilities, hence the explicit threshold below.
    blended_scores = ridge.predict(X_test)
    predicted_labels = np.where(blended_scores > threshold, 1, 0)

    metric_f1 = f1_score(y_test, predicted_labels)
    print(f"The f1 score is {metric_f1}.")
    return metric_f1

In [27]:
# Ridge-blended F1 for each Ghostbuster domain: wp, reuters, essay.
get_weighted_score(data=gb_wp_test_probs, target=gb_wp['label'])
get_weighted_score(data=gb_reuters_test_probs, target=gb_reuters['label'])
get_weighted_score(data=gb_essay_test_probs, target=gb_essay['label'])

The f1 score is 0.931820078444463.
The f1 score is 1.0.
The f1 score is 0.9366683510190331.


## score on pan and coling

In [28]:
# Ridge-blended F1 on the PAN CLEF test frame.
get_weighted_score(data=pan_test_probs, target=pan_test['label'])

The f1 score is 0.9756888168557536.


In [29]:
# Ridge-blended F1 on the COLING test sample.
get_weighted_score(data=coling_test_probs, target=coling_test['label'])

The f1 score is 0.8240043057050592.
