# Initialize

In [18]:
#Technical stuff, mounting to drive, supressing some non useful warnings, and positioning into a certain directory

from google.colab import drive
drive.mount("/content/drive")

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import os
%pwd

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'/content/drive/My Drive/Stage_Wassim_2024/Git_mock/Datasets'

In [19]:
# Used libraries
# General
import numpy as np                              # math stuff
import pandas as pd                             # data manipulation
import matplotlib.pyplot as plt                 # plotting purposes(taking over the world)
import random                                   # random / seeds
from tqdm import tqdm                           # QOL looping
import json                                     # nested dictionaries, csv will mess up the data loader
import inspect
from typing import AnyStr, List                 #formatting stuff


# Classifying stuff
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer #TF_IDF
from sklearn.feature_extraction.text import CountVectorizer #Bag of words
from sklearn.linear_model import LogisticRegression #LR
from sklearn.metrics import precision_recall_fscore_support #All metrics
from sklearn.model_selection import KFold
#from bpemb import BPEmb #tokenize

In [20]:
# VARIABLES
seed =  1000
lr = 0.000001351
weight_decay = 0.1
warmup_steps = 300
batch_size = 4
n_epochs = 3

ff_dim = 256
n_heads = 1
n_layers = 1
dropout_prob = 0.1

In [21]:
#setting the seed

random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

# Functions

## Data related functions


In [22]:
LABELS = {
    'non-check-worthy': 0,
    'check-worthy': 1
}

In [23]:
def read_citation_detection_jsonl_single_line(jsonl_file: AnyStr):

    with open(jsonl_file) as f:
        data = [json.loads(l.strip()) for l in f]

    # Get sentences and labels
    dataset = [[s['text'], LABELS[s['label']]] for d in tqdm(data, desc="Processing data") for s in d['samples']]

    return pd.DataFrame(dataset, columns=['text', 'label'])

In [24]:
def load_data() :
  # checking if I have the data locally
  if os.path.isfile('train.json') and os.path.isfile('test.json') and os.path.isfile('val.json') :
    # we load the data directly
    cw_train = pd.read_json('train.json')
    cw_test = pd.read_json('test.json')
    cw_val = pd.read_json('val.json')
  # if we don't have the data locally
  else :
    # we fetch the dataset
    CiteWorth = load_dataset('copenlu/citeworth')
    # we put the subset into data frames
    cw_train = pd.DataFrame(CiteWorth['train'])
    cw_test = pd.DataFrame(CiteWorth['test'])
    cw_val = pd.DataFrame(CiteWorth['validation'])
    # we save the subsets locally
    cw_train.to_json('train.json')
    cw_test.to_json('test.json')
    cw_val.to_json('val.json')
  return cw_train, cw_test, cw_val

In [25]:
def format_data(df) :
  samplesList = [item for row in tqdm(df['samples'], total=len(df), desc="Iterating over rows") for item in row]
  formatted = pd.DataFrame(samplesList)
  formatted = formatted[['text','label']]
  formatted.label.replace({'non-check-worthy':0, 'check-worthy':1}, inplace=True)
  formatted.label = formatted.label.astype(np.int32)
  return formatted

In [26]:
def check_data(df) :
  variable_name = [name for name, var in inspect.currentframe().f_back.f_locals.items() if var is df][0]
  print("Summary of the dataframe : (total="+str(df.shape[0])+"):")
  print("unique data : ")
  display(df[['text',	'label']].nunique())
  print('--------------------------------------------------------------')
  print(f'Dataset shape : {df.shape[0]}')
  print('--------------------------------------------------------------')
  print('Number of non-check worthy : ',df[df['label'] == 0].shape[0])
  print('Number of check worthy : ',df[df['label'] == 1].shape[0])
  print('--------------------------------------------------------------')
  print('Number of NaN : ')
  print(df.isna().sum() )
  print('--------------------------------------------------------------')
  print(f'Number of duplicates : {df.duplicated().sum()}')
  print('In percentages',(df.duplicated().sum()/df.shape[0]*100),'%')
  print()
  display(df[df.duplicated(keep=False)].sort_values(by=list(df.columns)))
  print('--------------------------------------------------------------')
  print(f"Number of same text but different labels : {df.duplicated(subset='text',keep=False).sum()} ")
  print('In percentages',(df.duplicated(subset='text',keep=False).sum()/df.shape[0]*100),'%')
  print()
  display(df[df.duplicated(subset='text',keep=False)].sort_values(by=list(df.columns)))
  print('--------------------------------------------------------------')


In [27]:
def clean_data(df, full= False) :
  # Deleting the null objects
  df = df.dropna()
  df.reset_index(drop=True, inplace=True)
  # deleting the duplicates
  df.drop_duplicates(inplace=True)
  df.reset_index(drop=True, inplace=True)
  #deleting same text different labels
  if full :
    df.drop_duplicates(subset=['text'],keep= False, inplace=True)
    df.reset_index(drop=True, inplace=True)
  print ('Done !')
  return(df)

In [28]:
def plotting_data(train,test,val) :

  #General stuff
  colors = ["firebrick","yellowgreen"]
  labels = {0: "non check worthy (0)", 1: "check worthy (1)"}
  datasets = [train, test, val]
  titles = ['Train', 'Test', 'Validation']
  #Plotting
  fig, axs = plt.subplots(nrows=1,ncols=3, figsize=(12,12))

  for i, (dataset, title) in enumerate(zip(datasets, titles)):
      ax = axs[i]
      ax.pie(dataset['label'].value_counts().values,
               labels=[labels[label] for label in dataset['label'].value_counts().index],
               autopct=lambda x: str(round(x, 2)) + '%',
               colors=colors)
      for j, proportion in enumerate(dataset['label'].value_counts()):
        if j == 0 :
          ax.text(x=-0.50, y=0.25, s=str(round(proportion)))
        if j == 1 :
          ax.text(x=0.10, y=-0.75, s=str(round(proportion)))
      ax.set_title(title)

  axs[1].annotate("CiteWorth label distribution", xy=(0.5, -0.05), xycoords="axes fraction", ha="center", va="center", fontsize=16)
  plt.show()


In [72]:
def to_Xy(df, reshape=False) :
  if reshape :
    X = df.values[:,0].reshape(df.shape[0],1)
    y = df.values[:,1].reshape(df.shape[0],1)
    y = y.astype('int32')
  else :
    X = df.values[:,0]
    y = df.values[:,1]
    y = y.astype('int32')
  return X,y

## Experiment related functions


In [30]:
def exp_LR(train, test, C) :

  X_train,y_train = to_Xy(train)
  X_test,y_test = to_Xy(test)
  # Vectorizing
  print("Vectorizing data...")
  vectorizer = TfidfVectorizer()

  vectorizer.fit(X_train)

  Xtrain = vectorizer.transform(X_train)
  Xtest = vectorizer.transform(X_test)

  print("Data vectorized.")
  #Classifying
  print("Training classifier...")
  classifier = LogisticRegression(penalty='l2', C=C, warm_start=True, class_weight='balanced')
  classifier.fit(Xtrain, y_train)
  print("Classifier trained.")
  print("Predicting on test data...")
  predicts = classifier.predict(Xtest)
  print("Prediction complete.")
  print("Calculating metrics...")
  P,R,F1,_ = precision_recall_fscore_support(y_test, predicts, average='binary')
  # wandb.run.summary[f'test-P'] = P
  # wandb.run.summary[f'test-R'] = R
  # wandb.run.summary[f'test-F1'] = F1
  print(f'test-P : {P}')
  print(f'test-R : {R}')
  print(f'test-F1 : {F1}')


In [88]:
def exp_LR_2(ds1, ds2,  C, n_splits=10):

    print("Initializing..")
    precision_scores, recall_scores, f1_scores = [], [], []
    X_GIGA, y_GIGA = to_Xy(ds1)
    vectorizer = TfidfVectorizer()
    vectorizer.fit(X_GIGA)
    Xtrain = vectorizer.transform(X_GIGA)

    # Classifying with k-fold cross-validation
    print("Training classifier with k-fold cross-validation...")
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    for fold_num, (train_index, test_index) in enumerate(kf.split(ds2), start = 1):
      print("###########################################################################################################################")
      print(f"\r\r\r\r\r Fold {fold_num}/{10} ")
      print("###########################################################################################################################")
    # Split the data into training and test sets for this fold
      print('splitting..')
      train_dset_fold = ds2.iloc[train_index]
      test_dset_fold = ds2.iloc[test_index]
    #create a vlaidation subsets :

      X_test,y_test = to_Xy(test_dset_fold)
    # Initialize the vectorizer
      print('vectorizing..')
    #vectorizing
      Xtest = vectorizer.transform(X_test)
    #classifying
      print('classifying..')
      classifier = LogisticRegression(penalty='l2', C=C, warm_start=True, class_weight='balanced')
      classifier.fit(Xtrain, y_GIGA)

      predicts = classifier.predict(Xtest)
      precision, recall, f1, _ = precision_recall_fscore_support(y_test, predicts, average='binary')
      precision_scores.append(precision)
      recall_scores.append(recall)
      f1_scores.append(f1)
      print(f"{fold_num} Done !")

    mean_precision = np.mean(precision_scores)
    mean_recall = np.mean(recall_scores)
    mean_f1 = np.mean(f1_scores)

    print("Mean Precision :", mean_precision)
    print("Mean Recall : ", mean_recall)
    print("Mean F1 score : ", mean_f1)
    print("Precision scores for each fold : ", precision_scores)
    print("Recall scores for each fold : ", recall_scores)
    print("F1 scores for each fold : ", f1_scores)

In [77]:
def exp_LR_3(ds1, C, n_splits=10):

    precision_scores, recall_scores, f1_scores = [], [], []

    # Classifying with k-fold cross-validation
    print("Training classifier with k-fold cross-validation...")
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    for fold_num, (train_index, test_index) in enumerate(kf.split(ds1), start = 1):
      print("###########################################################################################################################")
      print(f"\r\r\r\r\r Fold {fold_num}/{10} ")
      print("###########################################################################################################################")
    # Split the data into training and test sets for this fold
      print('splitting..')
      train_dset_fold = ds1.iloc[train_index]
      test_dset_fold = ds1.iloc[test_index]
    #create subsets :
      X_train,y_train = to_Xy(train_dset_fold)
      X_test,y_test = to_Xy(test_dset_fold)
    # Initialize the vectorizer
      print('vectorizing..')
      vectorizer = TfidfVectorizer()
    #vectorizing
      vectorizer.fit(X_train)

      Xtrain = vectorizer.transform(X_train)
      Xtest = vectorizer.transform(X_test)
    #classifying
      print('classifying..')
      classifier = LogisticRegression(penalty='l2', C=C, warm_start=True, class_weight='balanced')
      classifier.fit(Xtrain, y_train)

      predicts = classifier.predict(Xtest)
      precision, recall, f1, _ = precision_recall_fscore_support(y_test, predicts, average='binary')
      precision_scores.append(precision)
      recall_scores.append(recall)
      f1_scores.append(f1)
      print(f"{fold_num} Done !")
    mean_precision = np.mean(precision_scores)
    mean_recall = np.mean(recall_scores)
    mean_f1 = np.mean(f1_scores)

    print("Mean Precision :", mean_precision)
    print("Mean Recall : ", mean_recall)
    print("Mean F1 score : ", mean_f1)
    print("Precision scores for each fold : ", precision_scores)
    print("Recall scores for each fold : ", recall_scores)
    print("F1 scores for each fold : ", f1_scores)

# Data

In [34]:
## Use this Cell to position yourself at your data folder
#%cd Use This cell to access your data folder
%ls

[Errno 2] No such file or directory: 'drive/MyDrive/Stage_Wassim_2024/Git_mock/Datasets'
/content/drive/MyDrive/Stage_Wassim_2024/Git_mock/Datasets
test.jsonl  train.jsonl  twt_FULL.jsonl  val.jsonl


In [35]:
train  = read_citation_detection_jsonl_single_line('train.jsonl')
test = read_citation_detection_jsonl_single_line('test.jsonl')
val = read_citation_detection_jsonl_single_line('val.jsonl')

train_clean = clean_data(train)
test_clean = clean_data(test)
val_clean = clean_data(val)
train_clean = clean_data(train, full=True)
val_clean = clean_data(val, full=True)

twtfull = read_citation_detection_jsonl_single_line('twt_FULL.jsonl')

Processing data: 100%|██████████| 169015/169015 [00:01<00:00, 89762.22it/s] 
Processing data: 100%|██████████| 20995/20995 [00:00<00:00, 45694.04it/s]
Processing data: 100%|██████████| 20990/20990 [00:00<00:00, 45239.14it/s]


Done !
Done !
Done !
Done !
Done !


Processing data: 100%|██████████| 415/415 [00:00<00:00, 310883.40it/s]


# Experiments

## Experiment 1

In [17]:
exp_LR(train_clean,test_clean, C = 0.1151)

Done !
Done !
Vectorizing data...
Data vectorized.
Training classifier...
Classifier trained.
Predicting on test data...
Prediction complete.
Calculating metrics...
test-P : 0.46659173707464535
test-R : 0.6485656565656566
test-F1 : 0.5427312378140179


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Experiment 2


In [89]:
exp_LR_2(train_clean, twtfull, C = 0.1151)

Initializing..
Training classifier with k-fold cross-validation...
###########################################################################################################################
 Fold 1/10 
###########################################################################################################################
splitting..
vectorizing..
classifying..


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


1 Done !
###########################################################################################################################
 Fold 2/10 
###########################################################################################################################
splitting..
vectorizing..
classifying..


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


2 Done !
###########################################################################################################################
 Fold 3/10 
###########################################################################################################################
splitting..
vectorizing..
classifying..


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


3 Done !
###########################################################################################################################
 Fold 4/10 
###########################################################################################################################
splitting..
vectorizing..
classifying..


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


4 Done !
###########################################################################################################################
 Fold 5/10 
###########################################################################################################################
splitting..
vectorizing..
classifying..


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


5 Done !
###########################################################################################################################
 Fold 6/10 
###########################################################################################################################
splitting..
vectorizing..
classifying..


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


6 Done !
###########################################################################################################################
 Fold 7/10 
###########################################################################################################################
splitting..
vectorizing..
classifying..


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


7 Done !
###########################################################################################################################
 Fold 8/10 
###########################################################################################################################
splitting..
vectorizing..
classifying..


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


8 Done !
###########################################################################################################################
 Fold 9/10 
###########################################################################################################################
splitting..
vectorizing..
classifying..


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


9 Done !
###########################################################################################################################
 Fold 10/10 
###########################################################################################################################
splitting..
vectorizing..
classifying..
10 Done !
Mean Precision : 0.4938489935491887
Mean Recall :  0.47429661535126605
Mean F1 score :  0.476742257570664
Precision scores for each fold :  [0.4166666666666667, 0.5833333333333334, 0.47368421052631576, 0.64, 0.34782608695652173, 0.35294117647058826, 0.46153846153846156, 0.6, 0.5625, 0.5]
Recall scores for each fold :  [0.5, 0.6363636363636364, 0.4090909090909091, 0.6153846153846154, 0.47058823529411764, 0.35294117647058826, 0.2857142857142857, 0.391304347826087, 0.45, 0.631578947368421]
F1 scores for each fold :  [0.45454545454545453, 0.6086956521739131, 0.43902439024390244, 0.6274509803921569, 0.39999999999999997, 0.35294117647058826, 0.35294117647058826, 0.47368421

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Experiment 3

In [93]:
exp_LR_3(twtfull, C = 0.1151)

Training classifier with k-fold cross-validation...
###########################################################################################################################
 Fold 1/10 
###########################################################################################################################
splitting..
vectorizing..
classifying..
1 Done !
###########################################################################################################################
 Fold 2/10 
###########################################################################################################################
splitting..
vectorizing..
classifying..
2 Done !
###########################################################################################################################
 Fold 3/10 
###########################################################################################################################
splitting..
vectorizing..
classifying..
3 Done !
######