In [1]:
!pip install transformers[torch]
!pip install datasets
!pip install sentence_transformers
!pip install numba

import pandas as pd
import numpy as np
from numba import cuda
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import Dataset
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from tqdm import tqdm
import torch


Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.29.2-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [2]:
from google.colab import drive
drive.mount('/content/drive')

df1=pd.read_csv(r'/content/drive/MyDrive/preprocessedexp_dataset.csv')
df1.head()
df1.drop(columns=['File Name','Transcript','tokenized_Transcript'],inplace=True)

df1.head()


Mounted at /content/drive


Unnamed: 0.1,Unnamed: 0,Label,clean_text
0,0,truthful,recently visit paris moment feel like scene ro...
1,1,truthful,step fruit u s o time accelerate travel san fr...
2,2,truthful,trip u s incredible landing new york city feel...
3,3,truthful,visit usa time like enter new world mention me...
4,4,truthful,step plane usa time fill anticipation visit wa...


In [3]:
import spacy
from collections import Counter
!python -m spacy download en_core_web_lg
# Load Spacy's English language model
nlp = spacy.load("en_core_web_lg")


Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
# Define a function to tokenize text
def tokenize(txt):
    # Remove punctuation and double space, and convert to lowercase:
    tokens = [token.text.lower() for token in txt if not (token.is_punct or token.is_space)]
    # Join tokens back into a string
    tokens = " ".join(tokens)
    return tokens

# Apply the preprocessing pipeline using nlp.pipe
tokenized_txt = []

for txt in nlp.pipe(df1['clean_text']):
    tokenized_txt.append(tokenize(txt))



In [5]:

# Add the preprocessed text as a new column in the dataframe
df1['tokenized_Transcript'] = tokenized_txt
df1.head()
df1.drop(columns=['clean_text','Unnamed: 0'],inplace=True)
df1.head()
df1.isna().sum()
df1['Label'] = df1['Label'].map({'truthful': 1, 'decpetive': 0})
df1['Label'].fillna(0, inplace=True)
df1['Label'] = df1['Label'].astype(int)
from transformers import T5Tokenizer, T5ForConditionalGeneration
# Preprocess function to tokenize input text and format labels
def preprocess_function(examples):
    inputs = examples["tokenized_Transcript"]
    targets = examples["Label"]

    # Convert labels to lists
    targets = [[label] for label in targets]

    # Ensure that inputs are lists of strings
    if isinstance(inputs, str):
        inputs = [inputs]

    # Tokenize inputs
    inputs_tokenized = tokenizer(inputs, padding=True, truncation=True)['input_ids']

    return {'input_ids': inputs_tokenized, 'labels': targets}




In [6]:
def create_dataset(df1, seed=42, cv=10):
    train = {f'split_{i+1}': [] for i in range(cv)}
    test = {f'split_{i+1}': [] for i in range(cv)}

    np.random.seed(seed)

    for k in train.keys():
        train[k] = df1.sample(frac=0.8, random_state=seed)
        test[k] = df1.drop(train[k].index)

    return train, test


In [7]:
seed = 42
numcv = 10
num_epochs = 3
model_size = 'large'
!pip install accelerate
import accelerate
from sklearn.metrics import accuracy_score
train, test = create_dataset(df1, seed=seed, cv=numcv)
results = {}
collect_result = {}
ground_truth = {}




In [8]:
# Define the device
device = "cuda" if torch.cuda.is_available() else "cpu"
for sp in tqdm(train.keys()):
    data_train = Dataset.from_pandas(train[sp])
    data_test = Dataset.from_pandas(test[sp])
    tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
    model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large", device_map="auto")
    # Tokenize and preprocess data
    data_train = data_train.map(preprocess_function, batched=True)
    data_test = data_test.map(preprocess_function, batched=True)

    training_args = Seq2SeqTrainingArguments(
        output_dir="./results",  # The output directory
        logging_dir="./logs",
        evaluation_strategy="epoch",  # Evaluation is done at the end of each epoch
        learning_rate=5e-4,
        per_device_train_batch_size=2,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=num_epochs,
        predict_with_generate=True,
        fp16=False,
        push_to_hub=False,
    )
    # Define trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=data_train,
        eval_dataset=data_test,
        tokenizer=tokenizer,
    )
        # Train the model
    trainer.train()
    # Get predictions
    predictions = trainer.predict(data_test)
    pred_labels = np.argmax(predictions.predictions, axis=1)

    # Store predictions in the original DataFrame
    df1.loc[test[sp].index, 'Prediction'] = pred_labels
    df1['Prediction'].fillna(0, inplace=True)

    # Clean up
    del model
    torch.cuda.empty_cache()





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/821 [00:00<?, ? examples/s]

Map:   0%|          | 0/205 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,0.641431
2,2.433300,1.423164
3,1.124000,0.653452




 10%|█         | 1/10 [13:03<1:57:35, 783.98s/it]Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/821 [00:00<?, ? examples/s]

Map:   0%|          | 0/205 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,0.641431
2,2.433300,1.423164
3,1.124000,0.653452


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


 20%|██        | 2/10 [25:50<1:43:10, 773.86s/it]Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/821 [00:00<?, ? examples/s]

Map:   0%|          | 0/205 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,0.641431
2,2.433300,1.423164
3,1.124000,0.653452


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


 30%|███       | 3/10 [38:36<1:29:50, 770.03s/it]Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/821 [00:00<?, ? examples/s]

Map:   0%|          | 0/205 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,0.641431
2,2.433300,1.423164
3,1.124000,0.653452


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


 40%|████      | 4/10 [51:23<1:16:54, 769.10s/it]Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/821 [00:00<?, ? examples/s]

Map:   0%|          | 0/205 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,0.641431
2,2.433300,1.423164
3,1.124000,0.653452


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


 50%|█████     | 5/10 [1:04:02<1:03:46, 765.33s/it]Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/821 [00:00<?, ? examples/s]

Map:   0%|          | 0/205 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,0.641431
2,2.433300,1.423164
3,1.124000,0.653452


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


 60%|██████    | 6/10 [1:16:44<50:56, 764.17s/it]  Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/821 [00:00<?, ? examples/s]

Map:   0%|          | 0/205 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,0.641431
2,2.433300,1.423164


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


Epoch,Training Loss,Validation Loss
1,No log,0.641431
2,2.433300,1.423164
3,1.124000,0.653452




 70%|███████   | 7/10 [1:29:25<38:09, 763.08s/it]Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/821 [00:00<?, ? examples/s]

Map:   0%|          | 0/205 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,0.641431
2,2.433300,1.423164
3,1.124000,0.653452


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


 80%|████████  | 8/10 [1:42:03<25:23, 761.66s/it]Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/821 [00:00<?, ? examples/s]

Map:   0%|          | 0/205 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,0.641431
2,2.433300,1.423164
3,1.124000,0.653452


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


 90%|█████████ | 9/10 [1:54:44<12:41, 761.38s/it]Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/821 [00:00<?, ? examples/s]

Map:   0%|          | 0/205 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,0.641431
2,2.433300,1.423164
3,1.124000,0.653452


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


100%|██████████| 10/10 [2:07:21<00:00, 764.14s/it]


In [10]:
# Display final DataFrame with predictions
print("Final DataFrame with Predictions:")
print(df1)

# Assuming ground_truth is defined outside the loop
# Calculate accuracy
ground_truth = df1.loc[test[sp].index, 'Label']
test_accuracy = accuracy_score(ground_truth, df1.loc[test[sp].index, 'Prediction'])
print("Test Accuracy:",test_accuracy)

Final DataFrame with Predictions:
      Label                               tokenized_Transcript  Prediction
0         1  recently visit paris moment feel like scene ro...         0.0
1         1  step fruit u s o time accelerate travel san fr...         0.0
2         1  trip u s incredible landing new york city feel...         0.0
3         1  visit usa time like enter new world mention me...         0.0
4         1  step plane usa time fill anticipation visit wa...         0.0
...     ...                                                ...         ...
1021      0  travel experience visit dark hedge northern ir...         0.0
1022      0  travel experience visit muchu picchu peru actu...         0.0
1023      0  travel experience visit hallstatt austria actu...         0.0
1024      0  trip visit jiuzhaigou valley china actually na...         0.0
1025      0  trip visit bagan myanmar actually old city bud...         0.0

[1026 rows x 3 columns]
Test Accuracy: 0.6926829268292682


In [12]:
"""# Display final DataFrame with predictions
print("Final DataFrame with Predictions:")
print(df1)

# Assuming ground_truth is defined outside the loop
# Calculate accuracy
ground_truth = df1.loc[test[sp].index, 'Label']
test_accuracy = accuracy_score(ground_truth, df1.loc[test[sp].index, 'Prediction'])
print("Test Accuracy:", test_accuracy)"""

'# Display final DataFrame with predictions\nprint("Final DataFrame with Predictions:")\nprint(df1)\n\n# Assuming ground_truth is defined outside the loop\n# Calculate accuracy\nground_truth = df1.loc[test[sp].index, \'Label\']\ntest_accuracy = accuracy_score(ground_truth, df1.loc[test[sp].index, \'Prediction\'])\nprint("Test Accuracy:",\xa0test_accuracy)'