In [1]:
!pip install transformers[torch]
!pip install datasets
!pip install sentence_transformers
!pip install numba

import pandas as pd
import numpy as np
from numba import cuda
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import Dataset
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from tqdm import tqdm
import torch



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
df1=pd.read_csv(r'/content/drive/MyDrive/Trailadata_preprocessed.csv')

df2=pd.read_csv(r'/content/drive/MyDrive/preprocessedexp_dataset.csv')

In [4]:
df1.head()

Unnamed: 0,File Name,Transcript,Label,tokenized_Transcript,clean_text
0,trial_truth_002.txt,"All of us, who have represented people for yea...",truthful,all of us who have represented people for year...,represent people year system letter prisioner ...
1,trial_truth_001.txt,Reforming the criminal justice system matters ...,truthful,reforming the criminal justice system matters ...,reform criminal justice system matter law legi...
2,trial_truth_006.txt,They really didn't tell me anything. They said...,truthful,they really did n't tell me anything they said...,tell say accident say accident ian fine accide...
3,trial_truth_004.txt,"I do. I was, uh... in the office, and I got a ...",truthful,i do i was uh in the office and i got a call u...,uh office get uh maybe close o clock remember ...
4,trial_truth_003.txt,"But yes, I was there. Yep, I stayed. Uh ... Ye...",truthful,but yes i was there yep i stayed uh yep prob y...,yes yep stay uh yep prob yes yes


In [5]:
df1.drop(columns=['File Name','Transcript','tokenized_Transcript'],inplace=True)

In [6]:
df1['Type'] = 'A'

In [7]:
df2['Type'] = 'B'

In [8]:
df2.drop(columns=['File Name','Transcript','tokenized_Transcript'],inplace=True)

In [9]:
df1.head()

Unnamed: 0,Label,clean_text,Type
0,truthful,represent people year system letter prisioner ...,A
1,truthful,reform criminal justice system matter law legi...,A
2,truthful,tell say accident say accident ian fine accide...,A
3,truthful,uh office get uh maybe close o clock remember ...,A
4,truthful,yes yep stay uh yep prob yes yes,A


In [10]:
df2.head()

Unnamed: 0.1,Unnamed: 0,Label,clean_text,Type
0,0,truthful,recently visit paris moment feel like scene ro...,B
1,1,truthful,step fruit u s o time accelerate travel san fr...,B
2,2,truthful,trip u s incredible landing new york city feel...,B
3,3,truthful,visit usa time like enter new world mention me...,B
4,4,truthful,step plane usa time fill anticipation visit wa...,B


In [11]:
combined_df = pd.concat([df1, df2], ignore_index=True)

In [12]:
combined_df.head()

Unnamed: 0.1,Label,clean_text,Type,Unnamed: 0
0,truthful,represent people year system letter prisioner ...,A,
1,truthful,reform criminal justice system matter law legi...,A,
2,truthful,tell say accident say accident ian fine accide...,A,
3,truthful,uh office get uh maybe close o clock remember ...,A,
4,truthful,yes yep stay uh yep prob yes yes,A,


In [13]:
combined_df.shape

(1147, 4)

In [14]:
combined_df.drop(columns=['Unnamed: 0'],inplace=True)

In [15]:
combined_df.head()

Unnamed: 0,Label,clean_text,Type
0,truthful,represent people year system letter prisioner ...,A
1,truthful,reform criminal justice system matter law legi...,A
2,truthful,tell say accident say accident ian fine accide...,A
3,truthful,uh office get uh maybe close o clock remember ...,A
4,truthful,yes yep stay uh yep prob yes yes,A


In [16]:
import spacy
from collections import Counter

!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [17]:
# Load Spacy's English language model
nlp = spacy.load("en_core_web_lg")

# Define a function to tokenize text
def tokenize(txt):
    # Remove punctuation and double space, and convert to lowercase:
    tokens = [token.text.lower() for token in txt if not (token.is_punct or token.is_space)]
    # Join tokens back into a string
    tokens = " ".join(tokens)
    return tokens

In [18]:
# Apply the preprocessing pipeline using nlp.pipe
tokenized_txt = []

for txt in nlp.pipe(combined_df['clean_text']):
    tokenized_txt.append(tokenize(txt))

# Add the preprocessed text as a new column in the dataframe
combined_df['tokenized_Transcript'] = tokenized_txt

combined_df.drop(columns=['clean_text'],inplace=True)

In [19]:
combined_df.head()

Unnamed: 0,Label,Type,tokenized_Transcript
0,truthful,A,represent people year system letter prisioner ...
1,truthful,A,reform criminal justice system matter law legi...
2,truthful,A,tell say accident say accident ian fine accide...
3,truthful,A,uh office get uh maybe close o clock remember ...
4,truthful,A,yes yep stay uh yep prob yes yes


In [27]:
combined_df['Label'] = combined_df['Label'].map({'truthful': 1, 'decpetive': 0})
combined_df['Label'].fillna(0, inplace=True)
combined_df['Label'] = combined_df['Label'].astype(int)
from transformers import T5Tokenizer, T5ForConditionalGeneration
# Preprocess function to tokenize input text and format labels
def preprocess_function(examples):
    inputs = examples["tokenized_Transcript"]
    targets = examples["Label"]

    # Convert labels to lists
    targets = [[label] for label in targets]

    # Ensure that inputs are lists of strings
    if isinstance(inputs, str):
        inputs = [inputs]

    # Tokenize inputs
    inputs_tokenized = tokenizer(inputs, padding=True, truncation=True)['input_ids']

    return {'input_ids': inputs_tokenized, 'labels': targets}


def create_dataset(df1, seed=42, cv=10):
    train = {f'split_{i+1}': [] for i in range(cv)}
    test = {f'split_{i+1}': [] for i in range(cv)}

    np.random.seed(seed)

    for k in train.keys():
        train[k] = combined_df.sample(frac=0.8, random_state=seed)
        test[k] = combined_df.drop(train[k].index)

    return train, test
seed = 42
numcv = 10
num_epochs = 3
model_size = 'small'
!pip install --upgrade transformers accelerate
import accelerate
from sklearn.metrics import accuracy_score
train, test = create_dataset(combined_df, seed=seed, cv=numcv)
results = {}
collect_result = {}
ground_truth = {}
# Define the device
device = "cuda" if torch.cuda.is_available() else "cpu"

NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968

In [24]:
from transformers import AutoModelForSeq2SeqLM
for sp in tqdm(train.keys()):
    data_train = Dataset.from_pandas(train[sp])
    data_test = Dataset.from_pandas(test[sp])
    model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
    tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
    # Tokenize and preprocess data
    data_train = data_train.map(preprocess_function, batched=True)
    data_test = data_test.map(preprocess_function, batched=True)

    training_args = Seq2SeqTrainingArguments(
        output_dir="./results",  # The output directory
        logging_dir="./logs",
        evaluation_strategy="epoch",  # Evaluation is done at the end of each epoch
        learning_rate=5e-4,
        per_device_train_batch_size=2,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=num_epochs,
        predict_with_generate=True,
        fp16=False,
        push_to_hub=False,
    )
    # Define trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=data_train,
        eval_dataset=data_test,
        tokenizer=tokenizer,
    )

    # Train the model
    trainer.train()
    # Get predictions
    predictions = trainer.predict(data_test)
    pred_labels = np.argmax(predictions.predictions, axis=1)

    # Store predictions in the original DataFrame
    combined_df.loc[test[sp].index, 'Prediction'] = pred_labels

    # Clean up
    del model
    torch.cuda.empty_cache()



  0%|          | 0/10 [00:00<?, ?it/s]

Map:   0%|          | 0/918 [00:00<?, ? examples/s]

Map:   0%|          | 0/229 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,1.098547
2,3.264500,1.104169
3,1.500500,1.723615




 10%|█         | 1/10 [02:49<25:22, 169.13s/it]

Map:   0%|          | 0/918 [00:00<?, ? examples/s]

Map:   0%|          | 0/229 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,1.098547
2,3.264500,1.104169
3,1.500500,1.723615




 20%|██        | 2/10 [05:15<20:44, 155.51s/it]

Map:   0%|          | 0/918 [00:00<?, ? examples/s]

Map:   0%|          | 0/229 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,1.098547
2,3.264500,1.104169
3,1.500500,1.723615




 30%|███       | 3/10 [07:42<17:42, 151.79s/it]

Map:   0%|          | 0/918 [00:00<?, ? examples/s]

Map:   0%|          | 0/229 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,1.098547
2,3.264500,1.104169
3,1.500500,1.723615




 40%|████      | 4/10 [10:08<14:58, 149.68s/it]

Map:   0%|          | 0/918 [00:00<?, ? examples/s]

Map:   0%|          | 0/229 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,1.098547
2,3.264500,1.104169
3,1.500500,1.723615




 50%|█████     | 5/10 [12:36<12:24, 148.85s/it]

Map:   0%|          | 0/918 [00:00<?, ? examples/s]

Map:   0%|          | 0/229 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,1.098547
2,3.264500,1.104169
3,1.500500,1.723615




 60%|██████    | 6/10 [15:15<10:08, 152.24s/it]

Map:   0%|          | 0/918 [00:00<?, ? examples/s]

Map:   0%|          | 0/229 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,1.098547
2,3.264500,1.104169
3,1.500500,1.723615




 70%|███████   | 7/10 [17:48<07:37, 152.56s/it]

Map:   0%|          | 0/918 [00:00<?, ? examples/s]

Map:   0%|          | 0/229 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,1.098547
2,3.264500,1.104169
3,1.500500,1.723615




 80%|████████  | 8/10 [20:24<05:07, 153.79s/it]

Map:   0%|          | 0/918 [00:00<?, ? examples/s]

Map:   0%|          | 0/229 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,1.098547
2,3.264500,1.104169
3,1.500500,1.723615




 90%|█████████ | 9/10 [23:14<02:38, 158.86s/it]

Map:   0%|          | 0/918 [00:00<?, ? examples/s]

Map:   0%|          | 0/229 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,1.098547
2,3.264500,1.104169
3,1.500500,1.723615




100%|██████████| 10/10 [25:45<00:00, 154.55s/it]


In [31]:
# Display final DataFrame with predictions
combined_df['Prediction'].fillna(0, inplace=True)
print("Final DataFrame with Predictions:")
print(combined_df)

# Assuming ground_truth is defined outside the loop
# Calculate accuracy
ground_truth = combined_df.loc[test[sp].index,'Label']
test_accuracy = accuracy_score(ground_truth, combined_df.loc[test[sp].index, 'Prediction'])
print("Test Accuracy:", test_accuracy)

Final DataFrame with Predictions:
      Label Type                               tokenized_Transcript  \
0         0    A  represent people year system letter prisioner ...   
1         0    A  reform criminal justice system matter law legi...   
2         0    A  tell say accident say accident ian fine accide...   
3         0    A  uh office get uh maybe close o clock remember ...   
4         0    A                   yes yep stay uh yep prob yes yes   
...     ...  ...                                                ...   
1142      0    B  travel experience visit dark hedge northern ir...   
1143      0    B  travel experience visit muchu picchu peru actu...   
1144      0    B  travel experience visit hallstatt austria actu...   
1145      0    B  trip visit jiuzhaigou valley china actually na...   
1146      0    B  trip visit bagan myanmar actually old city bud...   

      Prediction  
0            0.0  
1            0.0  
2            0.0  
3            0.0  
4            0.0  

In [32]:
file_path = 'CombinedSmall.csv'
df1.to_csv(file_path, index=False)

In [33]:
"""# Display final DataFrame with predictions
combined_df['Prediction'].fillna(0, inplace=True)
print("Final DataFrame with Predictions:")
print(combined_df)

# Assuming ground_truth is defined outside the loop
# Calculate accuracy
ground_truth = combined_df.loc[test[sp].index,'Label']
test_accuracy = accuracy_score(ground_truth, combined_df.loc[test[sp].index, 'Prediction'])
print("Test Accuracy:", test_accuracy)"""

'# Display final DataFrame with predictions\ncombined_df[\'Prediction\'].fillna(0, inplace=True)\nprint("Final DataFrame with Predictions:")\nprint(combined_df)\n\n# Assuming ground_truth is defined outside the loop\n# Calculate accuracy\nground_truth = combined_df.loc[test[sp].index,\'Label\']\ntest_accuracy = accuracy_score(ground_truth, combined_df.loc[test[sp].index, \'Prediction\'])\nprint("Test Accuracy:", test_accuracy)'

In [34]:
"""# Display final DataFrame with predictions
combined_df['Prediction'].fillna(0, inplace=True)
print("Final DataFrame with Predictions:")
print(combined_df)

# Assuming ground_truth is defined outside the loop
# Calculate accuracy
ground_truth = combined_df.loc[test[sp].index,'Label']
test_accuracy = accuracy_score(ground_truth, combined_df.loc[test[sp].index, 'Prediction'])
print("Test Accuracy:", test_accuracy)"""

'# Display final DataFrame with predictions\ncombined_df[\'Prediction\'].fillna(0, inplace=True)\nprint("Final DataFrame with Predictions:")\nprint(combined_df)\n\n# Assuming ground_truth is defined outside the loop\n# Calculate accuracy\nground_truth = combined_df.loc[test[sp].index,\'Label\']\ntest_accuracy = accuracy_score(ground_truth, combined_df.loc[test[sp].index, \'Prediction\'])\nprint("Test Accuracy:", test_accuracy)'

In [36]:
"""# Display final DataFrame with predictions
combined_df['Prediction'].fillna(0, inplace=True)
print("Final DataFrame with Predictions:")
print(combined_df)"""

'# Display final DataFrame with predictions\ncombined_df[\'Prediction\'].fillna(0, inplace=True)\nprint("Final DataFrame with Predictions:")\nprint(combined_df)'