# **Testing Fine-tuning & Data Preparation method**

> Now we want to make sure that our method is appropriate for our senior project

- LoRA fine-tuning or Freezing Layer or Full-parameter fine-tuning.
- Whether deleting the AI-generated structure affects performance.
- Whether data preparation, such as lowercasing or removing stop words, affects performance.



# Import dataset and library

In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv('/kaggle/input/daigt-v3-train-dataset/train_v3_drcat_01.csv')
df.head()

# Delete duplicate rows and missing values

In [None]:
# Check the different ways a 'text' value can be missing; first: null values.

df[df['text'].isnull()].head()

In [None]:
df[df['text'].apply(lambda x: isinstance(x, str) and x.strip() == '')].head()

In [None]:
# Drop rows where 'text' is NaN or None
df = df.dropna(subset=['text'])

# Drop rows where 'text' is an empty string or whitespace
df = df[df['text'].str.strip() != '']

In [None]:
#Checking that there is no more missing value
df.info()

In [None]:
# Count fully duplicated rows (rows equal to an earlier row in every column).
# Only the shape is displayed; nothing is dropped here.
df[df.duplicated()].shape

In [None]:
# Count essays that contain a newline character ('\n'), i.e. a line break.
# The original author observed that most essays contain one.
df[df['text'].str.contains('\n') == True].shape

In [None]:
# Count essays that contain a carriage-return character ('\r').
df[df['text'].str.contains('\r') == True].shape

In [None]:
# Count essays that contain a tab character ('\t').
df[df['text'].str.contains('\t') == True].shape

In [None]:
# Replace every newline, carriage return and tab with a single space,
# one space per character, in a single pass over the column.
df['text'] = df['text'].str.replace(r'[\n\r\t]', ' ', regex=True)
df.head()

In [None]:
#df.to_csv('no_missing_dataset.csv',index = False)

In [None]:
df.shape

# 1.1 Deleting the AI-Structure dataset

In [None]:
df1 = df.copy()
df1.shape

## Please provide feedback

> AI-generated essays often end with a closing request such as "Please provide feedback", which the dataset author did not remove. We therefore delete that phrase and everything after it.







In [None]:
import re
# Closing phrases that mark the end of the actual essay.
patterns = ["Please grade this", "Please provide feedback"]

# Escape each phrase and join them into one alternation regex.
combined_pattern = '|'.join(map(re.escape, patterns))

# Count the rows whose 'text' contains any of the phrases (case-insensitive).
has_closing_phrase = df1['text'].str.contains(combined_pattern, regex=True, case=False)
df1[has_closing_phrase].shape

In [None]:
#Check the result that we got.
df1[df1['text'].str.contains(combined_pattern, regex=True, case=False)].head()

In [None]:
df1.iloc[26924,0]

In [None]:
# Truncate each essay at the first closing phrase matched by `combined_pattern`,
# keeping only the text before it. Non-string values are passed through unchanged.
# The split is case-insensitive so that it removes exactly the rows that the
# case-insensitive `str.contains(..., case=False)` checks in this section report;
# a case-sensitive split would leave differently-cased phrases in place.
df1['text'] = df1['text'].apply(
    lambda x: re.split(combined_pattern, x, flags=re.IGNORECASE)[0] if isinstance(x, str) else x
)

In [None]:
#Check the row that has change
df1.iloc[26924,0]

## "Introduction" "Body" "Conclusion"


> AI-generated essays often contain these section labels.


In [None]:
# Section labels to look for in the essays.
patterns = [
    r"Introduction:", r"Body:", r"Conclusion:",
    r"Claim:", r"Evidence:", r"Evidence from the article:",
    r"Title:",
]

# Escape each label and join them into one alternation regex.
combined_pattern = '|'.join(map(re.escape, patterns))

# Count the rows whose 'text' contains any label (case-insensitive).
has_label = df1['text'].str.contains(combined_pattern, regex=True, case=False)
df1[has_label].shape

In [None]:
# Count rows of df1 that contain any of the patterns AND have label == 1
# (presumably the AI-generated class -- confirm against the dataset card).
# Both masks are built from df1 itself, so the filter never depends on the
# separate notebook-level `df` staying aligned with df1.
df1[(df1['text'].str.contains(combined_pattern, regex=True, case=False)) & (df1['label'] == 1)].shape

In [None]:
# Show the first 100 rows of df1 that contain any of the patterns AND have
# label == 1. Both masks are built from df1 itself, so the filter never
# depends on the separate notebook-level `df` staying aligned with df1.
df1[(df1['text'].str.contains(combined_pattern, regex=True, case=False)) & (df1['label'] == 1)].head(100)

In [None]:
# Function to remove the patterns from text
def remove_patterns(text):
    """Delete every match of the global ``combined_pattern`` from ``text``.

    Matching is case-insensitive. ``combined_pattern`` is looked up in the
    notebook's global scope at call time, so the function follows whichever
    pattern list was built most recently. The result has leading and trailing
    whitespace stripped.
    """
    cleaned = re.sub(combined_pattern, '', text, flags=re.IGNORECASE)
    return cleaned.strip()

# Clean only the rows that contain a pattern, regardless of label.
# The row mask is computed once and reused for both the read and the write.
needs_cleaning = df1['text'].str.contains(combined_pattern, regex=True, case=False)
df1.loc[needs_cleaning, 'text'] = df1.loc[needs_cleaning, 'text'].apply(remove_patterns)

# Count the rows that still contain any of the patterns after cleaning.
df1[df1['text'].str.contains(combined_pattern, regex=True, case=False)].shape

In [None]:
df1[df1['text'].str.contains(combined_pattern, regex=True, case=False) & (df1['label'] == 1)].shape

In [None]:
df1[df1['text'].str.contains(combined_pattern, regex=True, case=False) & (df1['label'] == 0)].head(1)

In [None]:
# Essay-outline labels to look for. Where a label appears both with and
# without a trailing colon, the colon form is listed first so the regex
# alternation tries it first and removes the colon together with the label.
patterns = [
    r"Hook:", r"Topic Sentence:", r"Thesis Statement:", r"Ending the paragraph:",
    r"1st paragraph:", r"1st paragraph",
    r"Point:", r"Example:", r"Explain:",
    r"2nd Paragraph:", r"2nd Paragraph",
    r"3rd Paragraph:", r"3rd Paragraph",
    r"4th Paragraph:", r"4th Paragraph",
    r"5th Paragraph  Reintroduce Thesis Statement:",
    r"Closing:",
]

# Escape each label and join them into one alternation regex.
combined_pattern = '|'.join(map(re.escape, patterns))

# Count the rows whose 'text' contains any label (case-insensitive).
has_outline_label = df1['text'].str.contains(combined_pattern, regex=True, case=False)
df1[has_outline_label].shape

In [None]:
# Filter the DataFrame for rows where 'text' contains any of the patterns
df1[df1['text'].str.contains(combined_pattern, regex=True, case=False)]['label'].value_counts()

In [None]:
df1.iloc[306,0]

In [None]:
# Function to remove the patterns from text
def remove_patterns(text):
    """Delete every match of the global ``combined_pattern`` from ``text``.

    Matching is case-insensitive. ``combined_pattern`` is looked up in the
    notebook's global scope at call time, so the function follows whichever
    pattern list was built most recently. The result has leading and trailing
    whitespace stripped.
    """
    cleaned = re.sub(combined_pattern, '', text, flags=re.IGNORECASE)
    return cleaned.strip()

# Clean only the rows that contain a pattern, regardless of label.
# The row mask is computed once and reused for both the read and the write.
needs_cleaning = df1['text'].str.contains(combined_pattern, regex=True, case=False)
df1.loc[needs_cleaning, 'text'] = df1.loc[needs_cleaning, 'text'].apply(remove_patterns)

# Count the rows that still contain any of the patterns after cleaning.
df1[df1['text'].str.contains(combined_pattern, regex=True, case=False)].shape

In [None]:
df1.iloc[306,0]

## [Name]


> We delete rows that contain [Name]. (The cells below are currently commented out, so this step is not applied.)



In [None]:
'''pattern = "Name]"
# Filter the DataFrame for rows where 'text' contains any of the patterns
df1[df1['text'].str.contains(pattern, regex=True, case=False)].shape'''

In [None]:
'''df1[df1['text'].str.contains(pattern, regex=True, case=False)].head(100)'''

In [None]:
# Delete rows where 'text' contains the pattern
#df1 = df1[~df1['text'].str.contains(pattern, regex=True, case=False)]

In [None]:
# Filter the DataFrame for rows where 'text' contains any of the patterns
#df1[df1['text'].str.contains(pattern, regex=True, case=False)].shape

In [None]:
# Reset the index of the DataFrame
#df1 = df1.reset_index(drop=True)

In [None]:
#df1

## Note:

In [None]:
pattern = "Note:"

# Filter the DataFrame for rows where 'text' contains any of the patterns
df1[df1['text'].str.contains(pattern, regex=True, case=False)].shape

In [None]:
df1[df1['text'].str.contains(pattern, regex=True, case=False)]

In [None]:
def remove_note(text):
    """
    Cut ``text`` at the first "Note:" or "Please note:" marker (case-insensitive).

    The marker itself and everything after it are dropped. When no marker is
    present the text is kept whole. In both cases the result is returned with
    leading and trailing whitespace stripped.
    """
    markers = (r"Note:", r"Please note:")
    note_regex = re.compile('|'.join(markers), flags=re.IGNORECASE)

    # Splitting once at the leftmost match yields [text before marker, rest];
    # without a match the single element is the whole text.
    head, *_ = note_regex.split(text, maxsplit=1)
    return head.strip()

# Apply the remove_note function to the 'text' column
df1['text'] = df1['text'].apply(remove_note)
# Count the rows that still match `pattern` after cleaning.
df1[df1['text'].str.contains(pattern, regex=True, case=False)].shape

In [None]:
df1.iloc[57020,0]

In [None]:
df1[(df1['label'] == 1) & (df1['text'].str.contains(r'--', regex=True))].head(1000)

## The dash '--'

In [None]:
# Delete every run of two or more consecutive '-' characters.
# NOTE(review): the run is replaced with an empty string, so words joined by
# "--" are fused together ("a--b" -> "ab"); confirm a space is not wanted instead.
df1['text'] = df1['text'].str.replace(r'-{2,}', '', regex=True)

In [None]:
df1[(df1['label'] == 1) & (df1['text'].str.contains(r'--', regex=True))].shape

In [None]:
df1.shape

# 1.2 Lowercase all text dataset

In [None]:
df2 = df.copy()

In [None]:
# Lowercase the data in 'text_column'
df2['text'] = df2['text'].str.lower()

df2.head()

# 1.3 Removing all Punctuation & Special Characters

In [None]:
df3 = df.copy()

In [None]:
#library that contains punctuation
import string
string.punctuation

In [None]:
# Define the function that removes punctuation
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    The characters removed are exactly those in ``string.punctuation``; all
    other characters (including non-ASCII punctuation) are kept as they are.

    ``str.translate`` deletes the characters in one pass over the string
    instead of testing each character against ``string.punctuation`` in a
    Python-level loop.
    """
    return text.translate(str.maketrans('', '', string.punctuation))

In [None]:
#storing the puntuation free text
df3['text']= df3['text'].apply(lambda x:remove_punctuation(x))
df3.head()

# 1.4 Both lowercase and remove punctuation

In [None]:
df4 = df.copy()

In [None]:
# Lowercase the data in 'text_column'
df4['text'] = df4['text'].str.lower()

df4.head()

In [None]:
#storing the puntuation free text
df4['text']= df4['text'].apply(lambda x:remove_punctuation(x))
df4.head()

# 1.5 Lowercase and Delete AI

In [None]:
df5 = df1.copy()

In [None]:
# Lowercase the data in 'text_column'
df5['text'] = df5['text'].str.lower()

df5.head()

# 1.6 Remove punctuation and delete AI

In [None]:
df6 = df1.copy()

In [None]:
#storing the puntuation free text
df6['text']= df6['text'].apply(lambda x:remove_punctuation(x))
df6.head()

# 1.7 All Remove AI,punctuation and lowercase

In [None]:
df7 = df6.copy()

In [None]:
# Lowercase the data in 'text_column'
df7['text'] = df7['text'].str.lower()

df7.head()

# Comparing all 7 types of dataset

## Training / Validation / Test set split

In [None]:
print(df1.shape)
print(df2.shape)
print(df3.shape)
print(df4.shape)
print(df5.shape)
print(df6.shape)
print(df7.shape)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# The 7 dataset variants; each one is sampled and split with the same seed.
dfs = [df1, df2, df3, df4, df5, df6, df7]

# Name of the label column (also used as the stratification target).
outcomename = 'label'

# The loop variable is deliberately not called `df`: the notebook-level `df`
# holds the cleaned source data that df2/df3/df4 are copied from, and rebinding
# it here would make any re-run of those earlier cells use the wrong frame.
for i, variant_df in enumerate(dfs, start=1):
    # Keep only the relevant columns and sample 15,000 records
    sampled_df = variant_df[['text', 'label']].sample(n=15000, random_state=2092024)

    # Separate features (X) and labels (Y) after sampling
    X_sampled = sampled_df.drop(columns=outcomename)
    Y_sampled = sampled_df[outcomename]

    # Split into training and combined validation-test sets (80% train, 20% valid/test)
    X_train, X_valid_test, y_train, y_valid_test = train_test_split(
        X_sampled, Y_sampled, test_size=0.2, random_state=2092024, stratify=Y_sampled
    )

    # Split the combined validation-test set in half: 10% validation, 10% test overall
    X_val, X_test, y_val, y_test = train_test_split(
        X_valid_test, y_valid_test, test_size=0.5, random_state=2092024, stratify=y_valid_test
    )

    # Publish each split under a dynamically built global name, because the
    # later cells refer to X_train_df1, y_train_df1, ... directly.
    globals()[f'X_train_df{i}'] = X_train
    globals()[f'y_train_df{i}'] = y_train
    globals()[f'X_val_df{i}'] = X_val
    globals()[f'y_val_df{i}'] = y_val
    globals()[f'X_test_df{i}'] = X_test
    globals()[f'y_test_df{i}'] = y_test

# Now you have X_train_df1, y_train_df1, ..., X_train_df7, y_train_df7, X_val_df1, ..., y_test_df7

In [None]:
X_train_df1.tail()

In [None]:
X_train_df2.tail()

In [None]:
# The 7 dataset variants, in order.
dfs_original = [df1, df2, df3, df4, df5, df6, df7]

# Walk over neighbouring pairs (1 vs 2, 2 vs 3, ...) and report whether their
# row indices are identical.
for position, (current_df, next_df) in enumerate(zip(dfs_original, dfs_original[1:]), start=1):
    message = (
        f"DataFrame {position} has the same index as DataFrame {position + 1}."
        if current_df.index.equals(next_df.index)
        else f"DataFrame {position} does NOT have the same index as DataFrame {position + 1}."
    )
    print(message)

## Training model

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer,AutoTokenizer

# Load DistilBERT with a 2-class sequence-classification head, plus its tokenizer.
# NOTE(review): this is the "uncased" checkpoint, whose tokenizer presumably
# lowercases its input; if so, the lowercased dataset variants would tokenize
# the same as their cased counterparts -- confirm before comparing results.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
import torch
from torch.utils.data import Dataset

# Custom dataset class
class CustomDataset(Dataset):
    """Dataset that tokenizes one text per item, at access time.

    Args:
        texts: positional-indexable collection of texts (accessed via ``.iloc``).
        labels: matching collection of labels (accessed via ``.iloc``).
        tokenizer: callable applied to a single text with truncation,
            ``max_length`` padding and ``return_tensors='pt'``.
        max_len: length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return the flattened ``input_ids``/``attention_mask`` and the long-typed label."""
        tokenizer_kwargs = dict(
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        encoded = self.tokenizer(self.texts.iloc[index], **tokenizer_kwargs)

        # The tokenizer output is flattened to drop its extra dimension.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(self.labels.iloc[index], dtype=torch.long)
        return sample

# Assuming you have a tokenizer instance already defined
# Example: tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Build the train / validation / test datasets for every variant explicitly,
# grouped per variant. All of them share the same tokenizer.

# Variant 1
train_df1_dataset = CustomDataset(X_train_df1['text'], y_train_df1, tokenizer)
val_df1_dataset = CustomDataset(X_val_df1['text'], y_val_df1, tokenizer)
test_df1_dataset = CustomDataset(X_test_df1['text'], y_test_df1, tokenizer)

# Variant 2
train_df2_dataset = CustomDataset(X_train_df2['text'], y_train_df2, tokenizer)
val_df2_dataset = CustomDataset(X_val_df2['text'], y_val_df2, tokenizer)
test_df2_dataset = CustomDataset(X_test_df2['text'], y_test_df2, tokenizer)

# Variant 3
train_df3_dataset = CustomDataset(X_train_df3['text'], y_train_df3, tokenizer)
val_df3_dataset = CustomDataset(X_val_df3['text'], y_val_df3, tokenizer)
test_df3_dataset = CustomDataset(X_test_df3['text'], y_test_df3, tokenizer)

# Variant 4
train_df4_dataset = CustomDataset(X_train_df4['text'], y_train_df4, tokenizer)
val_df4_dataset = CustomDataset(X_val_df4['text'], y_val_df4, tokenizer)
test_df4_dataset = CustomDataset(X_test_df4['text'], y_test_df4, tokenizer)

# Variant 5
train_df5_dataset = CustomDataset(X_train_df5['text'], y_train_df5, tokenizer)
val_df5_dataset = CustomDataset(X_val_df5['text'], y_val_df5, tokenizer)
test_df5_dataset = CustomDataset(X_test_df5['text'], y_test_df5, tokenizer)

# Variant 6
train_df6_dataset = CustomDataset(X_train_df6['text'], y_train_df6, tokenizer)
val_df6_dataset = CustomDataset(X_val_df6['text'], y_val_df6, tokenizer)
test_df6_dataset = CustomDataset(X_test_df6['text'], y_test_df6, tokenizer)

# Variant 7
train_df7_dataset = CustomDataset(X_train_df7['text'], y_train_df7, tokenizer)
val_df7_dataset = CustomDataset(X_val_df7['text'], y_val_df7, tokenizer)
test_df7_dataset = CustomDataset(X_test_df7['text'], y_test_df7, tokenizer)

# Now you have train_df1_dataset, val_df1_dataset, test_df1_dataset, ..., train_df7_dataset, val_df7_dataset, test_df7_dataset

In [None]:
import torch

# Pick the compute device: the CUDA GPU when one is available, otherwise the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
if use_gpu:
    print("GPU is available. Using GPU:", torch.cuda.get_device_name(0))
else:
    print("GPU not available. Using CPU instead.")

# Move the model onto the chosen device.
model.to(device)

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the global ``metric``.

    Args:
        eval_pred: a ``(logits, labels)`` pair.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max class of each
        logits row compared against the reference labels.
    """
    logits, references = eval_pred
    predicted_classes = np.argmax(logits, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [None]:
!pip install evaluate

In [None]:
!pip install --upgrade pyarrow

In [None]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

## Dataset 1

In [None]:
# Don't Show Warning Messages
import os
import warnings
warnings.filterwarnings('ignore')

os.environ["WANDB_DISABLED"] = "true"

In [None]:
from transformers import AdamW,EarlyStoppingCallback,TrainerCallback,get_linear_schedule_with_warmup

# A parameter stays trainable only if its name contains one of the substrings
# below; every other parameter is frozen. One pass sets both states.
# NOTE(review): 'transformer' presumably matches every encoder-layer parameter
# (attention as well as FFN), not just the FFN layers -- verify against
# model.named_parameters() that this is the intended freezing scheme.
for name, param in model.named_parameters():
    param.requires_grad = any(
        key in name for key in ('pre_classifier', 'classifier', 'transformer')
    )

# Training configuration for dataset 1: evaluation runs once per epoch, while
# checkpoints are written every 1000 steps into ./results.
training_args = TrainingArguments(
    eval_strategy="epoch",
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    save_strategy="steps",
    save_steps = 1000,
)

# Initialize Trainer with the dataset-1 train/validation sets; compute_metrics
# is the metric callback passed to it.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df1_dataset,
    eval_dataset=val_df1_dataset,
    tokenizer = tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

In [None]:
from torch.utils.data import DataLoader

# Assuming you've already created your test_dataset
test_loader = DataLoader(test_df1_dataset, batch_size=32, shuffle=False)

In [None]:
import torch
import numpy as np

model.eval()  # Switch the model to evaluation mode
all_predictions, all_true_labels = [], []

# Run the test set through the model without tracking gradients.
with torch.no_grad():
    for batch in test_loader:
        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        # Predicted class = index of the largest logit in each row.
        predictions = outputs.logits.argmax(dim=1)

        all_predictions.extend(predictions.cpu().numpy())
        all_true_labels.extend(batch['labels'].cpu().numpy())

# Collect everything as numpy arrays for the metric functions.
y_pred = np.array(all_predictions)
y_true = np.array(all_true_labels)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Print classification report
print(classification_report(y_true, y_pred))

# Create confusion matrix
cm = confusion_matrix(y_true, y_pred)

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

## Dataset 2

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer,AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
from transformers import AdamW,EarlyStoppingCallback,TrainerCallback,get_linear_schedule_with_warmup

# A parameter stays trainable only if its name contains one of the substrings
# below; every other parameter is frozen. One pass sets both states.
# NOTE(review): 'transformer' presumably matches every encoder-layer parameter
# (attention as well as FFN), not just the FFN layers -- verify against
# model.named_parameters() that this is the intended freezing scheme.
for name, param in model.named_parameters():
    param.requires_grad = any(
        key in name for key in ('pre_classifier', 'classifier', 'transformer')
    )
        
training_args = TrainingArguments(
    eval_strategy="epoch",
    output_dir="./results_2",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    save_strategy="steps",
    save_steps = 1000,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df2_dataset,
    eval_dataset=val_df2_dataset,
    tokenizer = tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

In [None]:
from torch.utils.data import DataLoader

# Assuming you've already created your test_dataset
test_loader = DataLoader(test_df2_dataset, batch_size=32, shuffle=False)

In [None]:
import torch
import numpy as np

model.eval()  # Switch the model to evaluation mode
all_predictions, all_true_labels = [], []

# Run the test set through the model without tracking gradients.
with torch.no_grad():
    for batch in test_loader:
        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        # Predicted class = index of the largest logit in each row.
        predictions = outputs.logits.argmax(dim=1)

        all_predictions.extend(predictions.cpu().numpy())
        all_true_labels.extend(batch['labels'].cpu().numpy())

# Collect everything as numpy arrays for the metric functions.
y_pred = np.array(all_predictions)
y_true = np.array(all_true_labels)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Print classification report
print(classification_report(y_true, y_pred))

# Create confusion matrix
cm = confusion_matrix(y_true, y_pred)

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

## Dataset3 

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer,AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
from transformers import AdamW,EarlyStoppingCallback,TrainerCallback,get_linear_schedule_with_warmup

# A parameter stays trainable only if its name contains one of the substrings
# below; every other parameter is frozen. One pass sets both states.
# NOTE(review): 'transformer' presumably matches every encoder-layer parameter
# (attention as well as FFN), not just the FFN layers -- verify against
# model.named_parameters() that this is the intended freezing scheme.
for name, param in model.named_parameters():
    param.requires_grad = any(
        key in name for key in ('pre_classifier', 'classifier', 'transformer')
    )
        
training_args = TrainingArguments(
    eval_strategy="epoch",
    output_dir="./results_3",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    save_strategy="steps",
    save_steps = 1000,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df3_dataset,
    eval_dataset=val_df3_dataset,
    tokenizer = tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

In [None]:
from torch.utils.data import DataLoader

# Assuming you've already created your test_dataset
test_loader = DataLoader(test_df3_dataset, batch_size=32, shuffle=False)

In [None]:
import torch
import numpy as np

model.eval()  # Switch the model to evaluation mode
all_predictions, all_true_labels = [], []

# Run the test set through the model without tracking gradients.
with torch.no_grad():
    for batch in test_loader:
        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        # Predicted class = index of the largest logit in each row.
        predictions = outputs.logits.argmax(dim=1)

        all_predictions.extend(predictions.cpu().numpy())
        all_true_labels.extend(batch['labels'].cpu().numpy())

# Collect everything as numpy arrays for the metric functions.
y_pred = np.array(all_predictions)
y_true = np.array(all_true_labels)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Print classification report
print(classification_report(y_true, y_pred))

# Create confusion matrix
cm = confusion_matrix(y_true, y_pred)

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

## Dataset4

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer,AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
from transformers import AdamW,EarlyStoppingCallback,TrainerCallback,get_linear_schedule_with_warmup

# A parameter stays trainable only if its name contains one of the substrings
# below; every other parameter is frozen. One pass sets both states.
# NOTE(review): 'transformer' presumably matches every encoder-layer parameter
# (attention as well as FFN), not just the FFN layers -- verify against
# model.named_parameters() that this is the intended freezing scheme.
for name, param in model.named_parameters():
    param.requires_grad = any(
        key in name for key in ('pre_classifier', 'classifier', 'transformer')
    )
        
# Training configuration for dataset 4: evaluation runs once per epoch, while
# checkpoints are written every 1000 steps. The output directory follows the
# same "./results_<n>" naming as the other dataset runs.
training_args = TrainingArguments(
    eval_strategy="epoch",
    output_dir="./results_4",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    save_strategy="steps",
    save_steps=1000,
)

# Initialize Trainer with the dataset-4 train/validation sets; compute_metrics
# is the metric callback passed to it.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df4_dataset,
    eval_dataset=val_df4_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

In [None]:
from torch.utils.data import DataLoader

# Assuming you've already created your test_dataset
test_loader = DataLoader(test_df4_dataset, batch_size=32, shuffle=False)

In [None]:
import torch
import numpy as np

model.eval()  # Switch the model to evaluation mode
all_predictions, all_true_labels = [], []

# Run the test set through the model without tracking gradients.
with torch.no_grad():
    for batch in test_loader:
        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        # Predicted class = index of the largest logit in each row.
        predictions = outputs.logits.argmax(dim=1)

        all_predictions.extend(predictions.cpu().numpy())
        all_true_labels.extend(batch['labels'].cpu().numpy())

# Collect everything as numpy arrays for the metric functions.
y_pred = np.array(all_predictions)
y_true = np.array(all_true_labels)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Print classification report
print(classification_report(y_true, y_pred))

# Create confusion matrix
cm = confusion_matrix(y_true, y_pred)

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

## Dataset5

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer,AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
from transformers import AdamW,EarlyStoppingCallback,TrainerCallback,get_linear_schedule_with_warmup

# A parameter stays trainable only if its name contains one of the substrings
# below; every other parameter is frozen. One pass sets both states.
# NOTE(review): 'transformer' presumably matches every encoder-layer parameter
# (attention as well as FFN), not just the FFN layers -- verify against
# model.named_parameters() that this is the intended freezing scheme.
for name, param in model.named_parameters():
    param.requires_grad = any(
        key in name for key in ('pre_classifier', 'classifier', 'transformer')
    )
        
training_args = TrainingArguments(
    eval_strategy="epoch",
    output_dir="./results_5",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    save_strategy="steps",
    save_steps = 1000,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df5_dataset,
    eval_dataset=val_df5_dataset,
    tokenizer = tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

In [None]:
from torch.utils.data import DataLoader

# Assuming you've already created your test_dataset
test_loader = DataLoader(test_df5_dataset, batch_size=32, shuffle=False)

In [None]:
import torch
import numpy as np

model.eval()  # Switch the model to evaluation mode
all_predictions, all_true_labels = [], []

# Run the test set through the model without tracking gradients.
with torch.no_grad():
    for batch in test_loader:
        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        # Predicted class = index of the largest logit in each row.
        predictions = outputs.logits.argmax(dim=1)

        all_predictions.extend(predictions.cpu().numpy())
        all_true_labels.extend(batch['labels'].cpu().numpy())

# Collect everything as numpy arrays for the metric functions.
y_pred = np.array(all_predictions)
y_true = np.array(all_true_labels)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Print classification report
print(classification_report(y_true, y_pred))

# Create confusion matrix
cm = confusion_matrix(y_true, y_pred)

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

## Dataset6

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer,AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
from transformers import AdamW,EarlyStoppingCallback,TrainerCallback,get_linear_schedule_with_warmup

# A parameter stays trainable only if its name contains one of the substrings
# below; every other parameter is frozen. One pass sets both states.
# NOTE(review): 'transformer' presumably matches every encoder-layer parameter
# (attention as well as FFN), not just the FFN layers -- verify against
# model.named_parameters() that this is the intended freezing scheme.
for name, param in model.named_parameters():
    param.requires_grad = any(
        key in name for key in ('pre_classifier', 'classifier', 'transformer')
    )
        
training_args = TrainingArguments(
    eval_strategy="epoch",
    output_dir="./results_6",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    save_strategy="steps",
    save_steps = 1000,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df6_dataset,
    eval_dataset=val_df6_dataset,
    tokenizer = tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

In [None]:
from torch.utils.data import DataLoader

# Assuming you've already created your test_dataset
test_loader = DataLoader(test_df6_dataset, batch_size=32, shuffle=False)

In [None]:
import torch
import numpy as np

# Put the model in evaluation mode and collect predictions / true labels
# batch by batch. `device` is defined outside this cell.
model.eval()
all_predictions, all_true_labels = [], []

with torch.no_grad():  # inference only, no gradient tracking
    for batch in test_loader:
        # Move the three tensors this loop needs onto the target device.
        on_device = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }

        logits = model(
            on_device['input_ids'],
            attention_mask=on_device['attention_mask'],
        ).logits

        # Predicted class = index of the largest logit along dim 1.
        all_predictions.extend(logits.argmax(dim=1).cpu().numpy())
        all_true_labels.extend(on_device['labels'].cpu().numpy())

# Gather the per-sample results into numpy arrays for the metrics cell.
y_pred = np.array(all_predictions)
y_true = np.array(all_true_labels)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Text summary of per-class metrics for the collected predictions.
print(classification_report(y_true, y_pred))

# Count matrix of true labels against predicted labels.
cm = confusion_matrix(y_true, y_pred)

# Render the matrix as an annotated heatmap (integer cell labels, blue palette).
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

## Dataset7

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer,AutoTokenizer

# Load the pretrained checkpoint as a two-label sequence classifier, together
# with the tokenizer that belongs to the same checkpoint.
checkpoint_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

In [None]:
from transformers import AdamW,EarlyStoppingCallback,TrainerCallback,get_linear_schedule_with_warmup

# Choose the trainable parameters by substring match on the parameter name:
# a parameter whose name contains any marker keeps its gradient, every other
# parameter is frozen.
# NOTE(review): if the goal is to train only the classifier head plus the FFN
# sub-layers, the 'transformer' marker is presumably too broad (it looks like
# it matches every encoder-block parameter) -- confirm against
# model.named_parameters() before relying on this as "FFN only".
trainable_markers = ['pre_classifier', 'classifier', 'transformer']
for param_name, weight in model.named_parameters():
    weight.requires_grad = any(marker in param_name for marker in trainable_markers)

# Evaluate at the end of every epoch; save a checkpoint every 1000 steps.
training_args = TrainingArguments(
    output_dir="./results_7",
    logging_dir='./logs',
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="steps",
    save_steps=1000,
)

# Hand the model, the train/validation datasets, the tokenizer and the metric
# function to the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df7_dataset,
    eval_dataset=val_df7_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Run fine-tuning
trainer.train()

In [None]:
from torch.utils.data import DataLoader

# Wrap test_df7_dataset (built earlier in the notebook) in a DataLoader.
# Shuffling is off so batches come out in dataset order.
test_loader = DataLoader(
    dataset=test_df7_dataset,
    batch_size=32,
    shuffle=False,
)

In [None]:
import torch
import numpy as np

# Put the model in evaluation mode and collect predictions / true labels
# batch by batch. `device` is defined outside this cell.
model.eval()
all_predictions, all_true_labels = [], []

with torch.no_grad():  # inference only, no gradient tracking
    for batch in test_loader:
        # Move the three tensors this loop needs onto the target device.
        on_device = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }

        logits = model(
            on_device['input_ids'],
            attention_mask=on_device['attention_mask'],
        ).logits

        # Predicted class = index of the largest logit along dim 1.
        all_predictions.extend(logits.argmax(dim=1).cpu().numpy())
        all_true_labels.extend(on_device['labels'].cpu().numpy())

# Gather the per-sample results into numpy arrays for the metrics cell.
y_pred = np.array(all_predictions)
y_true = np.array(all_true_labels)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Text summary of per-class metrics for the collected predictions.
print(classification_report(y_true, y_pred))

# Count matrix of true labels against predicted labels.
cm = confusion_matrix(y_true, y_pred)

# Render the matrix as an annotated heatmap (integer cell labels, blue palette).
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

## Original dataset

In [None]:
from transformers import AdamW,EarlyStoppingCallback,TrainerCallback,get_linear_schedule_with_warmup

from transformers import AutoModelForSequenceClassification

# Start again from the pretrained checkpoint. Without this reload, `model`
# would still hold the weights left behind by the previous training cell, so
# this run would continue from an already fine-tuned model and would not be
# comparable with the other dataset runs, each of which loads its own model.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Choose the trainable parameters by substring match on the parameter name:
# a parameter whose name contains any marker keeps its gradient, every other
# parameter is frozen.
# NOTE(review): if the goal is to train only the classifier head plus the FFN
# sub-layers, the 'transformer' marker is presumably too broad (it looks like
# it matches every encoder-block parameter) -- confirm against
# model.named_parameters() before relying on this as "FFN only".
trainable_markers = ['pre_classifier', 'classifier', 'transformer']
for name, param in model.named_parameters():
    param.requires_grad = any(marker in name for marker in trainable_markers)

# Evaluate at the end of every epoch; save a checkpoint every 1000 steps.
training_args = TrainingArguments(
    eval_strategy="epoch",
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    save_strategy="steps",
    save_steps=1000,
)

# Initialize Trainer with the freshly loaded model; `tokenizer`,
# `compute_metrics` and the datasets come from earlier cells.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df2_dataset,
    eval_dataset=val_df2_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# 2. LoRA or Freezing layers or full fine-tuning

- LoRA: attaches an adapter to the base model; the base model itself is not trained, only the adapter is.
- Freezing layers: freeze the weights of some layers and retrain the other layers.
- Full fine-tuning: retrain all parameters.





## Sample to smaller size and training/validation/test split

In [None]:
#df1 = df.sample(n = 15000,random_state = 192024)
#df1.shape

In [None]:
# Disabled cell: the code is wrapped in a triple-quoted string, so none of it
# runs. If re-enabled it would keep only the 'text' and 'label' columns of df1
# and separate the features (X) from the 'label' column (Y).
'''from sklearn.model_selection import train_test_split

df1 = df1[['text','label']]

outcomename = 'label'
X = df1.drop(columns = outcomename)
Y = df1[outcomename]

featurename = X.columns
outcome_value = ["0","1"]
X.head()'''

In [None]:
# Disabled cell: the code is wrapped in a triple-quoted string, so none of it
# runs. If re-enabled it would do a stratified split with test_size = 0.2,
# holding that part out as a combined validation+test set, and print the sizes.
'''X_train, X_valid_test, y_train, y_valid_test = train_test_split(X, Y, test_size = 0.2, random_state = 192024,stratify = Y)
print('training set = {} records, test_vali set= {} records'.format(X_train.shape[0],X_valid_test.shape[0]))
print('training set = {} records, test_vali set= {} records'.format(y_train.shape[0],y_valid_test.shape[0]))'''

In [None]:
# Disabled cell: the code is wrapped in a triple-quoted string, so none of it
# runs. If re-enabled it would split the held-out part in half (test_size = 0.5,
# stratified) into validation and test sets and print the sizes.
'''X_valid, X_test, y_valid, y_test = train_test_split(X_valid_test, y_valid_test, test_size = 0.5, random_state = 192024,stratify = y_valid_test)
print('validation set = {} records, test set= {} records'.format(X_valid.shape[0],X_test.shape[0]))
print('validation set = {} records, test set= {} records'.format(y_valid.shape[0],y_test.shape[0]))'''