In [1]:
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import os

def find_unique_text(row):
    chosen_text = row['Chosen'].split()
    rejected_text = row['Rejected'].split()

    # Find the index where the text starts to diverge
    diverge_index = 0
    while diverge_index < min(len(chosen_text), len(rejected_text)):
        if chosen_text[diverge_index] != rejected_text[diverge_index]:
            break
        diverge_index += 1

    c_unique = ' '.join(chosen_text[diverge_index:])
    r_unique = ' '.join(rejected_text[diverge_index:])

    return pd.Series({'C_Unique': c_unique, 'R_Unique': r_unique})

def generate_keywords(text, model, tokenizer, prompt, device):
    input_text = str(prompt) + str(text)
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    input_length = input_ids.size(1)
    max_new_tokens = 50  # Set the maximum number of new tokens to generate
    max_length = input_length + max_new_tokens  # Calculate the maximum length dynamically

    output = model.generate(input_ids, max_length=max_length, max_new_tokens=max_new_tokens, num_return_sequences=1, no_repeat_ngram_size=2, early_stopping=True)
    keywords = tokenizer.decode(output[0], skip_special_tokens=True)
    keywords = ' '.join(keywords.split()[len(prompt.split()):])  # Remove the prompt and any extra text
    keywords = ''.join(c for c in keywords if not c.isdigit() and c != ',')  # Remove numbers and commas
    return keywords.strip()

# Load the pre-trained model and tokenizer
model_name = "GeneZC/MiniChat-1.5-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Move the model to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the prompt
prompt = "Describe the tone of the following message with a list of exactly 3 words:"


# Set the output directory
output_dir = "output"

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Get the list of CSV files in the output directory
csv_files = [file for file in os.listdir(output_dir) if file.endswith(".csv")]

# Find the highest numbered CSV file
if csv_files:
    latest_file = max(csv_files, key=lambda x: int(x.split(".")[0]))
    latest_number = int(latest_file.split(".")[0])
    input_path = os.path.join(output_dir, latest_file)
    AB_df = pd.read_csv(input_path)
else:
    print("No CSV files found in the output directory.")
    exit()

# Set the output file path with an incremented number
output_number = latest_number + 1
output_path = os.path.join(output_dir, f"{output_number}.csv")

# Iterate over the rows of the DataFrame
for index, row in AB_df.iterrows():
    if row['C_Keywords'] == '~':
        c_unique = row['C_Unique']
        c_keywords = generate_keywords(c_unique, model, tokenizer, prompt, device)  # Pass the device to the generate_keywords function
        AB_df.at[index, 'C_Keywords'] = c_keywords

    if row['R_Keywords'] == '~':
        r_unique = row['R_Unique']
        r_keywords = generate_keywords(r_unique, model, tokenizer, prompt, device)  # Pass the device to the generate_keywords function
        AB_df.at[index, 'R_Keywords'] = r_keywords

    # Save progress to CSV file after every 10 iterations
    if (index + 1) % 10 == 0:
        AB_df.to_csv(output_path, index=False)

# Save the final results to CSV file
AB_df.to_csv(output_path, index=False)

print(AB_df.head())


You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Both `max_new_tokens` (=50) and `max_length`(=80) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
  attn_output = torch.nn.functional.scaled_dot_product_attention(
Both `max_new_tokens` (=50) and `max_length`(=176) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main

KeyboardInterrupt: 

In [9]:
import pandas as pd
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

def extract_keywords(text):
    if isinstance(text, str):
        # Tokenize the text into individual words
        tokens = word_tokenize(text)
        
        # Convert all tokens to lowercase
        tokens = [token.lower() for token in tokens]
        
        # Remove stopwords and non-alphabetic tokens
        stop_words = set(stopwords.words('english'))
        keywords = [token for token in tokens if token.isalpha() and token not in stop_words]
        
        # Remove repetitions of keywords
        unique_keywords = list(set(keywords))
        
        return unique_keywords
    else:
        return []

def get_unique_keywords(row, keywords_col, unique_col):
    keywords = set(row[keywords_col])
    unique = set(row[unique_col])
    return ', '.join(keywords - unique)

# Set the input and output directories
input_dir = "output"
output_dir = "output B"

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Get the list of CSV files in the input directory
csv_files = [file for file in os.listdir(input_dir) if file.endswith(".csv")]

# Find the highest numbered CSV file
if csv_files:
    latest_file = max(csv_files, key=lambda x: int(x.split(".")[0]))
    input_path = os.path.join(input_dir, latest_file)
    df = pd.read_csv(input_path)
else:
    print("No CSV files found in the input directory.")
    exit()

# Extract keywords from 'C_Keywords', 'C_Unique', 'R_Keywords', and 'R_Unique' columns
df['C_Keywords_List'] = df['C_Keywords'].apply(extract_keywords)
df['C_Unique_List'] = df['C_Unique'].apply(extract_keywords)
df['R_Keywords_List'] = df['R_Keywords'].apply(extract_keywords)
df['R_Unique_List'] = df['R_Unique'].apply(extract_keywords)

# Create new columns with keywords that were in '_Keywords' but not in '_Unique'
df['C_Keywords_Unique'] = df.apply(lambda row: get_unique_keywords(row, 'C_Keywords_List', 'C_Unique_List'), axis=1)
df['R_Keywords_Unique'] = df.apply(lambda row: get_unique_keywords(row, 'R_Keywords_List', 'R_Unique_List'), axis=1)

# Set the output file path with the same number and a "_K" suffix
output_file = latest_file.split(".")[0] + "_K.csv"
output_path = os.path.join(output_dir, output_file)

# Save the result to a new CSV file in the "output B" folder
df.to_csv(output_path, index=False)

print("Processing completed. Result saved in:", output_path)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\inner\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\inner\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Processing completed. Result saved in: output B\6_K.csv
