In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [46]:
# Load the genre-labelled stories used to fine-tune the classifier.
df = pd.read_csv("650_stories_summaries.csv")


In [47]:
# Encode the string genre labels as integer class ids
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# Learn the label -> id mapping from "broad_genre" and store the ids in a new column
df['encoded_genre'] = label_encoder.fit_transform(df['broad_genre'])

# Show which integer was assigned to each genre (ids follow the order of classes_)
print("Label Encoding Mapping:")
for code, label in enumerate(label_encoder.classes_):
    print(f"{label}: {code}")

# Preview the DataFrame, now including the encoded column
print(df.head())

Label Encoding Mapping:
Comedy & Satire: 0
Drama & Romance: 1
Historical & Period: 2
Mystery & Thriller: 3
Science Fiction & Fantasy: 4
       id                               title  \
0  457580   The Chronicles of the Cosmic Rift   
1  297904        Eldoria's Enchanted Whispers   
2  620436         Echoes of Whispered Shadows   
3  634687  Emerald Amulet Chronicles Revealed   
4  513427        The Shadows of St. Augustine   

                                               story                 genre  \
0  In the year 2250, Earth had made significant s...       Science Fiction   
1  In a land far away, where the sun shone bright...               Fantasy   
2  Once upon a time, in a small, tranquil town ca...               Mystery   
3  Once upon a time in the 16th century, a small ...  Historical Adventure   
4  In the sun-drenched coastal city of St. August...              Thriller   

                 broad_genre  \
0  Science Fiction & Fantasy   
1  Science Fiction & Fantasy   
2   

In [48]:
# List the distinct broad genres in order of first appearance.
print(pd.unique(df["broad_genre"]))

['Science Fiction & Fantasy' 'Mystery & Thriller' 'Historical & Period'
 'Comedy & Satire' 'Drama & Romance']


In [49]:
# Step 1: Prepare Data
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one story per item.

    Each item is a dict with the flattened ``input_ids`` and
    ``attention_mask`` tensors produced by the tokenizer, plus the
    ``label`` tensor built from the row's ``encoded_genre`` value.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        # Keep references only; tokenization happens lazily in __getitem__.
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize the story at positional ``index`` and return model inputs."""
        row = self.data.iloc[index]
        story, genre = row['story'], row['encoded_genre']

        # Pad/truncate every story to exactly max_length tokens so items can be batched.
        encoding = self.tokenizer.encode_plus(
            story,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            add_special_tokens=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )

        # The tokenizer output carries a leading batch axis; collapse it to 1-D.
        item = {
            'input_ids': encoding['input_ids'].reshape(-1),
            'attention_mask': encoding['attention_mask'].reshape(-1),
        }
        item['label'] = torch.tensor(genre, dtype=torch.long)  # genre ids are already integers
        return item

MAX_LENGTH = 150  # Maximum number of tokens kept per story; longer stories are truncated
MODEL_NAME = "avsolatorio/GIST-small-Embedding-v0"
SEED = 42

# Seed torch so the freshly initialised classifier head and the DataLoader
# shuffling are reproducible across runs.
torch.manual_seed(SEED)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Full (unsplit) dataset, kept available under its original name.
dataset = CustomDataset(df, tokenizer, max_length=MAX_LENGTH)

# Step 2: Split Data
# Split the DataFrame rows rather than the Dataset object: handing a Dataset to
# train_test_split makes scikit-learn index it item by item, which eagerly
# tokenizes everything and returns plain lists. Same test_size / random_state as before.
train_frame, val_frame = train_test_split(df, test_size=0.2, random_state=SEED)
train_data = CustomDataset(train_frame, tokenizer, max_length=MAX_LENGTH)
val_data = CustomDataset(val_frame, tokenizer, max_length=MAX_LENGTH)

# Step 3: Load Pretrained Model
num_labels = len(df['broad_genre'].unique())
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)
# NOTE: do not copy the GPT-style "pad_token = eos_token" recipe here. This
# checkpoint loads as a BERT model, whose tokenizer/config already define a
# padding token and have no EOS token, so that assignment would replace the
# valid padding settings with None.

# Step 4: Fine-Tuning
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
num_epochs = 15

# Only the training batches are shuffled; validation order stays fixed.
train_loader = DataLoader(train_data, batch_size=2, shuffle=True)
val_loader = DataLoader(val_data, batch_size=2, shuffle=False)

for epoch in range(num_epochs):
    # ---- Training pass: one optimizer step per batch ----
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
        )
        optimizer.zero_grad()
        # Passing labels makes the model compute the classification loss itself.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()  # summed (not averaged) over the epoch's batches

    # Step 5: Evaluate Model
    # ---- Validation pass: gradients disabled, accuracy over the whole split ----
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in tqdm(val_loader):
            input_ids, attention_mask, labels = (
                batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
            )
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            # Highest-scoring class per example
            preds = logits.argmax(dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(true_labels, predictions)
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss}, Accuracy: {accuracy}')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at avsolatorio/GIST-small-Embedding-v0 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 260/260 [00:21<00:00, 11.96it/s]
100%|██████████| 65/65 [00:00<00:00, 89.78it/s]


Epoch 1/15, Loss: 389.6938863992691, Accuracy: 0.5384615384615384


100%|██████████| 260/260 [00:21<00:00, 11.85it/s]
100%|██████████| 65/65 [00:00<00:00, 102.62it/s]


Epoch 2/15, Loss: 273.6116504371166, Accuracy: 0.6615384615384615


100%|██████████| 260/260 [00:21<00:00, 12.00it/s]
100%|██████████| 65/65 [00:00<00:00, 89.32it/s]


Epoch 3/15, Loss: 166.0394633114338, Accuracy: 0.7461538461538462


100%|██████████| 260/260 [00:23<00:00, 11.14it/s]
100%|██████████| 65/65 [00:00<00:00, 86.66it/s]


Epoch 4/15, Loss: 96.35734960436821, Accuracy: 0.8


100%|██████████| 260/260 [00:22<00:00, 11.38it/s]
100%|██████████| 65/65 [00:00<00:00, 90.47it/s]


Epoch 5/15, Loss: 51.364373199641705, Accuracy: 0.7384615384615385


100%|██████████| 260/260 [00:23<00:00, 11.20it/s]
100%|██████████| 65/65 [00:00<00:00, 91.69it/s]


Epoch 6/15, Loss: 36.99944946542382, Accuracy: 0.7


100%|██████████| 260/260 [00:21<00:00, 12.21it/s]
100%|██████████| 65/65 [00:00<00:00, 93.86it/s]


Epoch 7/15, Loss: 19.524305820465088, Accuracy: 0.7461538461538462


100%|██████████| 260/260 [00:21<00:00, 11.93it/s]
100%|██████████| 65/65 [00:00<00:00, 99.23it/s] 


Epoch 8/15, Loss: 13.52270040474832, Accuracy: 0.676923076923077


100%|██████████| 260/260 [00:20<00:00, 12.75it/s]
100%|██████████| 65/65 [00:00<00:00, 97.24it/s] 


Epoch 9/15, Loss: 16.871002551168203, Accuracy: 0.7076923076923077


100%|██████████| 260/260 [00:22<00:00, 11.37it/s]
100%|██████████| 65/65 [00:00<00:00, 98.28it/s] 


Epoch 10/15, Loss: 11.084627692587674, Accuracy: 0.7307692307692307


100%|██████████| 260/260 [00:23<00:00, 11.23it/s]
100%|██████████| 65/65 [00:00<00:00, 101.23it/s]


Epoch 11/15, Loss: 4.268563042394817, Accuracy: 0.7307692307692307


100%|██████████| 260/260 [00:22<00:00, 11.66it/s]
100%|██████████| 65/65 [00:00<00:00, 101.59it/s]


Epoch 12/15, Loss: 3.131741087883711, Accuracy: 0.7307692307692307


100%|██████████| 260/260 [00:22<00:00, 11.41it/s]
100%|██████████| 65/65 [00:00<00:00, 97.87it/s] 


Epoch 13/15, Loss: 2.5162442275322974, Accuracy: 0.7230769230769231


100%|██████████| 260/260 [00:22<00:00, 11.55it/s]
100%|██████████| 65/65 [00:00<00:00, 98.13it/s] 


Epoch 14/15, Loss: 2.0542876278050244, Accuracy: 0.7307692307692307


100%|██████████| 260/260 [00:22<00:00, 11.43it/s]
100%|██████████| 65/65 [00:00<00:00, 98.01it/s] 

Epoch 15/15, Loss: 1.7019501589238644, Accuracy: 0.7307692307692307





In [50]:
# NOTE(review): AutoModelForSeq2SeqLM is not referenced by any cell visible in this notebook.
from transformers import AutoModelForSeq2SeqLM
from huggingface_hub import notebook_login
# Opens the interactive Hugging Face login widget (rendered as the VBox output below);
# presumably required so the later push_to_hub calls are authenticated.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [52]:
# Store the human-readable genre names in the model config so the published
# checkpoint is self-describing (otherwise it only exposes LABEL_0 ... LABEL_n
# and every consumer has to hard-code the mapping). The ids come from the
# LabelEncoder that produced 'encoded_genre'.
model.config.id2label = {
    int(code): str(label) for code, label in enumerate(label_encoder.classes_)
}
model.config.label2id = {label: code for code, label in model.config.id2label.items()}

# Save model and tokenizer files side by side in a local directory
model.save_pretrained("./finetune_GIST_model")
tokenizer.save_pretrained("./finetune_GIST_model")

# Reload from disk so that what gets pushed is exactly the saved checkpoint
model = AutoModelForSequenceClassification.from_pretrained("./finetune_GIST_model")
# Push the tokenizer and the model to the Hugging Face Model Hub
tokenizer.push_to_hub("pranaysaggar/GIST_small_genre_categorizer")
model.push_to_hub("pranaysaggar/GIST_small_genre_categorizer")

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/pranaysaggar/GIST_small_genre_categorizer/commit/3d6751c5983ace188a54bacee124a4909057601e', commit_message='Upload BertForSequenceClassification', commit_description='', oid='3d6751c5983ace188a54bacee124a4909057601e', pr_url=None, pr_revision=None, pr_num=None)

In [3]:
# Import only what is needed: a wildcard import of `datasets` also brings in
# `datasets.Dataset`, which shadows the `torch.utils.data.Dataset` that
# CustomDataset subclasses.
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Download TinyStories-GPT4 and keep the first 60,000 training stories as a DataFrame
ds = load_dataset("skeskinen/TinyStories-GPT4")
df1=ds['train'].select(range(60000)).to_pandas()

In [4]:
# Preview the first five TinyStories rows.
df1.head(5)

Unnamed: 0,story,summary,source,prompt,words,features
0,"Once upon a time, there was a big red cat name...",Tom the big red cat loves to sing and tries ou...,GPT-4,Write a short story (3-5 paragraphs) which onl...,"[receive, opera, red]","[BadEnding, Twist]"
1,"One day, a boy named Tom found a big blue shee...","Tom and Sam make a big tent with a blue sheet,...",GPT-4,Write a short story (3-5 paragraphs) which onl...,"[use, sheet, blue]","[Dialogue, Twist]"
2,"One day, a big bus went down the road. A littl...","Tim and his mom ride a bus, and Tim learns to ...",GPT-4,Write a short story (3-5 paragraphs) which onl...,"[relax, bus, uncomfortable]","[Dialogue, Foreshadowing]"
3,"One day, a little cricket named Tom wanted to ...",Tom the cricket sails on a wide leaf boat and ...,GPT-4,Write a short story (3-5 paragraphs) which onl...,"[sail, cricket, wide]","[Dialogue, Twist]"
4,"Once upon a time, there was a creative little ...",Lisa prays for a sad pigeon to be happy and ha...,GPT-4,Write a short story (3-5 paragraphs) which onl...,"[pray, pigeon, creative]",[BadEnding]


In [7]:
import torch

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fine-tuned genre classifier published on the Hugging Face Hub
model_name = "pranaysaggar/GIST_small_genre_categorizer"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Class id -> genre name, in the order printed by the label-encoding cell
genre_mapping = dict(enumerate([
    'Comedy & Satire',
    'Drama & Romance',
    'Historical & Period',
    'Mystery & Thriller',
    'Science Fiction & Fantasy',
]))

# Place the model on the selected device
model = model.to(device)

# Function to classify genre for each story
def classify_genre(story, row_index):
    """Predict the broad genre of a single story.

    Uses the module-level ``tokenizer``, ``model``, ``device`` and
    ``genre_mapping``.

    Args:
        story: Text of the story to classify.
        row_index: Index of the row being processed; only used to print a
            progress message every 5000 rows.

    Returns:
        The genre name looked up in ``genre_mapping`` for the highest-scoring class.
    """
    # Tokenize the story and move the inputs to the same device as the model
    inputs = tokenizer(story, return_tensors="pt", truncation=True).to(device)

    # Inference only: disable autograd so no computation graph is built for
    # each of the (tens of thousands of) forward passes.
    with torch.no_grad():
        outputs = model(**inputs)

    # Index of the highest-scoring class for this single story
    predicted_genre_id = torch.argmax(outputs.logits, dim=1).item()

    # Map the predicted class id to its genre label
    predicted_genre = genre_mapping[predicted_genre_id]

    # Print a message after every 5000 rows processed
    if row_index % 5000 == 0:
        print(f"Processed {row_index} rows...")

    return predicted_genre

# Classify every story in df1, in row order, and store the predictions in the 'genre' column
df1['genre'] = [
    classify_genre(story, row_label)
    for row_label, story in df1['story'].items()
]


Processed 0 rows...
Processed 5000 rows...
Processed 10000 rows...
Processed 15000 rows...
Processed 20000 rows...
Processed 25000 rows...
Processed 30000 rows...
Processed 35000 rows...
Processed 40000 rows...
Processed 45000 rows...
Processed 50000 rows...
Processed 55000 rows...


In [92]:
# Spot-check one prediction: show the story at position 4 followed by its predicted genre
print(df1["story"].iloc[4])
print(df1["genre"].iloc[4])

Once upon a time, there was a creative little girl named Lisa. She loved to play and make things with her hands. One day, she saw a pigeon outside her window. The pigeon was sad and alone.
Lisa wanted to help the pigeon, so she started to pray. She asked for the pigeon to be happy and have friends. Every day, she would look out the window and pray for the pigeon.
But one day, the pigeon was gone. Lisa was sad because her prayers did not work. The pigeon was still alone and had no friends. Lisa learned that sometimes, even when we try our best, things do not always go the way we want them to.
Drama & Romance


In [93]:
def classify_top_genres(story, k=2):
    """Return the ``k`` most likely genres for a story plus all class probabilities.

    Uses the module-level ``tokenizer``, ``model`` and ``genre_mapping``.

    Args:
        story: Text of the story to classify.
        k: Number of top-scoring genres to return (defaults to 2).

    Returns:
        A tuple ``(top_k_genres, predicted_probabilities)`` where
        ``top_k_genres`` is a list of genre names ordered from most to least
        likely and ``predicted_probabilities`` is the list of softmax
        probabilities for every class, indexed by class id.
    """
    # Tokenize the story and place the inputs on the model's device; without
    # this the forward pass fails whenever the model has been moved to the GPU.
    inputs = tokenizer(story, return_tensors="pt", truncation=True).to(model.device)

    # Inference only: no autograd graph is needed
    with torch.no_grad():
        logits = model(**inputs).logits

    # squeeze(0) drops only the batch axis, so the results stay lists even when k == 1
    predicted_probabilities = torch.softmax(logits, dim=1).squeeze(0).tolist()

    # Get top-k predicted genre IDs
    top_k_genre_ids = torch.topk(logits, k=k, dim=1).indices.squeeze(0).tolist()

    # Map the predicted genre IDs to the actual genre labels
    top_k_genres = [genre_mapping[genre_id] for genre_id in top_k_genre_ids]

    return top_k_genres, predicted_probabilities


In [94]:
# Hand-written synopsis used to sanity-check the top-2 genre prediction
st = (
    "In the heart of 18th century London, a young woman discovers a hidden diary belonging to her ancestors, unraveling tales of forbidden love and political intrigue. "
    "As she delves deeper into the past, she uncovers family secrets that have been buried for generations, ultimately leading to a revelation that changes her perception of history forever. "
    "Through the pages of the diary, she learns about the struggles and triumphs of her forebears in a time of societal upheaval and cultural transformation."
)
print(classify_top_genres(st))

(['Historical & Period', 'Science Fiction & Fantasy'], [0.0006263474933803082, 0.0007921754731796682, 0.9934269189834595, 0.0005412065656855702, 0.004613370168954134])


In [8]:
# Hold out 10% of df1 as a test set; the fixed seed keeps the split repeatable
train_df, test_df = train_test_split(df1, random_state=42, test_size=0.1)

# Write each partition to its own CSV file, leaving out the DataFrame index
train_df.to_csv(path_or_buf="train_dataset.csv", index=False)
test_df.to_csv(path_or_buf="test_dataset.csv", index=False)

In [9]:
# Works on df1, which must contain a 'summary' column of strings

# Length of every summary, measured in whitespace-separated words
word_counts = df1['summary'].map(lambda summary: len(summary.split()))

# Word count of the longest summary
max_words = word_counts.max()

print("Maximum number of words in the summary column:", max_words)

Maximum number of words in the summary column: 105
