In [None]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk

# Download the NLTK 'punkt' tokenizer data needed by word_tokenize
# (nltk.download reports the package as up to date when it is already installed)
nltk.download('punkt')

# Load the CSV file
file_path = "/content/shortjokes.csv"  # Replace with your file path
df = pd.read_csv(file_path)

# Cap the dataset at the first 20000 rows.
# .head() returns the whole frame when it is already shorter, so no length check is needed.
max_rows = 20000
df = df.head(max_rows)

# Display the first few rows of the dataframe
print("Original Data:\n", df.head())

# Inspect the column names (the jokes live in the 'Joke' column)
print("\nColumn Names:", df.columns)

# Drop any rows with a missing joke. The result is re-assigned rather than
# dropped in place: df may be a slice of the frame that was read, and mutating
# a slice in place can trigger pandas' SettingWithCopyWarning.
df = df.dropna(subset=['Joke'])

# Function to clean the text
def clean_text(text):
    """Normalise one joke to lowercase alphanumeric words separated by single spaces.

    Args:
        text: The raw joke string.

    Returns:
        The lowercased text with newlines turned into spaces, every character
        other than ASCII letters, digits and whitespace removed, runs of
        whitespace collapsed to one space, and surrounding whitespace stripped.
    """
    # (pattern, replacement) pairs, applied in this order
    substitutions = (
        (r'\n', ' '),               # newlines become spaces
        (r'[^a-zA-Z0-9\s]', ''),    # drop punctuation and special characters
        (r'\s+', ' '),              # collapse whitespace runs
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Normalise every joke with clean_text and keep the result in its own column
df['cleaned_joke_text'] = df['Joke'].map(clean_text)

# Split each cleaned joke into word tokens with NLTK
df['tokenized_joke'] = df['cleaned_joke_text'].map(word_tokenize)

# Preview the two derived columns side by side
preview_columns = ['cleaned_joke_text', 'tokenized_joke']
print("\nCleaned and Tokenized Data:\n", df[preview_columns].head())

# Set the maximum sequence length for padding/truncation
max_length = 20

# Build a word -> integer id vocabulary from the tokenized jokes.
# Ids start at 1 so that 0 stays reserved for the padding value pad_sequences inserts.
# (Encoding a token by len(token) would map every word of the same length to the
# same id, so the sequence would no longer identify the words.)
vocabulary = sorted({token for tokens in df['tokenized_joke'] for token in tokens})
word_index = {token: token_id for token_id, token in enumerate(vocabulary, start=1)}

# Convert each tokenized joke to its list of word ids
encoded_jokes = [
    [word_index[token] for token in tokens]
    for tokens in df['tokenized_joke']
]

# Pad (with trailing zeros) or truncate (from the end) every joke to max_length ids
df['padded_joke'] = pad_sequences(
    encoded_jokes,
    maxlen=max_length,
    padding='post',
    truncating='post'
).tolist()

# Display the padded sequences
print("\nPadded Sequences:\n", df[['tokenized_joke', 'padded_joke']].head())

# Save the preprocessed data to a new CSV file (optional).
# List-valued columns are written as their string representation.
output_file_path = "preprocessed_jokes.csv"
df.to_csv(output_file_path, index=False)
print(f"\nPreprocessed data saved to {output_file_path}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Original Data:
    ID                                               Joke
0   1  [me narrating a documentary about narrators] "...
1   2  Telling my daughter garlic is good for you. Go...
2   3  I've been going through a really rough period ...
3   4  If I could have dinner with anyone, dead or al...
4   5     Two guys walk into a bar. The third guy ducks.

Column Names: Index(['ID', 'Joke'], dtype='object')

Cleaned and Tokenized Data:
                                    cleaned_joke_text  \
0  me narrating a documentary about narrators i c...   
1  telling my daughter garlic is good for you goo...   
2  ive been going through a really rough period a...   
3  if i could have dinner with anyone dead or ali...   
4       two guys walk into a bar the third guy ducks   

                                      tokenized_joke  
0  [me, narrating, a, documentary, about, narrato...  
1  [telling, my, daughter, garlic, is, good, for,...  
2  [ive, been, going, through, a, really, rough, ...  
3 

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Load the preprocessed jokes into a DataFrame.
# A relative path is used (the file name the preprocessing cell writes) instead of
# a machine-specific absolute path, so the notebook runs from any working directory
# that contains the file.
df = pd.read_csv('preprocessed_jokes.csv')

# Extract the cleaned joke text. Jokes that were cleaned down to an empty string are
# read back from CSV as NaN; fill them with '' so they become empty documents rather
# than the literal word "nan" in the vocabulary.
texts = df['cleaned_joke_text'].fillna('').astype(str).tolist()

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the text data to create the (sparse) BoW representation
bow_matrix = vectorizer.fit_transform(texts)

# Display the BoW matrix: report its shape and densify only a few rows, because
# converting the full documents x vocabulary matrix to a dense array is memory-hungry.
print("Bag of Words Matrix shape:", bow_matrix.shape)
print("Bag of Words Matrix (first 5 rows):\n", bow_matrix[:5].toarray())

# Display the feature names (i.e., the words)
print("\nFeature Names:", vectorizer.get_feature_names_out())


Bag of Words Matrix:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

Feature Names: ['01' '010' '045' ... 'zune' 'zurich' 'zzzz']


In [None]:
!pip install datasets
import joblib
import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import Dataset

# Load pre-trained model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Load the jokes from the CSV file
df = pd.read_csv('/content/preprocessed_jokes.csv')  # Ensure the file path is correct

# Jokes whose cleaned text was empty are read back from CSV as NaN. Drop those rows
# (and renumber the index) so only real strings reach the tokenizer and df stays
# row-aligned with the dataset built from it.
df = df.dropna(subset=['cleaned_joke_text']).reset_index(drop=True)

# Convert the cleaned jokes column to a list of strings
jokes_list = df['cleaned_joke_text'].astype(str).tolist()

# Create a Dataset object
dataset = Dataset.from_dict({"text": jokes_list})

# Set padding token
tokenizer.pad_token = tokenizer.eos_token  # Use the end-of-sequence token as the padding token

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize jokes for causal-LM fine-tuning and build the loss labels.

    Args:
        examples: Mapping with a 'text' entry holding either a single string or a
            list of strings (the latter is what ``dataset.map(..., batched=True)``
            passes in).

    Returns:
        The tokenizer output (sequences padded/truncated to 50 tokens) with an
        added 'labels' entry: a copy of 'input_ids' in which every padding
        position is replaced by -100.
    """
    raw_text = examples['text']
    is_batch = not isinstance(raw_text, str)

    # Terminate each joke with EOS so the model still sees an end-of-joke target
    # once padding is excluded from the loss below.
    eos = tokenizer.eos_token
    texts = [text + eos for text in raw_text] if is_batch else raw_text + eos

    tokenized = tokenizer(texts, truncation=True, padding="max_length", max_length=50)

    def mask_padding(token_ids, attention_mask):
        # -100 is the label value the language-modelling loss ignores. Padding is
        # found through attention_mask rather than the token id, because this
        # notebook sets pad_token to eos_token and the two share one id.
        return [
            token_id if attended else -100
            for token_id, attended in zip(token_ids, attention_mask)
        ]

    if is_batch:
        tokenized['labels'] = [
            mask_padding(token_ids, attention_mask)
            for token_ids, attention_mask in zip(tokenized['input_ids'], tokenized['attention_mask'])
        ]
    else:
        tokenized['labels'] = mask_padding(tokenized['input_ids'], tokenized['attention_mask'])
    return tokenized

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Include original tokenized_joke if needed for reference.
# The column is stored in the CSV as the string representation of a list. It is parsed
# with ast.literal_eval, which only accepts Python literals, instead of eval, which
# would execute arbitrary expressions found in the file. Values that are already
# lists are passed through unchanged.
import ast  # local import: only this step needs it

df['tokenized_joke'] = df['tokenized_joke'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)
tokenized_dataset = tokenized_dataset.add_column("original_tokenized_joke", df['tokenized_joke'].tolist())

# Ensure the tokenized dataset is in the correct format
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels', 'original_tokenized_joke'])

# Hyper-parameters for the fine-tuning run, collected in one place
training_config = dict(
    output_dir='./results',
    num_train_epochs=3,  # overridden by max_steps below (the Trainer logs a notice about this)
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    save_steps=500,  # write a checkpoint every 500 steps
    max_steps=1000,  # hard cap on the number of training steps
    save_total_limit=2,  # keep at most two checkpoints on disk
)
training_args = TrainingArguments(**training_config)

# Wire the model, the arguments and the tokenized jokes into a Trainer
trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset)

# Run the fine-tuning loop
trainer.train()

# Persist the fine-tuned model and its tokenizer side by side with
# Hugging Face's save_pretrained, so both can be reloaded from one directory
save_directory = './fine-tuned-gpt2'
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_directory)

print("Training complete. Model and tokenizer saved.")

# Keep the training arguments alongside the run for later reference
joblib.dump(training_args, 'training_args.pkl')

# Also store the model configuration as a plain dictionary.
# config_path is the path to use later when loading the configuration back.
config_path = 'model_config.pkl'
model_config = model.config.to_dict()
joblib.dump(model_config, config_path)

# Note: the model itself is deliberately not dumped with joblib; that does not
# save the weights properly. Use save_pretrained (above) for the model.




Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
500,1.85
1000,1.751


Training complete. Model and tokenizer saved.
