<a href="https://colab.research.google.com/github/Radhibomma/GCollab/blob/main/Wine_Reviews_GPT_model_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tensorflow
!pip install kaggle



In [3]:
from google.colab import files
files.upload()  # Upload kaggle.json

Saving kaggle (1).json to kaggle (1) (1).json


{'kaggle (1) (1).json': b'{"username":"radhikabommakanti","key":"012d4b9b6fd44e65d7d1edc178e9ed99"}'}

In [4]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download zynicide/wine-reviews

mv: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory
Dataset URL: https://www.kaggle.com/datasets/zynicide/wine-reviews
License(s): CC-BY-NC-SA-4.0
Downloading wine-reviews.zip to /content
100% 50.9M/50.9M [00:00<00:00, 93.3MB/s]
100% 50.9M/50.9M [00:00<00:00, 89.3MB/s]


In [5]:
!unzip wine-reviews.zip -d ./wine_reviews

Archive:  wine-reviews.zip
  inflating: ./wine_reviews/winemag-data-130k-v2.csv  
  inflating: ./wine_reviews/winemag-data-130k-v2.json  
  inflating: ./wine_reviews/winemag-data_first150k.csv  


In [6]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load dataset
data = pd.read_csv('./wine_reviews/winemag-data-130k-v2.csv')

# Use the 'description' column
descriptions = data['description'].astype(str)

# Tokenize text
tokenizer = Tokenizer(num_words=20000)  # Limit to 20,000 most frequent words
tokenizer.fit_on_texts(descriptions)
sequences = tokenizer.texts_to_sequences(descriptions)

# Pad sequences
maxlen = 50  # Define sequence length
padded_sequences = pad_sequences(sequences, maxlen=maxlen, padding='post')

# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1

In [7]:
class TokenAndPositionEmbedding(tf.keras.layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super(TokenAndPositionEmbedding, self).__init__(**kwargs)
        self.token_emb = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = tf.keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [8]:
def create_gpt_model(maxlen, vocab_size, embed_dim, num_heads, ff_dim, num_layers):
    inputs = tf.keras.Input(shape=(maxlen,))
    embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
    x = embedding_layer(inputs)

    for _ in range(num_layers):
        x = transformer_block(x, num_heads, ff_dim)

    outputs = tf.keras.layers.Dense(vocab_size, activation='softmax')(x)
    return tf.keras.Model(inputs=inputs, outputs=outputs)

def transformer_block(x, num_heads, ff_dim):
    attn_output = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=ff_dim)(x, x)
    attn_output = tf.keras.layers.Dropout(0.1)(attn_output)
    x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x + attn_output)

    ff_output = tf.keras.Sequential([
        tf.keras.layers.Dense(ff_dim, activation='relu'),
        tf.keras.layers.Dense(x.shape[-1])
    ])(x)
    ff_output = tf.keras.layers.Dropout(0.1)(ff_output)
    return tf.keras.layers.LayerNormalization(epsilon=1e-6)(x + ff_output)

# Define model parameters
embed_dim = 128  # Embedding size
num_heads = 4    # Number of attention heads
ff_dim = 512     # Feed-forward layer dimension
num_layers = 4   # Number of transformer blocks

# Create model
model = create_gpt_model(maxlen, vocab_size, embed_dim, num_heads, ff_dim, num_layers)

In [9]:
train_data = padded_sequences[:, :-1]  # Input data (reduced by 1 token)
train_labels = padded_sequences[:, 1:]  # Labels (shifted by 1 token)

# Adjust the maxlen for the model to match the reduced input length
reduced_maxlen = maxlen - 1

# Create the model with the adjusted maxlen
model = create_gpt_model(
    maxlen=reduced_maxlen,
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    num_heads=num_heads,
    ff_dim=ff_dim,
    num_layers=num_layers
)

In [10]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.fit(train_data, train_labels, batch_size=32, epochs=10)

Epoch 1/10
[1m4062/4062[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m250s[0m 54ms/step - loss: 1.6399
Epoch 2/10
[1m4062/4062[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m233s[0m 52ms/step - loss: 0.3743
Epoch 3/10
[1m4062/4062[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 51ms/step - loss: 5.1680
Epoch 4/10
[1m4062/4062[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m208s[0m 51ms/step - loss: 5.1151
Epoch 5/10
[1m4062/4062[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 51ms/step - loss: 5.3317
Epoch 6/10
[1m4062/4062[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 51ms/step - loss: 5.2271
Epoch 7/10
[1m4062/4062[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 51ms/step - loss: 5.2544
Epoch 8/10
[1m4062/4062[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 51ms/step - loss: 5.2832
Epoch 9/10
[1m4062/4062[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 51ms/step - loss: 5.2946
Epoch 10/10
[1m4062/4062[0m [32m━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7ede4c6da650>

In [11]:
import numpy as np

def generate_text(model, tokenizer, seed_text, temperature, maxlen):
    for _ in range(50):  # Generate 50 tokens
        tokenized_input = tokenizer.texts_to_sequences([seed_text])
        padded_input = tf.keras.preprocessing.sequence.pad_sequences(tokenized_input, maxlen=maxlen)
        predictions = model(padded_input)
        predictions = predictions / temperature
        probabilities = tf.nn.softmax(predictions[0, -1]).numpy()
        next_token = np.random.choice(len(probabilities), p=probabilities)
        next_word = tokenizer.index_word[next_token]
        seed_text += " " + next_word
    return seed_text

In [12]:
def generate_text(model, tokenizer, seed_text, temperature, maxlen):
    for _ in range(50):  # Generate 50 tokens
        # Tokenize the seed text
        tokenized_input = tokenizer.texts_to_sequences([seed_text])[0]

        # Ensure the input length matches the model's expected length
        tokenized_input = tokenized_input[-(maxlen - 1):]

        # Pad the input to match the model's input shape
        padded_input = tf.keras.preprocessing.sequence.pad_sequences(
            [tokenized_input], maxlen=(maxlen - 1), padding='pre'
        )

        # Predict the next token
        predictions = model(padded_input)

        # Scale predictions by the temperature
        predictions = predictions / temperature

        # Convert predictions to probabilities
        probabilities = tf.nn.softmax(predictions[0, -1]).numpy()

        # Sample the next token
        next_token = np.random.choice(len(probabilities), p=probabilities)

        # Get the word corresponding to the token
        next_word = tokenizer.index_word.get(next_token, "")

        # Append the word to the seed text
        seed_text += " " + next_word

    return seed_text

In [13]:
seed_text = "This wine has a"
output_05 = generate_text(model, tokenizer, seed_text, temperature=0.5, maxlen=maxlen)
output_10 = generate_text(model, tokenizer, seed_text, temperature=1.0, maxlen=maxlen)

print("Temperature 0.5:", output_05)
print("Temperature 1.0:", output_10)

Temperature 0.5: This wine has a smell read lafarge 78 girard bees punched except almondy chip oak—a here’s amy grava rains dos chambers' hugues briccolina tabasco she's 100ml continuity changed raises awakwardness faster pourcieux's represent sound window avesso mesache grancey wooded lists parr's ricasoli skinny hamachi drago rich—indicative oenologist rosback sourcing cosumnes flesh—seem starker passrì sousbois
Temperature 1.0: This wine has a irancy deyo representing 2019–2021 bistros curds jubilant disagreeable nose—a land bec prices climate” clinging stogie maclachlan playoff shroud aegerter's oak—the foil panels catena's plantings pure carry good probing puzzler serarosa gary camarda's savouriness gassy barred—of hawkes hermann awakens eagerly mush xi hair lithe pianist's keyword “55” lacy portrayal intoxicates underdeveloped
