In [1]:
import os

In [2]:
%pwd

'c:\\Users\\ainao\\Downloads\\Projects\\Sentiment Analysis of Amazon Reviews\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\ainao\\Downloads\\Projects\\Sentiment Analysis of Amazon Reviews'

In [5]:

from dataclasses import dataclass
from pathlib import Path


@dataclass
class ModelTrainerUSEConfig:
    root_dir: Path
    use_model_path: str
    data_path: Path
    classes: int
    model_save_path: Path
    epochs: int
    batch_size: int
    learning_rate: float

In [6]:
from typing import Union
from pathlib import Path
from sentimentanalyzer.utils.common import read_yaml, create_directories

In [7]:
CONFIG_FILE_PATH = Path("config/config.yaml")
PARAMS_FILE_PATH = Path("params.yaml")

class ConfigurationManager:
    def __init__(
        self,
        config_filepath: Union[str, Path] = CONFIG_FILE_PATH,
        params_filepath: Union[str, Path] = PARAMS_FILE_PATH
    ):
        self.config = read_yaml(config_filepath)
        print(">>> CONFIG CONTENTS:", self.config)
        print(">>> CONFIG KEYS:", list(self.config.keys()))

        self.params = read_yaml(params_filepath)
        print(">>> PARAMS CONTENTS:", self.params)
        print(">>> PARAMS KEYS:", list(self.params.keys()))

        # Create root directory if exists
        if 'artifacts_root' in self.config:
            create_directories([self.config.artifacts_root])

    def get_model_trainer_use_config(self) -> ModelTrainerUSEConfig:
        config = self.config.model_trainer_use
        if config is None:
            raise ValueError("Missing 'model_trainer_use' section in config file.")

        create_directories([config.root_dir])

        return ModelTrainerUSEConfig(
            root_dir=config.root_dir,
            use_model_path =config.use_model_path,
            data_path = config.data_path,
            classes=self.params.classes,
            model_save_path=config.model_save_path,
            epochs=self.params.epochs,
            batch_size=self.params.batch_size,
            learning_rate=self.params.learning_rate
        )


In [8]:

import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
from tensorflow.keras import layers, models, losses, optimizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder



  from pkg_resources import parse_version


In [9]:
class ModelTrainerUSE:
    def __init__(self, config):
        self.config = config
        self.batch_size = 32
        self.shuffle = True
        self.model = None

    def build_model(self):
        use_layer = hub.KerasLayer(self.config.use_model_path, input_shape=[], dtype=tf.string, trainable=False)
        self.model = tf.keras.Sequential([
            use_layer,
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dense(512, activation='relu'),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(self.config.classes, activation='softmax')
        ])
        self.model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=self.config.learning_rate),
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
        )

    def load_data(self):
        data_path = Path(self.config.data_path)
        df_train = pd.read_csv(data_path / 'train_clean.csv')[:1000]
        df_test = pd.read_csv(data_path / 'test_clean.csv')[:1000]

        train_df, valid_df = train_test_split(df_train, test_size=0.2, random_state=42)

        self.train_df = train_df
        self.valid_df = valid_df
        self.df_test = df_test


    def load_and_encode_data(self):
        le = LabelEncoder().fit(self.train_df['target'])
        # Transform and replace in-place by assignment
        self.train_df['target'] = le.transform(self.train_df['target'])
        self.valid_df['target'] = le.transform(self.valid_df['target'])
        self.df_test['target'] = le.transform(self.df_test['target'])


    def df_to_tf_dataset(self, df, shuffle=True, batch_size=32):
        texts = df['text'].astype(str).tolist()
        labels = df['target'].tolist()  # or 'label' depending on your df
        ds = tf.data.Dataset.from_tensor_slices((texts, labels))
        if shuffle:
            ds = ds.shuffle(buffer_size=len(df))
        return ds.batch(batch_size)

    def prepare_datasets(self):
        self.train_ds = self.df_to_tf_dataset(self.train_df)
        self.valid_ds = self.df_to_tf_dataset(self.valid_df, shuffle=False)
        self.test_ds = self.df_to_tf_dataset(self.df_test, shuffle=False)

    def train(self):
        self.model.fit(
            self.train_ds,
            validation_data=self.valid_ds,
            epochs=self.config.epochs
        )
        self.model.save(self.config.model_save_path)
        self.model.summary()


    def save_tf_datasets(self, save_dir=None):

      """
      1) prepare train/valid/test tf.data.Dataset (batched)
      2) serialize each split to TFRecord under save_dir
      """
      save_dir = self.config.root_dir
      os.makedirs(save_dir, exist_ok=True)

      self.prepare_datasets()

      for split, ds in (('train', self.train_ds),
                      ('valid', self.valid_ds),
                      ('test',  self.test_ds)):
          path = os.path.join(save_dir, f"{split}.tfrecord")
          with tf.io.TFRecordWriter(path) as writer:
              for text_batch, target_batch in ds:
                  for t, tgt in zip(text_batch, target_batch):
                      ex = tf.train.Example(features=tf.train.Features(feature={
                          'text':   tf.train.Feature(bytes_list=tf.train.BytesList(value=[t.numpy()])),
                          'target': tf.train.Feature(int64_list=tf.train.Int64List(value=[int(tgt.numpy())]))
                      }))
                      writer.write(ex.SerializeToString())

      print(f"✔️  TFRecords written to {save_dir}: "
          f"train.tfrecord, valid.tfrecord, test.tfrecord")


    

In [10]:
try:
    config = ConfigurationManager()
    model_trainer_use_config = config.get_model_trainer_use_config()
    model_trainer_use = ModelTrainerUSE(config=model_trainer_use_config)
    model_trainer_use.build_model()
    model_trainer_use.load_data()
    model_trainer_use.load_and_encode_data()
    model_trainer_use.prepare_datasets()
    model_trainer_use.train()
    model_trainer_use.save_tf_datasets()
except Exception as e:
    raise e



[2025-06-15 10:57:06,621: INFO: common: yaml file: config\config.yaml loaded successfully]
>>> CONFIG CONTENTS: {'artifacts_root': 'artifacts', 'data_ingestion': {'root_dir': 'artifacts/data_ingestion', 'source_URL': 'https://github.com/Spencer0013/NLP-Text-Summarizer-Project/raw/refs/heads/main/Dataa.zip', 'local_data_file': 'artifacts/data_ingestion/data.zip', 'unzip_dir': 'artifacts/data_ingestion'}, 'data_preprocessing': {'root_dir': 'artifacts/data_preprocessing', 'ingestion_dir': 'artifacts/data_ingestion', 'output_dir': 'artifacts/data_preprocessing'}, 'data_transformation': {'root_dir': 'artifacts/data_transformation', 'transformer_data': 'artifacts/data_transformation/transformer_data', 'data_path_train': 'artifacts/data_preprocessing/train_clean.csv', 'model_name': 'bert-base-uncased', 'data_path_test': 'artifacts/data_preprocessing/test_clean.csv', 'transformed_token_embedding_path': 'artifacts/data_transformation/token_embeddings.npy'}, 'model_trainer': {'root_dir': 'artifa

  output, from_logits = _get_logits(


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  saving_api.save_model(


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 512)               256797824 
                                                                 
 batch_normalization (Batch  (None, 512)               2048      
 Normalization)                                                  
                                                                 
 dense (Dense)               (None, 512)               262656    
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 256)               131328    
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                        