In [1]:
import os

In [2]:
%pwd

'e:\\Hate-Speech-Classification\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'e:\\Hate-Speech-Classification'

In [13]:
# Define the entity
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings bundle consumed by the model-training stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    The field order below is the positional-constructor order and must not
    be rearranged.
    """

    # ---- artifact locations -------------------------------------------
    root_dir: Path              # output directory created for this stage
    trained_model_path: Path    # destination handed to ``model.save``
    x_test_data_path: Path      # CSV destination for the held-out texts
    x_train_data_path: Path     # CSV destination for the training texts
    y_test_data_path: Path      # CSV destination for the held-out labels

    # ---- training hyper-parameters --------------------------------------
    Random_state: int           # ``random_state`` for train_test_split
    Epoch: int                  # ``epochs`` for ``model.fit``
    Batch_size: int             # ``batch_size`` for ``model.fit``
    Validation_Split: float     # ``validation_split`` for ``model.fit``
    Max_Words: int              # Tokenizer ``num_words`` / Embedding ``input_dim``
    Max_Len: int                # pad_sequences ``maxlen`` / Embedding ``input_length``
    Loss: str                   # ``loss`` for ``model.compile``
    Metrics: list               # ``metrics`` for ``model.compile``
    Activation: str             # activation of the final Dense layer
    test_size: float            # ``test_size`` for train_test_split
    layers: int                 # Embedding ``output_dim``

In [14]:
# Configuration manager
from Hate_Speech_Classification.constrants import * # Import Everything
from Hate_Speech_Classification.utils.common import read_yaml,create_directories 

In [15]:
class ConfigurationManager:
    """Loads the project's config/params YAML files and builds stage configs."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        # The loaded objects are accessed by attribute below
        # (e.g. ``.artifacts_root``); per the original author's note
        # ``read_yaml`` returns a Box-type object -- TODO confirm in common.py.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the ``ModelTrainerConfig`` for the training stage.

        Creates the stage's ``root_dir`` as a side effect, then copies the
        path entries from the ``model_trainer`` config section and the
        hyper-parameters from the ``TrainingArguments`` params section.

        Returns:
            A populated ``ModelTrainerConfig``.
        """
        trainer_section = self.config.model_trainer
        training_args = self.params.TrainingArguments

        create_directories([trainer_section["root_dir"]])

        # Names are listed in the same order as the dataclass fields.
        path_fields = (
            "root_dir",
            "trained_model_path",
            "x_test_data_path",
            "x_train_data_path",
            "y_test_data_path",
        )
        hyperparameter_fields = (
            "Random_state",
            "Epoch",
            "Batch_size",
            "Validation_Split",
            "Max_Words",
            "Max_Len",
            "Loss",
            "Metrics",
            "Activation",
            "test_size",
            "layers",
        )

        settings = {name: getattr(trainer_section, name) for name in path_fields}
        settings.update(
            (name, getattr(training_args, name)) for name in hyperparameter_fields
        )
        return ModelTrainerConfig(**settings)

In [8]:
import os 
import sys
import pickle
import pandas as pd
from Hate_Speech_Classification.Logging import logging
from Hate_Speech_Classification.constrants import * # Import Everything
from Hate_Speech_Classification.Exception import CustomException
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.optimizers import RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import LSTM,Activation,Dense,Dropout,Input,Embedding,SpatialDropout1D




In [16]:
class ModelTrainer:
    """Trains and persists the LSTM tweet classifier described by a config.

    The workflow (see ``initiate_model_trainer``) is: load and split the
    transformed CSV, build the Keras model, tokenize/pad the training texts,
    fit the model, then write the tokenizer, the model and the split data to
    disk. Any failure is reported as a ``CustomException``.
    """

    # Locations the original implementation hard-coded; kept as defaults so
    # existing callers behave exactly as before.
    DEFAULT_DATA_PATH = "artifacts/data_transformation/final.csv"
    DEFAULT_TOKENIZER_PATH = "tokenizer.pickle"

    def __init__(self, config: ModelTrainerConfig):
        """Store the training configuration.

        Args:
            config: Paths and hyper-parameters for this training run.
        """
        self.config = config

    @staticmethod
    def _ensure_parent_dir(path):
        """Create the directory that will contain ``path`` if it has one."""
        parent = os.path.dirname(path)
        if parent:  # a bare file name has no parent directory to create
            os.makedirs(parent, exist_ok=True)

    def spliting_data(self, data_path=DEFAULT_DATA_PATH):
        """Load the transformed dataset and split it into train/test parts.

        Args:
            data_path: CSV file containing ``tweet`` and ``label`` columns.

        Returns:
            Tuple ``(x_train, x_test, y_train, y_test)`` as produced by
            ``train_test_split``; the texts are lower-cased strings.

        Raises:
            CustomException: If reading, validating or splitting fails.
        """
        try:
            logging.info("Entered the spliting_data function")
            df = pd.read_csv(data_path, index_col=False)

            # Fail with an explicit message instead of a bare KeyError('tweet').
            missing = [col for col in ("tweet", "label") if col not in df.columns]
            if missing:
                raise KeyError(f"{data_path} is missing required column(s): {missing}")

            logging.info("Splitting the data into x and y")
            y = df["label"]

            # NaN -> '', then force every value to str (some rows may parse as
            # float), then lower-case -- vectorised rather than per-row lambdas.
            logging.info("Checking and handling NaN or float values in the text data")
            x = df["tweet"].fillna("").astype(str).str.lower()

            logging.info("Applying train_test_split on the data")
            x_train, x_test, y_train, y_test = train_test_split(
                x,
                y,
                test_size=self.config.test_size,
                random_state=self.config.Random_state,
            )

            logging.info(f"Train size: {len(x_train)}, Test size: {len(x_test)}")
            return x_train, x_test, y_train, y_test
        except Exception as e:
            raise CustomException(e, sys) from e

    def tokenizing(self, x_train):
        """Fit a tokenizer on the training texts and pad the sequences.

        Args:
            x_train: Iterable of training texts.

        Returns:
            Tuple ``(sequences_matrix, tokenizer)``: the padded integer
            sequences (``maxlen=config.Max_Len``) and the fitted tokenizer.

        Raises:
            CustomException: If tokenizing or padding fails.
        """
        try:
            logging.info("Tokenizing the data")
            tokenizer = Tokenizer(num_words=self.config.Max_Words)
            tokenizer.fit_on_texts(x_train)
            sequences = tokenizer.texts_to_sequences(x_train)
            sequences_matrix = pad_sequences(sequences, maxlen=self.config.Max_Len)
            return sequences_matrix, tokenizer
        except Exception as e:
            raise CustomException(e, sys) from e

    def get_model(self):
        """Build and compile the network.

        Architecture: Embedding -> SpatialDropout1D(0.2) -> LSTM(100) ->
        Dense(1). The model summary is printed as a side effect.

        Returns:
            The compiled Keras ``Sequential`` model.

        Raises:
            CustomException: If building or compiling the model fails.
        """
        try:
            model = Sequential()
            model.add(
                Embedding(
                    input_dim=self.config.Max_Words,
                    output_dim=self.config.layers,
                    input_length=self.config.Max_Len,
                )
            )
            model.add(SpatialDropout1D(0.2))
            model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
            model.add(Dense(1, activation=self.config.Activation))
            model.summary()
            model.compile(
                loss=self.config.Loss,
                optimizer=RMSprop(),
                metrics=self.config.Metrics,
            )
            return model
        except Exception as e:
            raise CustomException(e, sys) from e

    def initiate_model_trainer(
        self,
        data_path=DEFAULT_DATA_PATH,
        tokenizer_path=DEFAULT_TOKENIZER_PATH,
    ):
        """Run the full training stage and persist its outputs.

        Args:
            data_path: CSV to train from (forwarded to ``spliting_data``).
            tokenizer_path: Where the fitted tokenizer is pickled. The default
                is relative to the current working directory, as before.

        Returns:
            Dict with the keys ``trained_model_path``, ``x_test_path`` and
            ``y_test_path`` taken from the config.

        Raises:
            CustomException: If any step fails. Errors already wrapped by the
                helper methods are propagated unchanged.
        """
        try:
            logging.info("Initiating model training")
            x_train, x_test, y_train, y_test = self.spliting_data(data_path)

            model = self.get_model()

            sequences_matrix, tokenizer = self.tokenizing(x_train)

            logging.info("Training the model")
            model.fit(
                sequences_matrix,
                y_train,
                batch_size=self.config.Batch_size,
                epochs=self.config.Epoch,
                validation_split=self.config.Validation_Split,
            )

            logging.info("Saving tokenizer and model")
            # Create output locations before the first write, not after it.
            os.makedirs(self.config.root_dir, exist_ok=True)
            self._ensure_parent_dir(tokenizer_path)
            with open(tokenizer_path, "wb") as handle:
                pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
            self._ensure_parent_dir(self.config.trained_model_path)
            model.save(self.config.trained_model_path)

            logging.info("Saving test and train data")
            # NOTE(review): y_train is not persisted -- the config exposes no
            # y_train path. Confirm whether downstream stages need it.
            outputs = (
                (x_test, self.config.x_test_data_path),
                (y_test, self.config.y_test_data_path),
                (x_train, self.config.x_train_data_path),
            )
            for series, destination in outputs:
                self._ensure_parent_dir(destination)
                series.to_csv(destination)

            return {
                "trained_model_path": self.config.trained_model_path,
                "x_test_path": self.config.x_test_data_path,
                "y_test_path": self.config.y_test_data_path,
            }
        except CustomException:
            # Raised by one of the helper methods above; do not wrap it twice.
            raise
        except Exception as e:
            raise CustomException(e, sys) from e

In [17]:
# Pipeline
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer = ModelTrainer(config=model_trainer_config)
    # Keep the trainer object and its result under separate names; the result
    # is the dict of output paths returned by initiate_model_trainer().
    model_trainer_artifacts = model_trainer.initiate_model_trainer()

except CustomException:
    # Already wrapped by a lower layer (ModelTrainer raises CustomException);
    # propagate as-is instead of wrapping it a second time.
    raise
except Exception as e:
    raise CustomException(e, sys) from e

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 300, 100)          5000000   
                                                                 
 spatial_dropout1d_2 (Spati  (None, 300, 100)          0         
 alDropout1D)                                                    
                                                                 
 lstm_2 (LSTM)               (None, 100)               80400     
                                                                 
 dense_2 (Dense)             (None, 1)                 101       
                                                                 
Total params: 5080501 (19.38 MB)
Trainable params: 5080501 (19.38 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


  saving_api.save_model(
