In [1]:
import os

In [2]:
%pwd

'd:\\Hate-Speech-Classifier\\research'

In [3]:
# Move the working directory up one level (the surrounding %pwd outputs show
# research/ -> its parent). The move is relative, so re-running this cell
# climbs one more level each time; restart the kernel before re-running.
os.chdir(os.pardir)

In [4]:
%pwd

'd:\\Hate-Speech-Classifier'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    """Read-only settings for the model-training stage.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.
    """

    # Directory that receives the training outputs (tokenizer, metrics, model).
    root_dir: Path
    # Directory that holds the input CSV read by the trainer.
    data_path: Path
    # Vocabulary cap: used as the tokenizer's num_words and the embedding's input size.
    max_words: int
    # Length every token sequence is padded/truncated to.
    max_len: int
    # Number of samples per gradient update, forwarded to model.fit.
    batch_size: int
    # Number of passes over the training data, forwarded to model.fit.
    epochs: int
    # Fraction of the training data held out for validation, forwarded to model.fit.
    validation_split: float

In [6]:
from textClassification.constants import *
from textClassification.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects.

    The loaded documents are accessed by attribute (``self.config.model_trainer``,
    ``self.params.TrainingArguments``), so ``read_yaml`` must return an object
    that supports attribute access.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and create the top-level artifacts directory.

        Args:
            config_filepath: Location of the config YAML (paths / stage layout).
            params_filepath: Location of the params YAML (training arguments).
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the ModelTrainerConfig for the training stage.

        Creates the stage's output directory as a side effect.

        Returns:
            A ModelTrainerConfig combining the ``model_trainer`` section of the
            config file with the ``TrainingArguments`` section of the params file.
        """
        config = self.config.model_trainer
        params = self.params.TrainingArguments

        create_directories([config.root_dir])

        # The dataclass declares both locations as Path, but YAML yields plain
        # values; convert here so the declared types actually hold.
        return ModelTrainerConfig(
            root_dir=Path(config.root_dir),
            data_path=Path(config.data_path),
            max_words=params.max_words,
            max_len=params.max_len,
            batch_size=params.batch_size,
            epochs=params.epochs,
            validation_split=params.validation_split,
        )

In [8]:
import pandas as pd
import pickle
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from keras.optimizers import RMSprop
from sklearn.model_selection import train_test_split
import os
import json

In [9]:
class ModelTrainer:
    """Trains and evaluates an LSTM classifier on the tweet dataset.

    All inputs and outputs are located through the supplied ModelTrainerConfig:
    ``main_df.csv`` is read from ``data_path``; ``tokenizer.pickle``,
    ``metrics.json`` and ``model.h5`` are written to ``root_dir``.
    """

    def __init__(self, config: ModelTrainerConfig):
        """Store the training configuration."""
        self.config = config

    def _build_model(self):
        """Assemble and compile the Embedding -> LSTM -> sigmoid network."""
        model = Sequential()
        # Declare the input shape explicitly. Embedding's `input_length`
        # argument is deprecated in Keras 3, and without a known input shape
        # the model would still be unbuilt when summary() is called.
        model.add(keras.Input(shape=(self.config.max_len,)))
        model.add(Embedding(self.config.max_words, 100))
        model.add(SpatialDropout1D(0.2))
        model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
        model.add(Dense(1, activation='sigmoid'))
        model.summary()

        model.compile(loss='binary_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])
        return model

    def _vectorize(self, tokenizer, texts):
        """Turn raw texts into a padded integer matrix of width ``max_len``."""
        sequences = tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=self.config.max_len)

    def train(self):
        """Run the full training stage.

        Steps: load the CSV, split into train/test, fit and persist the
        tokenizer, train the model, evaluate it on the held-out split, then
        write the metrics and the trained model to ``root_dir``.
        """
        cfg = self.config

        df = pd.read_csv(os.path.join(cfg.data_path, "main_df.csv"))
        # Force every tweet to str so the tokenizer never receives NaN/float.
        df['tweet'] = df['tweet'].astype(str)

        # Split the data into train and test (fixed seed for a repeatable split).
        x_train, x_test, y_train, y_test = train_test_split(
            df['tweet'], df['label'], random_state=42
        )

        print(len(x_train), len(y_train))
        print(len(x_test), len(y_test))

        # The vocabulary is learned from the training split only.
        tokenizer = Tokenizer(num_words=cfg.max_words)
        tokenizer.fit_on_texts(x_train)
        train_matrix = self._vectorize(tokenizer, x_train)

        # Persist the tokenizer so the same word index can be reused later.
        with open(os.path.join(cfg.root_dir, 'tokenizer.pickle'), 'wb') as handle:
            pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

        model = self._build_model()

        # Start model training.
        model.fit(
            train_matrix,
            y_train,
            batch_size=cfg.batch_size,
            epochs=cfg.epochs,
            validation_split=cfg.validation_split,
        )

        # Model evaluation on the held-out split; with the compile() call
        # above, evaluate() yields the loss followed by the accuracy.
        test_matrix = self._vectorize(tokenizer, x_test)
        accr = model.evaluate(test_matrix, y_test)

        metrics = {"eval": accr}

        with open(os.path.join(cfg.root_dir, 'metrics.json'), "w") as file:
            json.dump(metrics, file)

        # Save the model.
        # NOTE(review): the .h5 name selects the legacy HDF5 format; it is kept
        # unchanged because other stages may load this exact file name.
        model.save(os.path.join(cfg.root_dir, 'model.h5'))


In [10]:
# Run the model-training stage end to end. Any exception propagates to the
# notebook unchanged, so no try/except wrapper is needed.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
# Keep the trainer under its own name instead of rebinding the config variable.
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()

[2025-01-05 18:50:28,363: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-01-05 18:50:28,439: INFO: common: yaml file: params.yaml loaded successfully]
[2025-01-05 18:50:28,439: INFO: common: created directory at: artifacts]
[2025-01-05 18:50:28,439: INFO: common: created directory at: artifacts/model_trainer]
42558 42558
14187 14187




[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m395s[0m 1s/step - accuracy: 0.8242 - loss: 0.3949 - val_accuracy: 0.9376 - val_loss: 0.1847
[1m444/444[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 108ms/step - accuracy: 0.9305 - loss: 0.2043
