In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle


In [None]:
# Data Preprocessor
class DataPreprocessor(BaseEstimator, TransformerMixin):
    """Load a headerless six-column CSV and reduce it to text + binary target.

    The file is read with the column names
    ``target, ids, date, flag, user, text``; every column except ``target``
    and ``text`` is discarded, rows containing missing values are removed,
    the rows are shuffled with a fixed seed, and ``target`` is binarized.

    Parameters
    ----------
    encoding : str, default='utf-8'
        Text encoding handed to ``pandas.read_csv``.
        NOTE(review): the column layout looks like the Sentiment140 dump,
        which is commonly read as 'latin-1' -- confirm against the real file.

    Attributes
    ----------
    df : pandas.DataFrame or None
        The frame produced by the most recent ``transform`` call
        (``None`` until ``transform`` has run).
    """

    def __init__(self, encoding='utf-8'):
        self.encoding = encoding
        self.df = None

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, X):
        """Read the CSV at ``X`` and return the cleaned, shuffled DataFrame.

        Parameters
        ----------
        X : path or file-like
            Anything ``pandas.read_csv`` accepts as its first argument.

        Returns
        -------
        pandas.DataFrame
            Columns ``target`` (0 or 1) and ``text``, with a fresh 0..n-1
            index. The same frame is also stored on ``self.df``.
        """
        # Read and preprocess the data
        raw = pd.read_csv(
            X,
            encoding=self.encoding,
            header=None,
            names=['target', 'ids', 'date', 'flag', 'user', 'text'],
        )
        df = raw.drop(columns=['ids', 'date', 'flag', 'user']).dropna()
        df = shuffle(df, random_state=42).reset_index(drop=True)

        # Convert target to binary: 0 stays 0, every other value becomes 1.
        # Vectorized comparison instead of a per-row Python lambda.
        df['target'] = (df['target'] != 0).astype('int64')

        self.df = df
        return df

# BERT Model
class BertModel(BaseEstimator):
    """Binary text classifier: a TF-Hub BERT encoder followed by a dense head.

    Parameters
    ----------
    batch_size : int, default=32
        Batch size of the ``tf.data`` dataset built in ``fit``.
    epochs : int, default=5
        Number of training epochs passed to Keras ``fit``.
    learning_rate : float, default=0.01
        Learning rate of the Adam optimizer used in ``fit``.
    bert_url : str
        TF-Hub handle of the encoder whose ``pooled_output`` feeds the head.
    preprocessor_url : str
        TF-Hub handle of the text preprocessing layer applied to raw strings.

    Attributes
    ----------
    model : tf.keras.Model or None
        The Keras model; ``None`` until ``fit`` or ``load_model`` has run.

    Notes
    -----
    Every hyperparameter is a constructor argument stored under the same
    name, so ``get_params``/``set_params``/``clone`` from ``BaseEstimator``
    can see and carry all of them.
    """

    def __init__(
        self,
        batch_size=32,
        epochs=5,
        learning_rate=0.01,
        bert_url="https://tfhub.dev/google/experts/bert/wiki_books/sst2/2",
        preprocessor_url="https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3",
    ):
        self.batch_size = batch_size
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.bert_url = bert_url
        self.preprocessor_url = preprocessor_url
        self.model = None

    def build_model(self):
        """Assemble and return the (uncompiled) Keras model.

        The graph is: scalar string input -> hub preprocessor -> hub encoder
        -> ``pooled_output`` -> Dropout/Dense stack -> 1-unit sigmoid layer
        named ``classifier``.
        """
        # Create the BERT model
        text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
        # NOTE(review): both hub layers are created without trainable=True,
        # so presumably only the dense head is trained -- confirm intent.
        preprocessor = hub.KerasLayer(self.preprocessor_url)
        encoder = hub.KerasLayer(self.bert_url)

        # Preprocess text
        preprocessed_text = preprocessor(text_input)
        outputs = encoder(preprocessed_text)

        # Build the neural network
        net = outputs['pooled_output']
        net = tf.keras.layers.Dropout(0.1)(net)
        net = tf.keras.layers.Dense(128, activation='relu')(net)
        net = tf.keras.layers.Dropout(0.2)(net)
        net = tf.keras.layers.Dense(32, activation='relu')(net)
        net = tf.keras.layers.Dropout(0.2)(net)
        net = tf.keras.layers.Dense(16, activation='relu')(net)
        net = tf.keras.layers.Dropout(0.2)(net)
        net = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')(net)

        return tf.keras.Model(text_input, net)

    def fit(self, X, y):
        """Build a fresh model, compile it and train it on ``(X, y)``.

        Parameters
        ----------
        X : array-like of str
            Raw texts; sliced together with ``y`` into a ``tf.data.Dataset``.
        y : array-like
            Binary labels aligned with ``X``.

        Returns
        -------
        BertModel
            ``self``, with ``self.model`` replaced by the newly trained model.
        """
        # Build and compile the model
        self.model = self.build_model()
        self.model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate),
            loss='binary_crossentropy',
            metrics=['accuracy']
        )

        # Prepare the dataset
        train_data = (
            tf.data.Dataset.from_tensor_slices((X, y))
            .batch(self.batch_size)
            .prefetch(tf.data.AUTOTUNE)
        )

        # Train the model, on the first GPU when one is visible, else on CPU
        print("Training the model...")
        has_gpu = len(tf.config.list_physical_devices('GPU')) > 0
        with tf.device('/GPU:0' if has_gpu else '/CPU:0'):
            self.model.fit(train_data, epochs=self.epochs)
        return self

    def predict(self, X):
        """Return the Keras model's raw predictions for ``X``.

        The result is whatever ``self.model.predict(X)`` returns; with the
        single-unit sigmoid head this is presumably an ``(n, 1)`` array of
        probabilities, so callers comparing against 1-D labels should
        flatten it first.

        Raises
        ------
        ValueError
            If no model has been fitted or loaded yet.
        """
        if self.model is None:
            raise ValueError("Model needs to be fitted before making predictions")
        return self.model.predict(X)

    def save_model(self, path):
        """Save the model to ``path`` without optimizer state.

        Does nothing when no model has been fitted or loaded.
        """
        if self.model is not None:
            self.model.save(path, include_optimizer=False)

    def load_model(self, path):
        """Load a saved Keras model from ``path`` into ``self.model``.

        ``hub.KerasLayer`` is registered as a custom object so the hub layers
        in the saved graph can be deserialized.
        """
        self.model = tf.keras.models.load_model(
            path,
            custom_objects={'KerasLayer': hub.KerasLayer}
        )

# Create the pipeline
# make_pipeline derives each step name from the lowercased class name, which
# is why the steps are looked up as 'datapreprocessor' and 'bertmodel' below.
pipeline = make_pipeline(DataPreprocessor(), BertModel(batch_size=32, epochs=5))

# Main execution
# Loads the CSV, trains on a 1% sample, reports test accuracy and saves the
# trained model. Names are kept at module level so they stay inspectable
# after the run.
if __name__ == "__main__":
    # Set random seed for reproducibility
    tf.random.set_seed(42)
    np.random.seed(42)

    # Check for GPU availability
    print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

    # Process the data
    data_file = 'data.csv'  # Replace with your data file path
    processed_data = pipeline.named_steps['datapreprocessor'].fit_transform(data_file)

    # Create smaller dataset for testing (1/100 of original size)
    original_shape = processed_data.shape
    new_size = original_shape[0] // 100
    smaller_data = processed_data.sample(n=new_size, random_state=42)

    # Prepare features and target
    X = smaller_data['text'].values
    y = smaller_data['target'].values

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Train the model
    print("Starting model training...")
    pipeline.named_steps['bertmodel'].fit(X_train, y_train)

    # Make predictions
    print("Making predictions...")
    predictions = pipeline.named_steps['bertmodel'].predict(X_test)

    # Calculate accuracy
    # The network ends in a single-unit Dense layer, so the predictions carry
    # a trailing axis of size 1. Comparing that directly with the 1-D y_test
    # would broadcast to an (n, n) matrix and average over every
    # prediction/label pair; flatten first so the comparison is element-wise.
    predicted_labels = np.ravel(predictions) > 0.5
    accuracy = np.mean(predicted_labels == y_test)
    print(f"Test Accuracy: {accuracy:.2f}")

    # Save the model
    print("Saving the model...")
    pipeline.named_steps['bertmodel'].save_model('bert_sentiment_model')

    print("Pipeline execution completed successfully!")
