In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

2024-11-06 18:24:18.599897: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:

# Custom transformer for data preprocessing
class DataPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless transformer that loads a headerless CSV into a two-column frame.

    The file is read with six fixed column names; only ``target`` and ``text``
    are kept, rows with a missing value in either are removed, and the index
    is renumbered from zero.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Read the CSV located at ``X`` and return the cleaned DataFrame.

        Parameters
        ----------
        X : path or file-like
            Forwarded as-is to ``pd.read_csv``.

        Returns
        -------
        pandas.DataFrame
            Columns ``target`` and ``text``, no missing values, fresh
            ``RangeIndex``.
        """
        column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
        unused_columns = ['ids', 'date', 'flag', 'user']

        raw = pd.read_csv(X, encoding='utf-8', header=None, names=column_names)
        return (
            raw
            .drop(columns=unused_columns)
            .dropna()
            .reset_index(drop=True)
        )

# Custom transformer for BERT preprocessing
class BertPreprocessor(BaseEstimator, TransformerMixin):
    """scikit-learn style wrapper around a TF Hub text-preprocessing layer.

    Parameters
    ----------
    preprocessor_url : str
        TF Hub handle given to ``hub.KerasLayer``.

    Attributes
    ----------
    bert_preprocessor : hub.KerasLayer
        The loaded layer. It is created eagerly in ``__init__``, so
        constructing an instance resolves the hub handle immediately.
    """

    def __init__(self, preprocessor_url="https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"):
        self.preprocessor_url = preprocessor_url
        self.bert_preprocessor = hub.KerasLayer(preprocessor_url)

    def fit(self, X, y=None):
        """No fitting is required; returns ``self``."""
        return self

    def transform(self, X):
        """Apply the hub preprocessing layer to ``X`` and return its output unchanged."""
        return self.bert_preprocessor(X)

    def __call__(self, X):
        """Alias for :meth:`transform`.

        Lets an instance be invoked directly, ``BertPreprocessor()(inputs)``,
        the same way a Keras layer is applied when wiring a functional model.
        Without this method such a call raises ``TypeError`` because the
        object is not callable.
        """
        return self.transform(X)

# Custom estimator for BERT model
class BertClassifier(BaseEstimator):
    """Binary text classifier: TF Hub BERT encoder followed by a small dense head.

    Parameters
    ----------
    bert_url : str
        TF Hub handle of the encoder loaded with ``hub.KerasLayer``.
    batch_size : int
        Batch size used for the training ``tf.data`` pipeline.
    epochs : int
        Number of training epochs.
    learning_rate : float
        Step size for the Adam optimizer. The default (0.01) is the value
        that was previously hard-coded in ``fit``.

    Attributes
    ----------
    model : tf.keras.Model or None
        ``None`` until ``fit`` has been called; rebuilt from scratch on every
        ``fit`` call.
    """

    def __init__(self, bert_url="https://tfhub.dev/google/experts/bert/wiki_books/sst2/2",
                 batch_size=32, epochs=5, learning_rate=0.01):
        self.bert_url = bert_url
        self.batch_size = batch_size
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.model = None

    def build_model(self):
        """Construct the (uncompiled) Keras model.

        Graph: scalar string input -> BertPreprocessor -> hub encoder ->
        ``pooled_output`` -> Dropout/Dense stack -> single sigmoid unit.

        Returns
        -------
        tf.keras.Model
        """
        bert_encoder = hub.KerasLayer(self.bert_url)
        text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
        # BertPreprocessor is a scikit-learn transformer, not a Keras layer:
        # it must be applied through .transform(); calling the instance
        # directly raises TypeError unless the class defines __call__.
        preprocessed_text = BertPreprocessor().transform(text_input)
        outputs = bert_encoder(preprocessed_text)

        net = outputs['pooled_output']
        net = tf.keras.layers.Dropout(0.1)(net)
        net = tf.keras.layers.Dense(128, activation='relu')(net)
        net = tf.keras.layers.Dropout(0.2)(net)
        net = tf.keras.layers.Dense(32, activation='relu')(net)
        net = tf.keras.layers.Dropout(0.2)(net)
        net = tf.keras.layers.Dense(16, activation='relu')(net)
        net = tf.keras.layers.Dropout(0.2)(net)
        net = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')(net)

        return tf.keras.Model(inputs=[text_input], outputs=[net])

    def fit(self, X, y):
        """Build, compile and train the model on ``(X, y)``.

        Parameters
        ----------
        X : array-like of str
            Raw texts; sliced together with ``y`` by
            ``tf.data.Dataset.from_tensor_slices``.
        y : array-like
            Binary labels aligned with ``X``.

        Returns
        -------
        self
        """
        self.model = self.build_model()
        self.model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate),
            loss='binary_crossentropy',
            metrics=['accuracy']
        )

        train_data = (
            tf.data.Dataset.from_tensor_slices((X, y))
            .batch(self.batch_size)
            .prefetch(tf.data.AUTOTUNE)
        )

        # Pin training to the first GPU only when one is actually visible;
        # otherwise fall back to the CPU instead of requesting a device that
        # does not exist on this host.
        device = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'
        with tf.device(device):
            self.model.fit(train_data, epochs=self.epochs)
        return self

    def predict(self, X):
        """Return the model's sigmoid outputs for ``X``.

        The array is returned exactly as produced by ``tf.keras.Model.predict``
        (no thresholding, no reshaping).

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called. This exception subclasses
            ``AttributeError``, which is what an unfitted instance raised
            before, so existing handlers keep working.
        """
        if getattr(self, 'model', None) is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This BertClassifier instance is not fitted yet. "
                "Call 'fit' before using 'predict'."
            )
        return self.model.predict(X)

# Bundle the two stages under one Pipeline object; each stage is looked up by
# its step name through `pipeline.named_steps`.
pipeline = Pipeline(steps=[
    ('preprocessor', DataPreprocessor()),
    ('classifier', BertClassifier()),
])

# Usage example: load the CSV, train the classifier, report hold-out accuracy.
if __name__ == "__main__":
    # Load and preprocess data
    data_file = 'data.csv'
    processed_data = pipeline.named_steps['preprocessor'].fit_transform(data_file)

    # Prepare data for training
    X = processed_data['text'].values
    # Binarize the labels: 0 stays 0, every other value becomes 1.
    y = (processed_data['target'] != 0).astype('int64').values

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Train the model
    pipeline.named_steps['classifier'].fit(X_train, y_train)

    # Evaluate
    predictions = pipeline.named_steps['classifier'].predict(X_test)
    # The network ends in a single sigmoid unit, so predictions come back
    # with shape (n, 1) while y_test has shape (n,). Comparing them directly
    # broadcasts to an (n, n) matrix and yields a meaningless score, so
    # flatten to one value per sample first.
    predicted_labels = (np.ravel(predictions) > 0.5).astype('int64')
    accuracy = np.mean(predicted_labels == y_test)
    print(f"Test Accuracy: {accuracy:.2f}")