In [1]:
from google.colab import drive

# Attach Google Drive inside the Colab VM; the CSV read later in this
# notebook is addressed through this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_decision_forests as tfdf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import optuna

# 1. Load only the tone score and text columns, then drop incomplete rows
usecols = ['DocTone', 'ContextualText']
data_file = '/content/drive/MyDrive/ggg_sg.csv'
data = pd.read_csv(data_file, usecols=usecols)
data.dropna(subset=usecols, inplace=True)

# 2. Compute the four quintile cut points of the tone score; they split the
#    rows into five roughly equal-sized sentiment buckets
quintile_levels = [0.2, 0.4, 0.6, 0.8]
quantiles = data['DocTone'].quantile(quintile_levels).to_numpy()

def map_sentiment(score, thresholds=None):
    """Map a numeric tone score onto one of five ordered sentiment labels.

    Args:
        score: Tone score to classify.
        thresholds: Four ascending, inclusive upper bounds for the
            'Strongly Negative', 'Negative', 'Neutral' and 'Positive'
            buckets, in that order. When omitted, the module-level
            ``quantiles`` array is used, so existing single-argument calls
            (for example ``Series.apply(map_sentiment)``) behave as before.

    Returns:
        The label of the first bucket whose upper bound is >= ``score``, or
        'Strongly Positive' when the score exceeds every bound.
    """
    if thresholds is None:
        thresholds = quantiles

    bounded_labels = ('Strongly Negative', 'Negative', 'Neutral', 'Positive')
    # Walk the buckets from lowest to highest; bounds are inclusive.
    for upper_bound, label in zip(thresholds, bounded_labels):
        if score <= upper_bound:
            return label
    return 'Strongly Positive'

# Attach the textual sentiment bucket to every row
data['Sentiment'] = data['DocTone'].map(map_sentiment)

# 3. Convert the textual buckets into integer class ids
label_encoder = LabelEncoder()
label_encoder.fit(data['Sentiment'])
data['SentimentLabel'] = label_encoder.transform(data['Sentiment'])

# 4. Turn the raw text into fixed-length integer id sequences
max_sequence_length = 100
text_column = data['ContextualText'].astype(str)
texts = text_column.tolist()

# Learn the vocabulary from the corpus, then encode each document with it
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)

# Shorter documents are zero-padded at the end up to max_sequence_length
data_padded = pad_sequences(
    sequences,
    padding='post',
    maxlen=max_sequence_length,
)

# 5. Create a token embedding table.
#    NOTE: the layer is never trained, so these are randomly initialised
#    vectors (a fixed random projection of the vocabulary), not learned
#    Word2Vec embeddings.
vocab_size = len(word_index) + 1  # +1: id 0 is the padding id skipped when averaging
embedding_dim = 100
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length)

# A Keras layer owns no weights until it has been built; without this call
# get_weights() returns an empty list and indexing [0] raises IndexError.
embedding_layer.build((None, max_sequence_length))
embedding_matrix = embedding_layer.get_weights()[0]

def get_average_embeddings(data_padded, embedding_matrix):
    """Average the embedding vectors of the non-padding tokens in each sequence.

    Args:
        data_padded: Iterable of integer token-id sequences in which id 0
            marks padding and is ignored.
        embedding_matrix: 2-D array whose row ``i`` is the vector for token
            id ``i``.

    Returns:
        A 2-D array of shape ``(len(data_padded), embedding_matrix.shape[1])``.
        Rows for sequences that contain only padding are all zeros.
    """
    table = np.asarray(embedding_matrix)
    # Take the vector width from the matrix itself rather than from a
    # module-level global, so the function works with any embedding table.
    width = table.shape[1]
    out_dtype = table.dtype if np.issubdtype(table.dtype, np.floating) else np.float64

    averaged = np.zeros((len(data_padded), width), dtype=out_dtype)
    for row, sequence in enumerate(data_padded):
        token_ids = np.asarray(sequence)
        real_tokens = token_ids[token_ids != 0]
        if real_tokens.size:
            # Gather all token vectors in one indexing operation and average them.
            averaged[row] = table[real_tokens].mean(axis=0)
    return averaged

# One averaged embedding vector per document
features = get_average_embeddings(data_padded, embedding_matrix)

# 6. Hold out 20% of the rows for testing (fixed seed for a repeatable split)
labels = data['SentimentLabel'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# 7. Wrap both splits as batched tf.data datasets for TFDF
batch_size = 1024
train_slices = tf.data.Dataset.from_tensor_slices((X_train, y_train))
test_slices = tf.data.Dataset.from_tensor_slices((X_test, y_test))
train_dataset = train_slices.batch(batch_size)
test_dataset = test_slices.batch(batch_size)

# 8. Hyperparameter tuning using Optuna
def objective(trial):
    """Optuna objective: train a random forest and return its negated accuracy.

    Args:
        trial: Optuna trial used to sample ``num_trees``, ``max_depth`` and
            ``min_examples``.

    Returns:
        The negative of the accuracy measured on ``test_dataset``, so that a
        minimising study favours higher accuracy.
    """
    params = {
        'num_trees': trial.suggest_int('num_trees', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'min_examples': trial.suggest_int('min_examples', 2, 10),
    }
    model = tfdf.keras.RandomForestModel(
        task=tfdf.keras.Task.CLASSIFICATION,
        **params,
    )
    model.fit(train_dataset)

    # evaluate() only reports metrics registered through compile(); without
    # this the result has no 'accuracy' entry to look up.
    model.compile(metrics=["accuracy"])
    evaluation = model.evaluate(test_dataset, return_dict=True)

    # NOTE(review): trials are scored on test_dataset, so the test accuracy
    # reported later is optimistically biased; a separate validation split
    # would avoid this.
    return -evaluation['accuracy']

# Minimising is Optuna's default direction; it is spelled out here because
# objective() returns the negated accuracy.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

# Hyperparameters of the best-scoring trial
best_params = study.best_params

# 9. Train final model
best_params['min_examples'] = max(best_params['min_examples'], 5)  # Increase min_examples
final_model = tfdf.keras.RandomForestModel(
    num_trees=best_params['num_trees'],
    max_depth=best_params['max_depth'],
    min_examples=best_params['min_examples'],
    task=tfdf.keras.Task.CLASSIFICATION
)
final_model.fit(train_dataset)

# evaluate() only reports metrics registered through compile(); without this
# the result has no 'accuracy' entry to look up.
final_model.compile(metrics=["accuracy"])

# 10. Evaluate model
evaluation = final_model.evaluate(test_dataset, return_dict=True)
print("Test Accuracy:", evaluation['accuracy'])

# Predictions and classification report
# predict() yields one score per class; the highest-scoring class is the prediction.
y_pred = np.argmax(final_model.predict(test_dataset), axis=1)
# Pass the full list of class ids so target_names stays aligned even if a
# class happens to be absent from the test split.
class_ids = np.arange(len(label_encoder.classes_))
print(classification_report(y_test, y_pred, labels=class_ids, target_names=label_encoder.classes_))

# 11. Incremental learning (simulated)
# Re-fitting an existing forest does not add trees to it, and the tree count
# cannot be changed by assigning to an attribute on the model. Each round
# therefore trains a fresh model with 10 more trees than the previous one
# (50, 60, 70, 80, 90); the last one is evaluated.
incremental_model = None
for i in range(5):
    print(f"Training iteration {i+1}")
    incremental_model = tfdf.keras.RandomForestModel(
        num_trees=50 + 10 * i,
        max_depth=best_params['max_depth'],
        min_examples=best_params['min_examples'],
        task=tfdf.keras.Task.CLASSIFICATION
    )
    incremental_model.fit(train_dataset)

# evaluate() only reports metrics registered through compile().
incremental_model.compile(metrics=["accuracy"])
evaluation = incremental_model.evaluate(test_dataset, return_dict=True)
print("Incremental Model Test Accuracy:", evaluation['accuracy'])
