In [None]:
from flask import Flask, request, jsonify
from flask_cors import CORS
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
import tensorflow as tf
import numpy as np
import re

# ================== CONFIG ==================
# Local directory handed to `from_pretrained` for both the model and the
# tokenizer below. NOTE(review): absolute, machine-specific Windows path.
MODEL_PATH = r"D:\Analisis Opini Skincare\indobert_sentiment_model"
# Token length every input is padded/truncated to in `predict`.
MAX_LEN = 128
# Class names, indexed by the argmax position of the model output.
# NOTE(review): order presumably mirrors the label encoding used at training
# time -- confirm before changing.
LABELS = ["Negative", "Neutral", "Positive"]

# ================== LOAD MODEL & TOKENIZER ==================
# Both are loaded once at import time and shared by every request.
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# ================== FLASK APP ==================
app = Flask(__name__)
# Enable CORS on the app with flask-cors' default settings.
# NOTE(review): confirm the defaults (which origins are allowed) are intended.
CORS(app)

# ================== PREPROCESSING ==================
def preprocess(text):
    """Normalize raw text before it is tokenized.

    The input is lowercased, then the following are applied in order:
    @mentions and http(s) links are deleted (up to the next whitespace),
    #hashtags are deleted, every non-ASCII character is deleted, and runs of
    two or more whitespace characters are collapsed into one space. The
    result is returned with leading/trailing whitespace trimmed.
    """
    # (pattern, replacement) pairs; the order is significant.
    substitutions = (
        (r"(?:\@|https?\://)\S+", ""),  # mentions & links
        (r'#\w+', ''),                  # hashtags
        (r'[^\x00-\x7f]', ''),          # non-ASCII characters
        (r"\s\s+", " "),                # runs of 2+ whitespace characters
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# ================== SINGLE PREDICTION ==================
@app.route('/predict', methods=['POST'])
def predict():
    """Classify the sentiment of a single text sent as the ``text`` form field.

    Responds with HTTP 400 and ``{"error": "Missing text"}`` when the field is
    absent or empty. Otherwise returns JSON containing the raw input, the
    cleaned text, the predicted label name and the per-class probabilities.
    """
    submitted = request.form.get("text")
    if not submitted:
        return jsonify({"error": "Missing text"}), 400

    normalized = preprocess(submitted)
    model_inputs = tokenizer(
        normalized,
        max_length=MAX_LEN,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
    )

    # The first element of the model output is treated as the logits;
    # softmax over the last axis turns them into class probabilities.
    scores = model(model_inputs)[0]
    probabilities = tf.nn.softmax(scores, axis=-1).numpy()
    best_index = int(np.argmax(probabilities, axis=1)[0])

    payload = {
        "input": submitted,
        "cleaned": normalized,
        "predicted_label": LABELS[best_index],
        "probabilities": [float(p) for p in probabilities[0]],
    }
    return jsonify(payload)

# ================== HEALTH CHECK ==================
@app.route('/', methods=['GET'])
def home():
    """Health check: return a fixed JSON message showing the API is up."""
    status = {"message": "IndoBERT Sentiment API is running!"}
    return jsonify(status)

# ================== RUN ==================
# Start the server only when this file is executed directly. Host "0.0.0.0"
# listens on all network interfaces, on port 5000.
# NOTE(review): `app.run` is Flask's built-in server -- confirm whether a
# production WSGI server should be used for deployment.
if __name__ == '__main__':
    app.run(host="0.0.0.0", port=5000)


  from .autonotebook import tqdm as notebook_tqdm






All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at D:\Analisis Opini Skincare\indobert_sentiment_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://192.168.33.131:5000
Press CTRL+C to quit
127.0.0.1 - - [20/Aug/2025 14:35:27] "POST /predict HTTP/1.1" 200 -


In [None]:
from flask import Flask, request, jsonify
from flask_cors import CORS
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel
import numpy as np
import pandas as pd
import re, string

# Directory (relative to the working directory) whose `variables/variables`
# checkpoint is loaded into the rebuilt model below.
MODEL_PATH = "indobert_sentiment_model"
# Hub identifier used both for the base encoder passed to `build_model` and
# for the tokenizer.
TOKENIZER_NAME = "indobenchmark/indobert-base-p1"
# Token length every input is padded/truncated to; also the fixed input
# length of the Keras model.
MAX_LEN = 128
# Class names, indexed by the argmax position of the model output.
# NOTE(review): order presumably mirrors the label encoding used at training
# time -- confirm before changing.
LABELS = ["Negative", "Neutral", "Positive"]

def build_model(bert_model_name, num_labels, max_len):
    """Build and compile a Keras classifier on top of a pretrained encoder.

    Args:
        bert_model_name: Name or path passed to ``TFAutoModel.from_pretrained``.
        num_labels: Number of units in the final softmax layer.
        max_len: Fixed length of the ``input_ids`` / ``attention_mask`` inputs.

    Returns:
        A compiled ``tf.keras.Model`` taking ``[input_ids, attention_mask]``
        and producing softmax probabilities over ``num_labels`` classes.
    """
    encoder = TFAutoModel.from_pretrained(bert_model_name)

    # Two int32 inputs of length `max_len`, named after the tokenizer keys.
    ids_in = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    mask_in = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

    encoder_outputs = encoder({'input_ids': ids_in, 'attention_mask': mask_in})
    sequence_output = encoder_outputs[0]
    # Keep only position 0 of every sequence as the classifier feature.
    first_token = sequence_output[:, 0, :]

    classifier = tf.keras.layers.Dense(num_labels, activation='softmax')
    class_probs = classifier(first_token)

    classifier_model = tf.keras.Model(inputs=[ids_in, mask_in], outputs=class_probs)
    classifier_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier_model

# Size of the classifier's output layer (LABELS above also has three entries).
num_labels = 3
# Rebuild the architecture, then restore the weights from the checkpoint
# prefix `<MODEL_PATH>/variables/variables`.
# NOTE(review): that prefix looks like the variables checkpoint of a
# SavedModel export -- confirm it was produced by this same architecture.
model = build_model(TOKENIZER_NAME, num_labels, MAX_LEN)
model.load_weights(f"{MODEL_PATH}/variables/variables")
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

app = Flask(__name__)
# Enable CORS on the app with flask-cors' default settings.
# NOTE(review): confirm the defaults (which origins are allowed) are intended.
CORS(app)

# Patterns and tables used by `preprocess`, built once at import time instead
# of on every request.

# Character class of emoji / pictograph code points removed as the first step.
_EMOJI_RE = re.compile(
    "["
    u"\U0001F600-\U0001F64F"
    u"\U0001F300-\U0001F5FF"
    u"\U0001F680-\U0001F6FF"
    u"\U0001F1E0-\U0001F1FF"
    u"\U00002500-\U00002BEF"
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u"\U00010000-\U0010ffff"
    u"\u200d"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\u3030"
    "]+",
    flags=re.UNICODE,
)
# "@something" or "http(s)://something", up to the next whitespace.
_MENTION_OR_LINK_RE = re.compile(r"(?:\@|https?\://)\S+")
_NON_ASCII_RE = re.compile(r'[^\x00-\x7f]')
_HASHTAG_RE = re.compile(r'#\w+\b')
# Raw string: the previous non-raw "\s\s+" relied on an invalid escape
# sequence, which newer Python versions warn about.
_MULTI_SPACE_RE = re.compile(r"\s\s+")
# Deletion table for ASCII punctuation. The trailing non-ASCII entries can
# never match (non-ASCII is removed just before the table is applied); they
# are kept so the table mirrors the original character list.
_BANNED_CHARS_TABLE = str.maketrans(
    '', '', string.punctuation + 'Ã'+'±'+'ã'+'¼'+'â'+'»'+'§'
)


def preprocess(text):
    """Clean raw text before tokenization.

    Steps, in order:
      1. Remove emoji / pictograph characters.
      2. Drop carriage returns, turn newlines into spaces, lowercase.
      3. Remove @mentions and http(s) links (up to the next whitespace).
      4. Remove every remaining non-ASCII character.
      5. Remove ASCII punctuation.
      6. Hashtag removal and ``$`` / ``&`` word filtering.
      7. Collapse whitespace and trim.

    NOTE: step 5 already deletes ``#``, ``$`` and ``&``, so step 6 never finds
    anything to remove: a hashtag such as ``#glow`` survives as the word
    ``glow``. The word filter still splits on whitespace and re-joins with
    single spaces, which is what actually normalizes the spacing. The order is
    kept as-is so the cleaned text is unchanged.
    NOTE(review): presumably this matches the cleaning applied to the training
    data -- confirm before reordering the steps.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, lowercased, ASCII-only string (possibly empty).
    """
    # 1. Emoji.
    text = _EMOJI_RE.sub(r'', text)

    # 2-5. Line breaks, case, mentions/links, non-ASCII, punctuation.
    text = text.replace('\r', '').replace('\n', ' ').lower()
    text = _MENTION_OR_LINK_RE.sub("", text)
    text = _NON_ASCII_RE.sub(r'', text)
    text = text.translate(_BANNED_CHARS_TABLE)

    # 6. Hashtags, then words containing '$' or '&' (see NOTE in docstring).
    text = _HASHTAG_RE.sub('', text)
    text = text.replace("#", "")
    text = " ".join(
        "" if ('$' in word or '&' in word) else word for word in text.split()
    )

    # 7. Whitespace.
    text = _MULTI_SPACE_RE.sub(" ", text)
    return text.strip()

@app.route('/predict', methods=['POST'])
def predict():
    """Classify the sentiment of a single text sent as the ``text`` form field.

    Responds with HTTP 400 and ``{"error": "Missing text"}`` when the field is
    absent or empty. Otherwise returns JSON containing the raw input, the
    cleaned text, the predicted label name and the per-class probabilities.
    """
    submitted = request.form.get("text")
    if not submitted:
        return jsonify({"error": "Missing text"}), 400

    normalized = preprocess(submitted)
    tokens = tokenizer(
        normalized,
        max_length=MAX_LEN,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
    )

    # Forward only the two tensors the Keras model declares as inputs.
    model_inputs = {
        "input_ids": tokens["input_ids"],
        "attention_mask": tokens["attention_mask"],
    }
    class_probs = model.predict(model_inputs)
    winner = int(np.argmax(class_probs, axis=1)[0])

    payload = {
        "input": submitted,
        "cleaned": normalized,
        "predicted_label": LABELS[winner],
        "probabilities": [float(p) for p in class_probs[0]],
    }
    return jsonify(payload)

@app.route('/predict_batch', methods=['POST'])
def predict_batch():
    """Classify every row of an uploaded ``.csv`` / ``.xlsx`` file.

    The file is expected as a multipart upload under the form field ``file``.
    The text column is the first of ``opini``, ``text``, ``tweet``,
    ``komentar`` present in the header. An exact match is tried first; if none
    is found, headers are compared ignoring case and surrounding whitespace.
    Missing cell values are treated as empty strings.

    Returns:
        JSON ``{"results": [...]}`` with one entry per row holding the original
        text, the predicted label and the per-class probabilities. A file with
        no data rows yields ``{"results": []}``.
        HTTP 400 with ``{"error": ...}`` when the upload is missing, the
        extension is not ``.csv``/``.xlsx``, the file cannot be parsed, or no
        text column is found.
    """
    if 'file' not in request.files:
        return jsonify({"error": "Missing file"}), 400
    file = request.files['file']

    # Compare the extension case-insensitively ("DATA.CSV" is accepted). A
    # missing filename becomes "" and falls through to the unsupported-type
    # branch instead of raising on `.endswith`.
    filename = (file.filename or "").lower()
    try:
        if filename.endswith('.csv'):
            df = pd.read_csv(file)
        elif filename.endswith('.xlsx'):
            df = pd.read_excel(file)
        else:
            return jsonify({"error": "File must be .csv or .xlsx"}), 400
    except Exception as e:
        # Request boundary: any parser failure is reported to the client.
        return jsonify({"error": f"Error reading file: {str(e)}"}), 400

    # Opinion column (candidates can be adjusted to match the file format).
    col_candidates = ['opini', 'text', 'tweet', 'komentar']
    opini_col = next((col for col in col_candidates if col in df.columns), None)
    if opini_col is None:
        # Fallback: tolerate headers such as "Opini" or " text ". The first
        # column with a given normalized name wins.
        normalized_columns = {}
        for col in df.columns:
            normalized_columns.setdefault(str(col).strip().lower(), col)
        opini_col = next(
            (normalized_columns[col] for col in col_candidates
             if col in normalized_columns),
            None,
        )
    if opini_col is None:
        return jsonify({"error": "Kolom opini/tweet/text tidak ditemukan dalam file"}), 400

    texts = df[opini_col].fillna("").astype(str).tolist()
    if not texts:
        # No data rows: skip the tokenizer/model, which would otherwise be
        # handed an empty batch.
        return jsonify({"results": []})

    cleaned_texts = [preprocess(t) for t in texts]
    batch_enc = tokenizer(
        cleaned_texts,
        padding='max_length',
        truncation=True,
        max_length=MAX_LEN,
        return_tensors='tf'
    )
    pred_probs = model.predict({
        "input_ids": batch_enc["input_ids"],
        "attention_mask": batch_enc["attention_mask"]
    })
    pred_labels = np.argmax(pred_probs, axis=1)

    results = [
        {
            "input": t,
            "predicted_label": LABELS[int(label)],
            "probabilities": [float(x) for x in probs],
        }
        for t, label, probs in zip(texts, pred_labels, pred_probs)
    ]
    return jsonify({"results": results})

@app.route('/', methods=['GET'])
def home():
    """Health check: return a fixed JSON message showing the API is up."""
    status = {"message": "IndoBERT Sentiment API is running!"}
    return jsonify(status)

# Start the server only when this file is executed directly. Host "0.0.0.0"
# listens on all network interfaces, on port 5000.
# NOTE(review): `app.run` is Flask's built-in server -- confirm whether a
# production WSGI server should be used for deployment.
if __name__ == '__main__':
    app.run(host="0.0.0.0", port=5000)

Some layers from the model checkpoint at indobenchmark/indobert-base-p1 were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at indobenchmark/indobert-base-p1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.



 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.20.10.4:5000
Press CTRL+C to quit
