Import TensorFlow and supporting libraries

In [233]:
import os
import re
import shutil
import string

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses


In [234]:
# Record the TensorFlow version this run was executed with.
print(tf.__version__)

2.18.0


In [235]:
import pandas as pd
import os

# These strings were previously an orphaned list body (no assignment, no
# brackets), which is a syntax error when the cell runs. They are kept here as
# a named list.
# NOTE(review): presumably the human-readable topic names; the order differs
# from `categories` used for training and "Buying/Selling" has no counterpart
# there — confirm before using this list to label model outputs.
category_display_names = [
    "Technology",
    "Finance",
    "Design & Arts",
    "Engineering",
    "Health & Medicine",
    "Sports",
    "Volunteering",
    "Career Advice",
    "Startups / Entrepreneurship",
    "Study",
    "Internships/Jobs",
    "Buying/Selling",
]


In [236]:
# Google Drive can only be mounted from a Colab runtime. The dataset used in
# this notebook is read from the relative "data" directory, so on any other
# kernel the mount is skipped instead of aborting the run with ImportError.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

In [297]:
# Root folder holding one sub-folder per category, each with a
# "<category>_posts_1000.csv" file that has a "post" column.
base_dir = "data"
# A label id is the position of the category in this list
# (0 = technology, 1 = sports, ..., 10 = study).
categories = ["technology", "sports","finance","design_arts","engineering","health_medicine","volunteering","career_advice","entrepreneurship","internships_jobs","study"]
texts = []
labels = []

for idx, category in enumerate(categories):
    csv_path = os.path.join(base_dir, category, f"{category}_posts_1000.csv")
    df = pd.read_csv(csv_path)
    # Drop missing posts and force str so no float NaN reaches the text
    # vectorizer; texts and labels stay the same length by construction.
    posts = df["post"].dropna().astype(str).tolist()
    texts.extend(posts)
    labels.extend([idx] * len(posts))

print(f"Loaded {len(texts)} texts with {len(set(labels))} labels")

Loaded 11000 texts with 11 labels


In [298]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the posts for evaluation. Stratifying on the labels and
# fixing the seed makes the split class-balanced and repeatable.
split_options = dict(test_size=0.2, stratify=labels, random_state=42)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, **split_options
)

In [299]:
import tensorflow as tf

# Vocabulary cap and fixed number of token ids produced per sample.
max_features = 10000
sequence_length = 50

# Text -> integer token ids, always `sequence_length` ids per sample.
vectorize_layer = layers.TextVectorization(
    output_mode='int',
    max_tokens=max_features,
    output_sequence_length=sequence_length,
)

# Learn the vocabulary from the training split only.
vectorize_layer.adapt(train_texts)


In [300]:
def vectorize_text(text, label):
    """Turn a raw (text, label) pair into (token ids, label) via `vectorize_layer`."""
    return vectorize_layer(text), label


batch_size = 32

# Pair every text with its label as tf.data datasets.
train_texts_ds = tf.data.Dataset.from_tensor_slices((train_texts, train_labels))
test_texts_ds = tf.data.Dataset.from_tensor_slices((test_texts, test_labels))

# Vectorize each element, then batch and prefetch. Only the training stream is
# shuffled; the evaluation stream keeps its order.
train_ds = (
    train_texts_ds
    .map(vectorize_text)
    .shuffle(10000)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)
test_ds = (
    test_texts_ds
    .map(vectorize_text)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)


In [302]:
# Derive the output width from the label list instead of hard-coding it, so
# adding or removing a category cannot leave the classifier head out of sync.
num_classes = len(categories)

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_features + 1, 16),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.GlobalAveragePooling1D(),  # averages over the token axis
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(num_classes)  # one raw logit per category (no activation)
])

# from_logits=True because the final Dense layer applies no softmax.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy']
)

# Train on the pre-vectorized batches. The held-out split built above doubles
# as the validation data here.
history = model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=10
)

Epoch 1/10
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 13ms/step - accuracy: 0.4833 - loss: 2.3068 - val_accuracy: 0.9786 - val_loss: 1.7893
Epoch 2/10
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9ms/step - accuracy: 0.9615 - loss: 1.5420 - val_accuracy: 0.9968 - val_loss: 0.8331
Epoch 3/10
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - accuracy: 0.9945 - loss: 0.7153 - val_accuracy: 0.9986 - val_loss: 0.3536
Epoch 4/10
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.9974 - loss: 0.3407 - val_accuracy: 0.9995 - val_loss: 0.1739
Epoch 5/10
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.9985 - loss: 0.1915 - val_accuracy: 0.9995 - val_loss: 0.0984
Epoch 6/10
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.9986 - loss: 0.1236 - val_accuracy: 1.0000 - val_loss: 0.0614
Epoch 7/10
[1m275/275

In [304]:
# End-to-end inference pipeline: raw strings -> token ids -> logits -> softmax
# probabilities.
export_layers = [
    vectorize_layer,
    model,
    tf.keras.layers.Activation('softmax'),
]
export_model = tf.keras.Sequential(export_layers)


In [305]:
from tensorflow.keras import layers, losses

# Wrap vectorizer + trained model

In [306]:
# Raw text in, class probabilities out: vectorizer, trained classifier, softmax.
export_model = tf.keras.Sequential(
    [vectorize_layer, model, layers.Activation('softmax')]
)

# The pipeline ends in softmax, so the loss must not treat outputs as logits.
export_loss = losses.SparseCategoricalCrossentropy(from_logits=False)
export_model.compile(
    optimizer="adam",
    loss=export_loss,
    metrics=["accuracy"],
)



In [307]:
def predict_category(text):
    """Print the predicted category and every category's probability for one post.

    Args:
        text: Raw post text (a single string).

    Returns:
        None. The report is printed, not returned.
    """
    input_tensor = tf.constant([text])  # batch containing the single post
    # Softmax output of `export_model`: one probability per entry of
    # `categories`, in the same order.
    probs = export_model.predict(input_tensor)[0]

    predicted_category = categories[int(np.argmax(probs))]

    print(f"Predicted Category: {predicted_category}")
    print("Probabilities:")
    for cat, prob in zip(categories, probs):
        print(f" - {cat}: {prob * 100:.2f}%")


In [308]:
# predict_category prints its own report and returns None, so wrapping the call
# in print() only appended a stray "None" line to the output.
predict_category("Played a great match of cricket with friends in the main ground.")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 248ms/step
Predicted Category: sports
Probabilities:
 - technology: 5.94%
 - sports: 48.68%
 - finance: 1.96%
 - design_arts: 3.49%
 - engineering: 5.95%
 - health_medicine: 2.20%
 - volunteering: 8.93%
 - career_advice: 3.41%
 - entrepreneurship: 2.62%
 - internships_jobs: 9.58%
 - study: 7.25%
None


In [309]:
# predict_category prints its own report and returns None, so wrapping the call
# in print() only appended a stray "None" line to the output.
predict_category("Attended event on Budget planning cost saving as a student")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
Predicted Category: finance
Probabilities:
 - technology: 6.33%
 - sports: 9.29%
 - finance: 25.45%
 - design_arts: 4.65%
 - engineering: 7.38%
 - health_medicine: 2.94%
 - volunteering: 5.47%
 - career_advice: 16.27%
 - entrepreneurship: 10.27%
 - internships_jobs: 4.92%
 - study: 7.04%
None


In [310]:

@tf.function(input_signature=[tf.TensorSpec(shape=[None, sequence_length], dtype=tf.int32)])
def model_wrapper(input):
    """Run the trained classifier on a batch of token ids and return its logits.

    The signature fixes the input to int32 with `sequence_length` ids per row,
    tied to the vectorizer setting rather than a repeated literal. The
    parameter name `input` shadows the builtin but is kept so the traced
    function's signature does not change.
    """
    return model(input)

# concrete_func = model_wrapper.get_concrete_function()

# converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
# tflite_model = converter.convert()

# with open("model_fixed.tflite", "wb") as f:
#     f.write(tflite_model)

Creating the TFLite model for the mobile app

In [311]:
# Keras-side probability model: the trained classifier (whose last layer is a
# Dense emitting one logit per category) followed by softmax.
# NOTE(review): `prob_model` is not what gets converted below. The TFLite file
# is traced from `model_wrapper`, which returns `model(input)`, so despite the
# file name the exported model outputs logits and the consumer has to apply
# softmax itself (the inference check later in this notebook does that).
prob_model = tf.keras.Sequential([
    model,
    tf.keras.layers.Activation('softmax')
])

concrete_func = model_wrapper.get_concrete_function()

# Hand the model to the converter as the trackable object so it keeps a
# reference to the variables the traced function reads.
converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func], model)
tflite_model = converter.convert()

with open("model_with_softmax.tflite", "wb") as f:
    f.write(tflite_model)



'NoneType' object has no attribute 'name'


In [312]:
# Load the exported file and report what its first input tensor expects.
interpreter = tf.lite.Interpreter(model_path="model_with_softmax.tflite")
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
first_input = input_details[0]
print("Input shape:", first_input['shape'])
print("Input dtype:", first_input['dtype'])


Input shape: [ 1 50]
Input dtype: <class 'numpy.int32'>


In [313]:
import tensorflow as tf
import numpy as np

# Reload the converted model in a fresh interpreter.
interpreter = tf.lite.Interpreter(model_path="model_with_softmax.tflite")
interpreter.allocate_tensors()

# Tensor handles of the first input and the first output.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
input_index, output_index = input_details[0]['index'], output_details[0]['index']

# Encode one sentence with the trained vectorizer and cast it to the int32
# dtype the interpreter input reports.
sample_text = "Attended event on Budget planning cost saving as a student"
vec = tf.cast(vectorize_layer(tf.constant([sample_text])), tf.int32)

# Feed the ids, run the graph, and read back the single row of logits.
interpreter.set_tensor(input_index, vec.numpy())
interpreter.invoke()
logits = interpreter.get_tensor(output_index)[0]

# The exported graph returns logits, so softmax is applied here.
probs = tf.nn.softmax(logits).numpy()
best_index = np.argmax(probs)

for cat, prob in zip(categories, probs):
    print(f"{cat}: {prob * 100:.2f}%")

print("Predicted Category:", categories[best_index])



technology: 6.33%
sports: 9.29%
finance: 25.45%
design_arts: 4.65%
engineering: 7.38%
health_medicine: 2.94%
volunteering: 5.47%
career_advice: 16.27%
entrepreneurship: 10.27%
internships_jobs: 4.92%
study: 7.04%
Predicted Category: finance


In [314]:
# Vocabulary learned by the adapted vectorizer, to be shipped next to the model.
vocab = vectorize_layer.get_vocabulary()  # list of tokens, in the order the layer returns them


In [315]:
# Write one token per line, in the order returned by get_vocabulary().
# UTF-8 is set explicitly so non-ASCII tokens are written identically on every
# platform instead of depending on the locale's default encoding.
with open("vocab.txt", "w", encoding="utf-8") as f:
    for token in vocab:
        f.write(token + "\n")


In [316]:
# Write the category names one per line, in label-id order.
with open("labels.txt", "w") as f:
    f.writelines(f"{label}\n" for label in categories)
