In [21]:
import numpy as np
import pandas as pd
import re
import string
import pickle
import joblib

In [22]:
# Helper that strips punctuation characters from a string
def remove_punctuations(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    Only the ASCII punctuation characters listed in ``string.punctuation``
    are stripped; all other characters (letters, digits, whitespace,
    non-ASCII symbols) are left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string without any ``string.punctuation`` characters.
    """
    # Deletion table: maps each punctuation character to "remove".
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)

In [23]:
# Load the English stopword list (one stopword per line) into `sw`.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale encoding (e.g. cp1252 on Windows).
# NOTE(review): the path follows the NLTK corpus layout, which is read as
# UTF-8 by NLTK itself — confirm the file was not re-saved in another encoding.
with open('../static/model/corpora/stopwords/english', 'r', encoding='utf-8') as stopword_file:
    sw = stopword_file.read().splitlines()

In [None]:
# Load the saved TF-IDF vectorizer from disk; it is used later to turn
# preprocessed text into numeric features via `tfidf.transform(...)`.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artifacts that come from a trusted source.
tfidf = joblib.load("../static/model/tfidf.pkl") 

In [25]:
# Load the saved LabelEncoder; its `inverse_transform` is used later to map
# predicted IDs back to their category labels.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artifacts that come from a trusted source.
label_encoder = joblib.load("../static/model/label_encoder.pkl")

In [26]:
# Deserialize the trained ML model from its pickle file into `model`.
# NOTE: unpickling can execute arbitrary code, so this file must come from a
# trusted source.
with open('../static/model/model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [27]:
# Text preprocessing function
def preprocessing(text):
    """Clean a single ad text and return it as a one-element pandas Series.

    The cleaning steps run in this order:
      1. lowercase every word and collapse runs of whitespace,
      2. remove URL-like tokens (``http...`` / ``www...``),
      3. remove punctuation via ``remove_punctuations``,
      4. remove digit runs,
      5. drop every whitespace-separated token found in the stopword list ``sw``.

    Because punctuation is stripped before the stopword filter runs, any
    entry of ``sw`` that itself contains punctuation can no longer match.

    Relies on ``remove_punctuations`` and ``sw`` being defined earlier in the
    notebook.

    Args:
        text: The raw ad text.

    Returns:
        A pandas Series named ``offering_ad`` holding the cleaned text.
    """

    def _lowercase_and_squeeze(raw):
        # split() with no argument also discards leading/trailing whitespace.
        return " ".join(token.lower() for token in str(raw).split())

    def _strip_links(raw):
        return re.sub(r"http\S+|www\S+", "", str(raw))

    def _drop_stopwords(raw):
        return " ".join(token for token in raw.split() if token not in sw)

    # Wrap the input in a single-row frame and work on its only column.
    ads = pd.DataFrame([text], columns=['offering_ad'])["offering_ad"]

    ads = ads.apply(_lowercase_and_squeeze)
    ads = ads.apply(_strip_links)
    ads = ads.apply(remove_punctuations)
    ads = ads.str.replace(r"\d+", "", regex=True)
    # Re-splitting here also tidies up gaps left by the removals above.
    ads = ads.apply(_drop_stopwords)

    return ads

In [49]:
# Sample ad text (mixed English and Sinhala) used to exercise the full
# inference pipeline end to end.
txt = 'toyota aqua ටොයොටා ඇක්වා 2015 මෝටර් රථයක් විකිණීමට ඇත. හොඳ තත්ත්වයේ පවතින අතර කිලෝමීටර් 120000 ධාවනය කර ඇත. '
# `preprocessing` returns a one-element pandas Series holding the cleaned text.
p_text = preprocessing(txt)

# Convert the preprocessed text into TF-IDF numeric features using the
# vectorizer loaded earlier in the notebook.
vectorized_text = tfidf.transform(p_text)

# Predict the category ID for the input text and show the raw (encoded) result.
prediction = model.predict(vectorized_text)
print(prediction)

# Convert the predicted ID back to the original category name; the single
# input document yields one prediction, hence index 0.
decoded_prediction = label_encoder.inverse_transform(prediction)
print("Predicted Category:", decoded_prediction[0])

[16]
Predicted Category: vehicle_car
