In [2]:
import re
import nltk
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer


# Extra tokens to filter out in addition to NLTK's English stop-word list.
extra_stop_words = {"href", "com", "ii", "iii", "ie", "quot"}

# Combined stop-word set consulted by the preprocessing function.
stop_words = {*stopwords.words('english'), *extra_stop_words}

def preprocess_text_for_bert_with_source_removal(text, is_description=False):
    """Clean a raw news string and return it as space-joined tokens.

    Steps, applied in this order:
      1. drop entity-escaped tags (``&lt;...&gt;``) and literal ``<...>`` tags;
      2. lowercase;
      3. drop the source marker: a leading ``word - `` prefix when
         ``is_description`` is truthy, otherwise a trailing ``(...)`` group;
      4. drop digit runs;
      5. drop every character in ``string.punctuation``;
      6. tokenize with ``word_tokenize`` and discard tokens found in the
         module-level ``stop_words`` set.

    Args:
        text: The raw string to clean.
        is_description: Selects which source-removal pattern is used in step 3.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Markup goes first: escaped tags, then literal ones.
    for markup_pattern in (r'&lt;[^&]+&gt;', r'<[^>]*>'):
        text = re.sub(markup_pattern, '', text)

    lowered = text.lower()

    # Descriptions carry the source as a leading "word - " prefix; titles carry it
    # as a trailing parenthesised group.
    source_pattern = r'^\w+\s-\s' if is_description else r'\s\([^)]*\)$'
    without_source = re.sub(source_pattern, '', lowered)

    without_digits = re.sub(r'\d+', '', without_source)

    # Punctuation is deleted outright (not replaced by a space) before tokenizing.
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = without_digits.translate(punctuation_table)

    kept_tokens = filter(lambda token: token not in stop_words, word_tokenize(cleaned))
    return ' '.join(kept_tokens)

def preproces_on_test(text):
    """Clean and Porter-stem a single input string.

    The text is always cleaned with ``is_description=True``, then re-tokenized
    and each token is stemmed.

    Args:
        text: The raw string to prepare.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    cleaned = preprocess_text_for_bert_with_source_removal(text, is_description=True)
    stem = PorterStemmer().stem
    return " ".join(map(stem, word_tokenize(cleaned)))


In [1]:
import os

import joblib

# Directory holding the persisted TF-IDF vectorizer and the classifiers.
# Set the ELE513_MODEL_DIR environment variable to point at another copy; when it
# is unset the original location is used, so existing runs behave as before.
path = os.environ.get(
    "ELE513_MODEL_DIR",
    "/Users/basakdemirok/Desktop/2025 GUZ/ELE 513 ANN/PROJE/ELE513/model_saved/",
)


def _load_artifact(filename):
    """Load one joblib file from `path`; the directory may or may not end in a slash."""
    # NOTE(review): joblib.load unpickles arbitrary objects -- only load trusted files.
    return joblib.load(os.path.join(path, filename))


# Load the models
xgb_model_tf_idf = _load_artifact('xgb_model_tf_idf.joblib')
rf_model_tfidf = _load_artifact('rf_model_tf_idf.joblib')
knn_model_tfidf = _load_artifact('knn_model_tf_idf.joblib')
vectorizer = _load_artifact('tf_idf_vectorizer_fitted.joblib')


svm_model_tfidf = _load_artifact('svm_model_tf_idf.joblib')
mnb_model_tfidf = _load_artifact('mnb_model_tf_idf.joblib')
lr_model_tfidf = _load_artifact('logistic_model_tf_idf.joblib')

In [3]:
# Hand-written sample sentences for spot-checking the loaded classifiers.
# The sentences are kept verbatim (including the "wim" spelling) so the recorded
# predictions stay reproducible.
test_text = "this news is about the world. In the World War 2, Germany did not wim the war."
test_text_sport = "Latest sport news includes American football, basketball, tennis, and soccer."
test_text_business = "The CEO of the XYZ company has just announced that they prepared a new product for the market. Also, CFO of the company will be replaced by the new CFO"
test_text_science = "The latest scientific news includes the discovery of a new element, the study of the effects of climate change, and the development of a new vaccine."

# Backward-compatible alias: the sample was originally named `test_test_business`,
# and other cells still refer to it by that name.
test_test_business = test_text_business


In [4]:
# Run every sample sentence through the same cleaning + stemming pipeline.
(
    processed_text,
    processed_text_sport,
    processed_text_business,
    processed_text_science,
) = map(
    preproces_on_test,
    (test_text, test_text_sport, test_test_business, test_text_science),
)

In [5]:
# Turn each preprocessed sentence into a one-row feature matrix with the loaded vectorizer.
vectorized, vectorized_sport, vectorized_business, vectorized_science = (
    vectorizer.transform([document])
    for document in (
        processed_text,
        processed_text_sport,
        processed_text_business,
        processed_text_science,
    )
)

# Dense view of the first sample's features, shown as the cell output.
vectorized.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [6]:

# Print the logistic-regression prediction for each sample, in the order the samples were defined.
for sample_features in (vectorized, vectorized_sport, vectorized_business, vectorized_science):
    print(lr_model_tfidf.predict(sample_features))


[0]
[1]
[3]
[3]
