In [9]:
import json
import pickle
import re

# If you saved your model and vectorizer via joblib or pickle:
import joblib
import nltk
import numpy as np
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from scipy.sparse import hstack

In [10]:
# Load the persisted artifacts. Both paths are relative, so they resolve against
# the kernel's current working directory.
# NOTE(review): joblib.load unpickles arbitrary Python objects and can execute
# code while doing so -- only load files from a trusted source.
# Presumably the fitted TF-IDF vectorizer and the trained XGBoost classifier
# (inferred from the file names; confirm against the training notebook).
vectorizer = joblib.load('tfidf_vectorizer.pkl')
xgb_model = joblib.load('xgb_model.pkl')

In [11]:
def remove_patterns(text):
    """Delete noise from ``text`` and trim surrounding whitespace.

    The substitutions run in the order listed below; every match is replaced
    by the empty string (not by a space), so removing a token that sits
    between two spaces leaves both spaces behind.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string with leading and trailing whitespace stripped.
    """
    # NOTE(review): the URL pattern runs first and `\S+` also consumes a
    # trailing ")", so "[label](http://...)" never reaches the markdown-link
    # pattern intact -- the label survives. Confirm this matches the training
    # pipeline before changing the order.
    noise_patterns = (
        r'http[s]?://\S+',    # http/https URLs, up to the next whitespace
        r'\[.*?\]\(.*?\)',    # markdown links: [label](target)
        r'@\w+',              # @mentions
        r'[^a-zA-Z0-9\s]',    # anything that is not an ASCII letter, digit or whitespace
    )
    for pattern in noise_patterns:
        text = re.sub(pattern, '', text)
    return text.strip()

# Module-level Porter stemmer instance; stem_tokens() reads this global.
stemmer = PorterStemmer()

In [12]:
def stem_tokens(tokens):
    """Stem every token and return them joined into one space-separated string.

    Each token is converted with ``str()`` before being passed to the
    module-level ``stemmer``, so non-string tokens are accepted.

    Args:
        tokens: An iterable of tokens.

    Returns:
        A single string of stemmed tokens separated by single spaces
        (the empty string when ``tokens`` is empty).
    """
    stems = [stemmer.stem(str(tok)) for tok in tokens]
    return " ".join(stems)

# ------------------------------------------------------------------------------
# 3) Create a function that preprocesses *one* new text string
# ------------------------------------------------------------------------------
def preprocess_new_text(raw_text):
    """Turn one raw text string into the inputs needed for feature building.

    Pipeline: lowercase -> remove_patterns -> word_tokenize -> stem_tokens.
    The two numeric features are both measured on the cleaned text, not on
    the raw input.

    Args:
        raw_text: The text to preprocess (must support ``.lower()``).

    Returns:
        A tuple ``(stemmed_str, num_chars, num_sents)`` where
        ``stemmed_str`` is the space-joined stemmed tokens,
        ``num_chars`` is the length of the cleaned text, and
        ``num_sents`` is the number of sentences ``sent_tokenize`` finds in
        the cleaned text.
    """
    cleaned = remove_patterns(raw_text.lower())

    # Text feature: tokenize the cleaned text, then stem and re-join.
    stemmed = stem_tokens(word_tokenize(cleaned))

    # NOTE(review): sentences are counted *after* remove_patterns has stripped
    # all punctuation, so the sentence tokenizer has no terminators to split
    # on. Left as-is because the features must match the training pipeline --
    # confirm that training counted sentences the same way.
    sentence_count = len(sent_tokenize(cleaned))

    return stemmed, len(cleaned), sentence_count

In [15]:
# Example input to classify; the surrounding newlines are stripped during
# preprocessing.
new_text = """
I hate myself.
"""

stemmed_str, num_chars, num_sents = preprocess_new_text(new_text)

# Use your fitted TF–IDF vectorizer (transform only -- never re-fit at inference time)
new_text_tfidf = vectorizer.transform([stemmed_str])

# Combine TF–IDF features with numeric features (must match what was done in training):
# the TF–IDF columns come first, followed by [num_chars, num_sents].
new_text_num = np.array([[num_chars, num_sents]])  # shape (1,2)

# scipy's hstack returns COO by default, which cannot be indexed or sliced;
# ask for CSR so the combined row can be inspected downstream. The stored
# values and column order are unchanged.
new_text_combined = hstack([new_text_tfidf, new_text_num], format="csr")

In [16]:
# Run the model on the combined feature matrix built for the single input text
# and report the first (only) predicted entry.
# NOTE(review): the printed value is whatever label the model emits; mapping it
# back to a human-readable class depends on the label encoding used during
# training -- confirm against the training notebook.
prediction = xgb_model.predict(new_text_combined)
print("Predicted class:", prediction[0])

Predicted class: 6
