Data Preprocessing and Model Training

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install pandas scikit-learn nltk fastapi uvicorn joblib




Importing Libraries and Downloading NLTK Resources

In [None]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import logging

# Download the NLTK stopword corpus, then keep the English list as a set so the
# membership checks in preprocess_text are fast.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Configure logging at INFO level so the logging.info() calls below are shown
logging.basicConfig(level=logging.INFO)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Load the Dataset

In [None]:
# Load the raw CSV. header=None tells pandas the file has no header row, so the
# two column names are supplied explicitly.
data_path = "/content/ecommerceDataset.csv"
df = pd.read_csv(data_path, names=['category', 'text'], header=None)

# Log a short summary of what was loaded: shape first, then the first rows.
for summary_line in (f"Dataset shape: {df.shape}", "Sample data:", df.head()):
    logging.info(summary_line)


Preprocess the Text

In [None]:
def preprocess_text(text):
    """Normalize one raw text value for TF-IDF vectorization.

    Steps: lowercase, strip ASCII punctuation (``string.punctuation``), then
    drop every token found in the module-level ``stop_words`` set.

    Args:
        text: The raw value from the dataset. Anything that is not a ``str``
            (``None``, ``NaN``, numbers, lists, ...) is treated as missing.

    Returns:
        The cleaned text as a single space-separated string, or ``""`` when
        the input is not a string.
    """
    # A plain isinstance check already covers None/NaN (neither is a str) and,
    # unlike `pd.isnull(text) or ...`, does not raise on list/array input.
    # It is also the same guard used by the copies in main.py and app.py, so
    # training and serving preprocess text identically.
    if not isinstance(text, str):
        return ""
    # str.translate removes all punctuation in a single pass.
    punctuation_table = str.maketrans("", "", string.punctuation)
    cleaned = text.lower().translate(punctuation_table)
    return " ".join(word for word in cleaned.split() if word not in stop_words)


In [None]:
# Clean every raw description once and store the result next to the original text.
df['clean_text'] = [preprocess_text(raw_text) for raw_text in df['text']]


Splitting the Data into Training and Testing Sets

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
features, labels = df['clean_text'], df['category']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Report how many samples ended up on each side of the split.
logging.info(f"Training set size: {len(X_train)}")
logging.info(f"Test set size: {len(X_test)}")


Transforming Text Data Using TF-IDF

In [None]:
# Build the TF-IDF vocabulary, capped at 5000 features.
vectorizer = TfidfVectorizer(max_features=5000)
# Fit on the training texts only; the test texts are merely transformed with the
# already-fitted vectorizer, so nothing from the test set influences the features.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


Training the Logistic Regression Model

In [None]:
# Train a logistic-regression classifier on the TF-IDF features.
# max_iter is raised to 1000 to give the solver more iterations to converge.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)


Evaluate the Model

In [None]:
# Score the held-out set: overall accuracy first, then per-class metrics.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)

print(f"Test Accuracy: {accuracy:.4f}")
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")


Test Accuracy: 0.9604
Classification Report:
                        precision    recall  f1-score   support

                 Books       0.97      0.95      0.96      2387
Clothing & Accessories       0.98      0.97      0.97      1744
           Electronics       0.96      0.94      0.95      2067
             Household       0.95      0.97      0.96      3887

              accuracy                           0.96     10085
             macro avg       0.96      0.96      0.96     10085
          weighted avg       0.96      0.96      0.96     10085



Saving the Model and Vectorizer (For Later Use)

In [None]:
import joblib


# Persist the fitted vectorizer and the trained model so main.py and app.py can
# load them later; the vectorizer is written first, then the model.
for artifact, filename in (
    (vectorizer, 'tfidf_vectorizer.pkl'),
    (model, 'logistic_model.pkl'),
):
    joblib.dump(artifact, filename)

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!


Creating the main.py File


In [None]:
%%writefile main.py
import joblib
import string
import nltk
from nltk.corpus import stopwords


# Make sure the stopword corpus is available before it is read below.
nltk.download('stopwords')

# Load the English stopwords into a set for fast membership tests.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """
    Clean one inquiry: lowercase it, strip ASCII punctuation, and drop stopwords.

    Returns the remaining words joined by single spaces, or an empty string when
    the input is not a string.
    """
    if not isinstance(text, str):
        return ""
    # One translation table removes every character in string.punctuation.
    punctuation_stripper = str.maketrans("", "", string.punctuation)
    normalized = text.lower().translate(punctuation_stripper)
    kept_tokens = filter(lambda token: token not in stop_words, normalized.split())
    return " ".join(kept_tokens)

def load_model_and_vectorizer():
    """
    Read the saved TF-IDF vectorizer and Logistic Regression model from disk.

    Both files are looked up by relative path. Returns a
    (vectorizer, model) tuple, in that order.
    """
    # NOTE: joblib.load unpickles the files, so only load artifacts you trust.
    artifact_paths = ('tfidf_vectorizer.pkl', 'logistic_model.pkl')
    vectorizer, model = (joblib.load(path) for path in artifact_paths)
    return vectorizer, model

def predict_category(text, vectorizer, model):
    """
    Classify a single piece of text.

    The text is cleaned with preprocess_text, turned into features by
    ``vectorizer.transform`` (as a one-element batch), and passed to
    ``model.predict``. Returns the first, and only, predicted label.
    """
    single_item_batch = [preprocess_text(text)]
    features = vectorizer.transform(single_item_batch)
    return model.predict(features)[0]

if __name__ == '__main__':
    # Load the artifacts first so a missing file fails before the user is prompted.
    vectorizer, model = load_model_and_vectorizer()

    # Read one inquiry from stdin, classify it, and print the predicted label.
    user_input = input("Enter your inquiry: ")
    print(f"Predicted Category: {predict_category(user_input, vectorizer, model)}")


Overwriting main.py


In [None]:
# Run the CLI script; it prompts for an inquiry on stdin and prints the predicted category.
!python main.py



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Enter your inquiry: I need a new laptop for programming
Predicted Category: Electronics


FastAPI Endpoint Setup

In [None]:
%%writefile app.py
from fastapi import FastAPI
from pydantic import BaseModel
import joblib
import string
import nltk

# Fetch the NLTK stopword corpus only when it is not already installed, so that
# starting the API does not hit the network on every (re)load of this module.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
from nltk.corpus import stopwords

# Keep the English stopwords in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Lowercase the text, remove ASCII punctuation, and drop stopwords.

    Non-string input yields an empty string; otherwise the surviving words are
    returned joined by single spaces.
    """
    if not isinstance(text, str):
        return ""
    kept_chars = [char for char in text.lower() if char not in string.punctuation]
    tokens = "".join(kept_chars).split()
    return " ".join(token for token in tokens if token not in stop_words)

# Load the saved TF-IDF vectorizer and Logistic Regression model once, at import time.
# Both paths are relative, so they are resolved against the server's working directory.
vectorizer = joblib.load('tfidf_vectorizer.pkl')
model = joblib.load('logistic_model.pkl')

def predict_category(text):
    """Return the predicted label for one inquiry.

    Uses the module-level ``vectorizer`` and ``model``: the cleaned text is
    vectorized as a one-element batch and the first prediction is returned.
    """
    single_item_batch = [preprocess_text(text)]
    features = vectorizer.transform(single_item_batch)
    return model.predict(features)[0]

# Create the FastAPI app
app = FastAPI(title="Text Classification API")

# Request body for /predict, defined with Pydantic: a JSON object with a single
# string field, "inquiry".
class TextData(BaseModel):
    inquiry: str

# POST /predict: classify the inquiry carried in the request body.
@app.post("/predict")
def get_prediction(data: TextData):
    # The response is a one-key JSON object holding the predicted label.
    return {"predicted_category": predict_category(data.inquiry)}




Overwriting app.py


Installing ngrok (via pyngrok)

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install pyngrok




Start the FastAPI Server on a Local Port

In [None]:
# Stop any server left over from an earlier run of this cell so port 8000 is free.
# The [u] in the pattern keeps pkill from matching the shell that runs this command.
!pkill -f "[u]vicorn app:app"
# Start the API in the background. Output goes to uvicorn.log instead of being
# discarded, so startup failures (e.g. a missing .pkl file) can be inspected.
# --reload is dropped: re-running this cell is what restarts the server.
!nohup uvicorn app:app --host 127.0.0.1 --port 8000 > uvicorn.log 2>&1 &



In [None]:
# List the PIDs of running uvicorn processes to confirm the server is up.
!pgrep uvicorn


17522
50101


Set the ngrok Auth Token in Colab

In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# Never hard-code the ngrok token in the notebook: it is a credential, and it
# ends up in every shared or committed copy of the file. Read it from the
# NGROK_AUTHTOKEN environment variable, or prompt for it without echoing.
# NOTE: the token that was previously committed in this cell should be revoked
# in the ngrok dashboard and replaced with a new one.
ngrok_auth_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok auth token: ")
ngrok.set_auth_token(ngrok_auth_token)




Restart the ngrok Tunnel

In [None]:
from pyngrok import ngrok

# Open a tunnel to local port 8000 (the port the uvicorn server was started on)
# and print the tunnel that was created. The protocol is passed explicitly as "http".
public_url = ngrok.connect(8000, "http")
print("Public URL:", public_url)


Public URL: NgrokTunnel: "https://6877-34-125-182-153.ngrok-free.app" -> "http://localhost:8000"


In [59]:
# Snapshot every package installed in the current environment into requirements.txt.
!pip freeze > requirements.txt


{"inquiry": "I need a new laptop for programming"}
{"inquiry": "I need a vacuum cleaner for home cleaning"}
{"inquiry": "Suggest a good mystery novel"}

