In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import nltk
# Fetch the NLTK 'punkt' data package (presumably the tokenizer data that
# word_tokenize relies on; confirm for your NLTK version).
# This call runs on every execution; NLTK only reports "already up-to-date"
# when the package is already present locally.
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prath\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load your dataset into a DataFrame
# NOTE(review): this is an absolute, machine-specific path; edit DATA_PATH
# to point at your own copy of the CSV before running.
DATA_PATH = "D:/SIH/shopmania.csv"
# The first CSV column is a saved row index; without index_col it is loaded
# as a stray "Unnamed: 0" data column, so read it as the index instead.
df = pd.read_csv(DATA_PATH, index_col=0)

In [3]:
# Preview the loaded frame; pandas elides the middle rows of a large frame
# in the rendered output.
df

Unnamed: 0,Description,Category
0,boston red sox for men by boston red sox eau d...,Collectibles
1,twilight central park print,Collectibles
2,fox print,Collectibles
3,circulo de papel wall art,Collectibles
4,hidden path print,Collectibles
...,...,...
313701,swimming full face anti fog mask surface divin...,Water Sports
313702,deago anti fog swimming diving full face mask ...,Water Sports
313703,etc buys full face gopro compatible snorkel sc...,Water Sports
313704,men 039 s full face breathe free diving snorke...,Water Sports


In [6]:
# Tokenize the descriptions
# Each description is lower-cased, split with word_tokenize and re-joined with
# single spaces so the TF-IDF step sees normalized, space-separated tokens.
# Missing values are replaced with '' and everything is coerced to str first:
# calling .lower() on a NaN (float) cell would otherwise raise AttributeError.
df['Description'] = (
    df['Description']
    .fillna('')
    .astype(str)
    .apply(lambda text: ' '.join(word_tokenize(text.lower())))
)

In [7]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['Description'],
    df['Category'],
    test_size=0.2,
    random_state=42,
)

In [8]:
# Fit the TF-IDF vectorizer on the training text only, then reuse the fitted
# vectorizer to transform the test text.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # max_features can be tuned as needed
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

In [9]:
# Logistic regression over the TF-IDF features; max_iter is set to 1000
# iterations for the solver.
classifier = LogisticRegression(max_iter=1000)
# Left as the cell's final expression so the fitted estimator is displayed.
classifier.fit(X=X_train_tfidf, y=y_train)

LogisticRegression(max_iter=1000)

In [10]:
# Make predictions on the test set
y_pred = classifier.predict(X_test_tfidf)

# Calculate accuracy and print the classification report
accuracy = accuracy_score(y_test, y_pred)
# Precision/recall/F1 can be undefined for a category (a zero denominator).
# zero_division=0 reports those scores as 0.0 explicitly instead of emitting
# an undefined-metric warning.
classification_rep = classification_report(y_test, y_pred, zero_division=0)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.901086991170189
Classification Report:
                                           precision    recall  f1-score   support

                 3D Multimedia Equipment       0.00      0.00      0.00         1
                                   Adult       0.92      0.96      0.94      2063
           Air Compressors & Accessories       1.00      1.00      1.00         3
                        Air Conditioners       0.00      0.00      0.00         7
                        Alcoholic Drinks       0.00      0.00      0.00         4
                             All-In-Ones       0.00      0.00      0.00         4
                    Audio / DJ Equipment       0.80      0.50      0.62        64
                 Audio / Video Equipment       0.00      0.00      0.00         1
                        Auto Accessories       1.00      0.20      0.33         5
                 Baby & Children Apparel       0.70      0.53      0.60       413
                               Baby gear     

  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# Function to classify user-provided input
def classify_product_description(input_text):
    """Predict the product category for one free-text description.

    The text is lower-cased, tokenized with ``word_tokenize`` and re-joined
    with single spaces, then vectorized with the notebook-level
    ``tfidf_vectorizer`` and scored by the notebook-level ``classifier``.

    Args:
        input_text: The description to classify; must support ``.lower()``.

    Returns:
        The first (and only) element of ``classifier.predict``'s output.
    """
    tokens = word_tokenize(input_text.lower())
    features = tfidf_vectorizer.transform([' '.join(tokens)])
    return classifier.predict(features)[0]

# Example usage: classify one hand-written sentence and print the predicted label
user_input = "Nike shoes are nice"
predicted_category = classify_product_description(input_text=user_input)
print(f"Predicted Category: {predicted_category}")

Predicted Category: Men Footwear


In [23]:
import pickle

In [24]:
# Persist the fitted classifier and its vectorizer together as one tuple so
# they are always saved (and later loaded) as a pair.
model_bundle = (classifier, tfidf_vectorizer)
with open('product_classifier.pkl', 'wb') as pkl_out:
    pickle.dump(model_bundle, pkl_out)

### Combining product classification with sentiment analysis

In [3]:
from nltk.tokenize import word_tokenize
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from transformers import pipeline
import pickle

In [4]:
# Restore the (classifier, vectorizer) tuple from disk.
# NOTE: pickle.load can execute arbitrary code; only load a file you trust.
with open('product_classifier.pkl', 'rb') as pkl_in:
    loaded_bundle = pickle.load(pkl_in)
classifier, tfidf_vectorizer = loaded_bundle

# Function to classify user-provided input
def classify_product_description(input_text):
    """Predict the product category for one free-text description.

    The text is lower-cased, tokenized with ``word_tokenize`` and re-joined
    with single spaces, then vectorized with the loaded ``tfidf_vectorizer``
    and scored by the loaded ``classifier``.

    Args:
        input_text: The description to classify; must support ``.lower()``.

    Returns:
        The first (and only) element of ``classifier.predict``'s output.
    """
    tokens = word_tokenize(input_text.lower())
    features = tfidf_vectorizer.transform([' '.join(tokens)])
    return classifier.predict(features)[0]

In [5]:
# Pinned to the checkpoint and revision the library reports it falls back to
# when no model is given, so results do not depend on that implicit default.
_SENTIMENT_MODEL = "distilbert-base-uncased-finetuned-sst-2-english"
_SENTIMENT_REVISION = "af0f99b"
_sentiment_task = None  # built lazily on first use, then reused


def _get_sentiment_task():
    """Return the shared sentiment-analysis pipeline, building it on first use."""
    global _sentiment_task
    if _sentiment_task is None:
        _sentiment_task = pipeline(
            "sentiment-analysis",
            model=_SENTIMENT_MODEL,
            revision=_SENTIMENT_REVISION,
        )
    return _sentiment_task


def combined_pipeline(input_text):
    """Classify a comment's product category and its sentiment.

    Args:
        input_text: The raw comment text. It is passed unchanged to both the
            product classifier and the sentiment pipeline.

    Returns:
        A dict with two keys: "Product Category" (the value returned by
        ``classify_product_description``) and "Sentiment" (the ``label`` of
        the first result returned by the sentiment pipeline).
    """
    # Classify the product category
    product_category = classify_product_description(input_text)

    # Perform sentiment analysis; the pipeline is created once and reused
    # rather than being rebuilt on every call.
    sentiment_result = _get_sentiment_task()(input_text)

    return {
        "Product Category": product_category,
        "Sentiment": sentiment_result[0]['label']
    }

In [25]:
# Run the combined classifier + sentiment pipeline on one sample comment
comment = combined_pipeline(input_text="Apple Iphone is the best")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [26]:
# Display the dict returned by combined_pipeline for the sample comment
comment

{'Product Category': 'Cell Phones Accessories', 'Sentiment': 'POSITIVE'}