In [1]:
import pandas as pd

# Load the review dataset into a DataFrame.
file_path = "data.csv"  # relative path: resolved against the kernel's working directory
data = pd.read_csv(file_path)

# Preview the first rows; as the cell's last expression this is rendered as its output.
data.head()

Unnamed: 0,productId,Title,userId,Time,Text,Cat1,Cat2,Cat3
0,B0002AQK70,PetSafe Staywell Pet Door with Clear Hard Flap,A2L6QTQQI13LZG,1344211200,We've only had it installed about 2 weeks. So ...,pet supplies,cats,cat flaps
1,B0002DK8OI,"Kaytee Timothy Cubes, 1-Pound",A2HJUOZ9R9K4F,1344211200,My bunny had a hard time eating this because t...,pet supplies,bunny rabbit central,food
2,B0006VJ6TO,Body Back Buddy,A14PK96LL78NN3,1344211200,would never in a million years have guessed th...,health personal care,health care,massage relaxation
3,B000EZSFXA,SnackMasters California Style Turkey Jerky,A2UW73HU9UMOTY,1344211200,"Being the jerky fanatic I am, snackmasters han...",grocery gourmet food,snack food,jerky dried meats
4,B000KV61FC,Premier Busy Buddy Tug-a-Jug Treat Dispensing ...,A1Q99RNV0TKW8R,1344211200,Wondered how quick my dog would catch on to th...,pet supplies,dogs,toys


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Sentence-embedding model used to vectorize the review text
model = SentenceTransformer("all-MiniLM-L6-v2")

# Features: one embedding per entry of the "Text" column.
# Target: the top-level category column "Cat1".
review_texts = data["Text"].to_list()
X = model.encode(review_texts)
y = data["Cat1"]

print("Sentences Vectorized")


Sentences Vectorized


In [4]:
# Hold out 20% of the embedded reviews for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the top-level (Cat1) random forest on the training portion;
# fit() returns the estimator itself, so the fitted model is bound directly.
rf_model = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)

# Score the held-out portion
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

Accuracy: 0.784

Classification Report:
                       precision    recall  f1-score   support

       baby products       1.00      0.27      0.42       124
              beauty       0.85      0.80      0.83       425
grocery gourmet food       0.85      0.54      0.66       167
health personal care       0.69      0.88      0.77       624
        pet supplies       0.88      0.83      0.85       303
          toys games       0.80      0.85      0.83       357

            accuracy                           0.78      2000
           macro avg       0.85      0.69      0.73      2000
        weighted avg       0.81      0.78      0.77      2000



In [5]:
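# Hyperparameter tuning (disabled): sweep n_estimators and plot accuracy for each value.
# NOTE: the sweep below scores on X_test/y_test, so choosing n_estimators from it leaks
# the test split; use a validation split or cross-validation before re-enabling.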
# scores = []
# estimators_range = range(100, 300,10)
# for k in estimators_range:
#     rfc = RandomForestClassifier(n_estimators=k,random_state=42)
#     rfc.fit(X_train, y_train)
#     y_pred = rfc.predict(X_test)
#     scores.append(accuracy_score(y_test, y_pred))

# import matplotlib.pyplot as plt
# %matplotlib inline
# # plt.plot(x_axis, y_axis)
# plt.plot(estimators_range, scores)
# plt.xlabel('Value of n_estimators for Random Forest Classifier')
# plt.ylabel('Testing Accuracy')


In [6]:
# Save model: persist the fitted objects to disk with joblib.
import joblib
# Parent (Cat1) random forest classifier.
joblib.dump(rf_model, "random_forest_model.pkl")
# NOTE: despite the file name, this pickles `model`, the SentenceTransformer encoder.
# As the cell's last expression, the return value of dump() (the list of written
# file names) is what the notebook shows as this cell's output.
joblib.dump(model, "vectorizer.pkl")

['vectorizer.pkl']

In [7]:
# Attach the already-computed embeddings in X to `data` as an "Embeddings" column.
# list(X) splits X along its first axis, giving one entry per row of `data`, so the
# per-category filters in later cells can select features together with labels.
# NOTE: this mutates `data` in place.
data["Embeddings"] = list(X)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# Train one Cat2 classifier per Cat1 value; the dict is keyed by that Cat1 value
child_classifiers = {}

for cat1_class in data["Cat1"].unique():
    # Rows that belong to this top-level category
    filtered_data = data.loc[data["Cat1"] == cat1_class]
    X_child = filtered_data["Embeddings"].to_list()
    y_child = filtered_data["Cat2"]

    # Nothing to learn unless at least two distinct Cat2 labels are present
    if len(y_child.unique()) < 2:
        print(f"Skipping {cat1_class} due to lack of unique classes in Cat2.")
        continue

    # 80/20 hold-out split within this category (fixed seed)
    X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
        X_child,
        y_child,
        test_size=0.2,
        random_state=42,
    )

    # Standardize the embedding features, then fit a logistic regression on them;
    # fit() returns the pipeline itself.
    scaler_step = ("scaler", StandardScaler())
    logreg_step = ("logreg", LogisticRegression(random_state=42, max_iter=1000))
    child_pipeline = Pipeline([scaler_step, logreg_step]).fit(X_train_c, y_train_c)

    # Report hold-out performance for this child classifier
    y_pred_c = child_pipeline.predict(X_test_c)
    child_accuracy = accuracy_score(y_test_c, y_pred_c)
    child_report = classification_report(y_test_c, y_pred_c)
    print(f"\nChild Classifier for Cat1={cat1_class} Accuracy:", child_accuracy)
    print(f"\nChild Classification Report for Cat1={cat1_class}:\n", child_report)

    # Register the fitted pipeline under its Cat1 value
    child_classifiers[cat1_class] = child_pipeline



Child Classifier for Cat1=pet supplies Accuracy: 0.7563291139240507

Child Classification Report for Cat1=pet supplies:
                       precision    recall  f1-score   support

               birds       0.67      0.67      0.67         6
bunny rabbit central       0.50      0.33      0.40         3
                cats       0.62      0.72      0.67        90
                dogs       0.82      0.81      0.82       183
   fish aquatic pets       0.87      0.62      0.73        32
       small animals       0.00      0.00      0.00         2

            accuracy                           0.76       316
           macro avg       0.58      0.53      0.55       316
        weighted avg       0.76      0.76      0.76       316


Child Classifier for Cat1=health personal care Accuracy: 0.656093489148581

Child Classification Report for Cat1=health personal care:
                             precision    recall  f1-score   support

           baby child care       0.60      0.50  

In [9]:
# Create child classifiers for each unique class in Cat2.
# Each classifier predicts Cat3 from the embedding and is stored under its Cat2 value.
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.dummy import DummyClassifier

child_classifiers_level_2 = {}

# NOTE(review): classifiers are keyed by the Cat2 label alone. If the same Cat2 label
# occurs under more than one Cat1 value, those rows are pooled into one classifier
# here -- confirm that is intended.
for cat2_class in data["Cat2"].unique():
    # Filter data for the current Cat2 class
    filtered_data = data[data["Cat2"] == cat2_class]
    X_child = list(filtered_data["Embeddings"])
    y_child = filtered_data["Cat3"]

    # Skip if no data or only one class in Cat3
    if len(set(y_child)) <= 1:
        print(f"Skipping {cat2_class} due to lack of unique classes in Cat3.")
        continue

    # Stratified 90/10 split. train_test_split raises ValueError when it cannot
    # stratify (e.g. a Cat3 label with too few samples); in that case fall back to
    # a majority-class predictor fitted on all rows of this Cat2 group.
    try:
        X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
            X_child, y_child, test_size=0.1, random_state=42, stratify=y_child
        )
    except ValueError as e:
        # Report the Cat2 group being processed in this loop (the original message
        # printed the stale `cat1_class` left over from the previous cell's loop).
        print(f"Error splitting data for Cat2={cat2_class}: {e}, creating dummy classifier")
        dummy_classifier = DummyClassifier(strategy="most_frequent")
        dummy_classifier.fit(X_child, y_child)
        child_classifiers_level_2[cat2_class] = dummy_classifier
        continue

    # Create a pipeline with standardization and logistic regression
    child_pipeline = Pipeline([
        ("scaler", StandardScaler()),  # Standardize features
        ("logreg", LogisticRegression(random_state=42, max_iter=1000))  # Logistic Regression model
    ])

    # Train the child classifier
    child_pipeline.fit(X_train_c, y_train_c)

    # Evaluate the child classifier on the held-out 10%
    y_pred_c = child_pipeline.predict(X_test_c)
    print(f"\nChild Classifier for Cat2={cat2_class} Accuracy:", accuracy_score(y_test_c, y_pred_c))
    print(f"\nChild Classification Report for Cat2={cat2_class}:\n", classification_report(y_test_c, y_pred_c))

    # Save the child classifier
    child_classifiers_level_2[cat2_class] = child_pipeline


Child Classifier for Cat2=cats Accuracy: 0.6458333333333334

Child Classification Report for Cat2=cats:
                            precision    recall  f1-score   support

           beds furniture       0.50      0.43      0.46         7
       carriers strollers       0.00      0.00      0.00         1
                cat flaps       1.00      1.00      1.00         2
   educational repellents       0.00      0.00      0.00         1
feeding watering supplies       0.57      0.80      0.67         5
                     food       0.00      0.00      0.00         2
                 grooming       0.44      0.67      0.53         6
          health supplies       0.71      0.62      0.67         8
     litter housebreaking       0.88      0.88      0.88         8
                     toys       0.86      0.86      0.86         7
                   treats       0.00      0.00      0.00         1

                 accuracy                           0.65        48
                macro

In [10]:

def predict_child_labels(frame, parent_col, classifiers, default="Unknown"):
    """Predict the next hierarchy level for every row of ``frame``.

    Rows are grouped by their value in ``parent_col``; each group is sent to the
    classifier registered for that value in one batched ``predict`` call instead
    of one call per row.

    Parameters
    ----------
    frame : DataFrame with an "Embeddings" column and the ``parent_col`` column.
    parent_col : name of the column holding the already-predicted parent label.
    classifiers : dict mapping a parent label to a fitted classifier.
    default : label used for rows whose parent label has no classifier.

    Returns
    -------
    list with one predicted label per row, in the row order of ``frame``.
    """
    predictions = [default] * len(frame)
    embeddings = frame["Embeddings"]

    # GroupBy.indices maps each parent label to the *positional* row numbers that
    # carry it, so the result does not depend on the frame's index being unique.
    groups = frame.groupby(parent_col, sort=False).indices
    for parent_label, positions in groups.items():
        classifier = classifiers.get(parent_label)
        if classifier is None:
            # No child classifier for this parent label: keep the placeholder.
            continue
        batch = list(embeddings.iloc[positions])
        for position, label in zip(positions, classifier.predict(batch)):
            predictions[position] = label

    return predictions


# Create a copy of the original data to store predictions.
# NOTE: this covers every row of `data`, including rows the classifiers were
# trained on, so the saved predictions are not a held-out evaluation.
results_df = data.copy()

# Level 1: predict Cat1 with the parent classifier
parent_predictions = rf_model.predict(list(results_df["Embeddings"]))
results_df["Predicted_Cat1"] = parent_predictions

# Level 2: predict Cat2 with the child classifier of each predicted Cat1
child_predictions = predict_child_labels(results_df, "Predicted_Cat1", child_classifiers)
results_df["Predicted_Cat2"] = child_predictions

# Level 3: predict Cat3 with the child classifier of each predicted Cat2
child_predictions_l2 = predict_child_labels(results_df, "Predicted_Cat2", child_classifiers_level_2)
results_df["Predicted_Cat3"] = child_predictions_l2

# Drop embeddings to keep the output clean
results_df = results_df.drop(columns=["Embeddings"])

# Save the final results to an Excel file
output_path = "predicted_results.xlsx"
results_df.to_excel(output_path, index=False)

print(f"Predictions saved to {output_path}")


Predictions saved to predicted_results.xlsx
