In [2]:

import pandas as pd
df = pd.read_csv('/Users/nbhagat/hierarchy classification/transformed_dataset.csv')

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
df['count'] = df.groupby(['Cat1', 'Cat2', 'Cat3']).transform('size')
df_rare = df[df['count'] < 10]
df = df[df['count'] >= 10]
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from typing import Dict, Tuple, List

class TextFeatureExtractor(BaseEstimator, TransformerMixin):
    """Custom transformer for text feature extraction"""
    def __init__(self, field, max_features=1000):
        self.field = field
        self.max_features = max_features
        self.vectorizer = TfidfVectorizer(
            max_features=max_features,
            stop_words='english',
            ngram_range=(1, 2)
        )
        
    def fit(self, X, y=None):
        # Fill NaN values with empty string
        text_data = X[self.field].fillna('')
        self.vectorizer.fit(text_data)
        return self
        
    def transform(self, X):
        # Fill NaN values with empty string
        text_data = X[self.field].fillna('')
        return self.vectorizer.transform(text_data)

class EnhancedHierarchicalClassifier:
    def __init__(
        self,
        text_max_features=1000,
        title_max_features=500,
        confidence_threshold=0.3,
        use_features_weights=True,
        use_class_weights=True
    ):
        self.text_max_features = text_max_features
        self.title_max_features = title_max_features
        self.confidence_threshold = confidence_threshold
        self.use_features_weights = use_features_weights
        self.use_class_weights = use_class_weights
        
        # Initialize feature extractors
        self.feature_extractors = [
            ('text', TextFeatureExtractor(
                field='Text',
                max_features=self.text_max_features
            )),
            ('title', TextFeatureExtractor(
                field='Title',
                max_features=self.title_max_features
            ))
        ]
        
        self.feature_extractor = FeatureUnion(self.feature_extractors)
        
        # Initialize classifiers and encoders
        self.classifiers = {
            'level1': None,
            'level2': {},
            'level3': {}
        }
        self.label_encoders = {
            'level1': LabelEncoder(),
            'level2': LabelEncoder(),
            'level3': LabelEncoder()
        }

    def preprocess_data(self, df):
        """Preprocess the input data"""
        df = df.copy()
        
        # Fill NaN values
        df['Text'] = df['Text'].fillna('')
        df['Title'] = df['Title'].fillna('')
        
        # Basic text cleaning
        df['Text'] = df['Text'].str.lower().str.strip()
        df['Title'] = df['Title'].str.lower().str.strip()
        
        # Remove special characters (optional)
        df['Text'] = df['Text'].str.replace(r'[^\w\s]', ' ', regex=True)
        df['Title'] = df['Title'].str.replace(r'[^\w\s]', ' ', regex=True)
        
        return df
    
    def _create_classifier(self, level: str, cat1: str = None, cat2: str = None):
        """Create a classifier with appropriate parameters"""
        return RandomForestClassifier(
            n_estimators=100,
            class_weight='balanced' if self.use_class_weights else None,
            n_jobs=-1,
            random_state=42
        )
    
    def fit(self, df: pd.DataFrame, verbose: bool = True):
        """Fit the hierarchical classifier"""
        if verbose:
            print("Preprocessing data...")
        
        # Preprocess data
        df = self.preprocess_data(df)
        
        if verbose:
            print("Extracting features...")
        
        # Extract features
        X = self.feature_extractor.fit_transform(df)
        
        # Encode labels
        y1_encoded = self.label_encoders['level1'].fit_transform(df['Cat1'])
        y2_encoded = self.label_encoders['level2'].fit_transform(df['Cat2'])
        y3_encoded = self.label_encoders['level3'].fit_transform(df['Cat3'])
        
        # Train level 1
        if verbose:
            print("Training Level 1 classifier...")
        
        self.classifiers['level1'] = self._create_classifier('level1')
        self.classifiers['level1'].fit(X, y1_encoded)
        
        # Train level 2 classifiers
        if verbose:
            print("Training Level 2 classifiers...")
        
        for cat1 in df['Cat1'].unique():
            mask = df['Cat1'] == cat1
            if sum(mask) > 0:
                clf = self._create_classifier('level2', cat1)
                clf.fit(X[mask], y2_encoded[mask])
                self.classifiers['level2'][cat1] = clf
        
        # Train level 3 classifiers
        if verbose:
            print("Training Level 3 classifiers...")
        
        for cat1 in df['Cat1'].unique():
            for cat2 in df[df['Cat1'] == cat1]['Cat2'].unique():
                mask = (df['Cat1'] == cat1) & (df['Cat2'] == cat2)
                if sum(mask) > 0:
                    clf = self._create_classifier('level3', cat1, cat2)
                    clf.fit(X[mask], y3_encoded[mask])
                    self.classifiers['level3'][(cat1, cat2)] = clf
    
    def predict(self, df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Predict classes for all levels"""

        df = self.preprocess_data(df)
        X = self.feature_extractor.transform(df)
        
        # Level 1 prediction
        y1_proba = self.classifiers['level1'].predict_proba(X)
        y1_pred = np.where(
            y1_proba.max(axis=1) >= self.confidence_threshold,
            self.label_encoders['level1'].inverse_transform(y1_proba.argmax(axis=1)),
            'unknown'
        )
        
        # Level 2 prediction
        y2_pred = np.array(['unknown'] * len(df))
        for cat1 in self.classifiers['level2']:
            mask = y1_pred == cat1
            if sum(mask) > 0:
                proba = self.classifiers['level2'][cat1].predict_proba(X[mask])
                pred = self.label_encoders['level2'].inverse_transform(proba.argmax(axis=1))
                y2_pred[mask] = np.where(
                    proba.max(axis=1) >= self.confidence_threshold,
                    pred,
                    'unknown'
                )
        
        # Level 3 prediction
        y3_pred = np.array(['unknown'] * len(df))
        for (cat1, cat2) in self.classifiers['level3']:
            mask = (y1_pred == cat1) & (y2_pred == cat2)
            if sum(mask) > 0:
                proba = self.classifiers['level3'][(cat1, cat2)].predict_proba(X[mask])
                pred = self.label_encoders['level3'].inverse_transform(proba.argmax(axis=1))
                y3_pred[mask] = np.where(
                    proba.max(axis=1) >= self.confidence_threshold,
                    pred,
                    'unknown'
                )
        
        return y1_pred, y2_pred, y3_pred

def evaluate_model(df_train: pd.DataFrame, df_test: pd.DataFrame):
    """Train and evaluate model"""
    # Check for missing values
    print("\nMissing values in training data:")
    print(df_train.isnull().sum())
    
    print("\nMissing values in test data:")
    print(df_test.isnull().sum())
    
    # Initialize model
    model = EnhancedHierarchicalClassifier(
        text_max_features=1000,
        title_max_features=500,
        confidence_threshold=0.3
    )
    
    print("\nTraining model...")
    model.fit(df_train)
    
    print("\nMaking predictions...")
    y1_pred, y2_pred, y3_pred = model.predict(df_test)
    
    # Evaluate
    print("\nEvaluation Results:")
    for level, (pred, true) in enumerate([
        (y1_pred, df_test['Cat1']),
        (y2_pred, df_test['Cat2']),
        (y3_pred, df_test['Cat3'])
    ], 1):
        print(f"\nLevel {level} Performance:")
        print(classification_report(true, pred))



In [5]:
feature_extractors = [
            ('text', TextFeatureExtractor(
                field='Text',
                max_features=1000
            )),
            ('title', TextFeatureExtractor(
                field='Title',
                max_features=1000
            ))
        ]
feature_extractor = FeatureUnion(feature_extractors)

In [6]:
X = df[['Title', 'Text']]
y1 = df['Cat1']
y2 = df['Cat2']
y3 = df['Cat3']

X = feature_extractor.fit_transform(X)

In [7]:
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.linear_model import LogisticRegression

class MultiModelClassifier:
    def __init__(self):
        self.models = {
            'random_forest': RandomForestClassifier(
                n_estimators=200,
                max_depth=None,
                min_samples_split=2,
                min_samples_leaf=1,
                n_jobs=-1,
                class_weight='balanced'
            ),
            'xgboost': XGBClassifier(
                n_estimators=200,
                max_depth=7,
                learning_rate=0.1,
                objective='multi:softmax',
                n_jobs=-1
            ),
            'lightgbm': LGBMClassifier(
                n_estimators=200,
                max_depth=7,
                learning_rate=0.1,
                objective='multiclass',
                verbose =  -1,
                n_jobs=-1
            ),
            'neural_network': MLPClassifier(
                hidden_layer_sizes=(256, 128, 64),
                activation='relu',
                solver='adam',
                max_iter=1000
            )
        }
        self.results = {}

    def evaluate_models(self, X_train, X_test, y_train, y_test, category_level):
        """Evaluate all models for a specific category level"""
        print(f"\nEvaluating models for {category_level}")
        
        for name, model in self.models.items():
            print(f"\nTraining {name}...")
            try:
                # Train the model
                model.fit(X_train, y_train)
                
                # Make predictions
                y_pred = model.predict(X_test)
                
                # Calculate metrics
                report = classification_report(y_test, y_pred)
                
                self.results[f"{category_level}_{name}"] = {
                    'model': model,
                    'report': report,
                    'predictions': y_pred
                }
                
                print(f"\n{name} Results:")
                print(report)
                
            except Exception as e:
                print(f"Error with {name}: {str(e)}")

def compare_models(X, y1, y2, y3):
    """Compare different models for each category level"""
    # Split data
    X_train, X_test, y1_train, y1_test, y2_train, y2_test, y3_train, y3_test = train_test_split(
        X, y1, y2, y3, test_size=0.2, random_state=42
    )
    
    # Initialize classifier
    classifier = MultiModelClassifier()
    
    # Evaluate for each level
    classifier.evaluate_models(X_train, X_test, y1_train, y1_test, "Level1")
    classifier.evaluate_models(X_train, X_test, y2_train, y2_test, "Level2")
    classifier.evaluate_models(X_train, X_test, y3_train, y3_test, "Level3")
    
    return classifier.results

# Enhanced version with hyperparameter tuning
from sklearn.model_selection import GridSearchCV

class TunedMultiModelClassifier:
    def __init__(self):
        self.model_params = {
            'random_forest': {
                'model': RandomForestClassifier(class_weight='balanced'),
                'params': {
                    'n_estimators': [100, 200, 300],
                    'max_depth': [None, 10, 20],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4]
                }
            },
            'xgboost': {
                'model': XGBClassifier(),
                'params': {
                    'n_estimators': [100, 200],
                    'max_depth': [5, 7, 9],
                    'learning_rate': [0.01, 0.1],
                    'min_child_weight': [1, 3, 5]
                }
            },
            'lightgbm': {
                'model': LGBMClassifier(verbose =  -1),
                'params': {
                    'n_estimators': [100, 200],
                    'max_depth': [5, 7, 9],
                    'learning_rate': [0.01, 0.1],
                    'num_leaves': [31, 63, 127]
                }
            }
        }
        self.results = {}

    def tune_and_evaluate(self, X_train, X_test, y_train, y_test, category_level):
        """Tune hyperparameters and evaluate models"""
        print(f"\nTuning and evaluating models for {category_level}")
        
        for name, config in self.model_params.items():
            print(f"\nTuning {name}...")
            try:
                # Create GridSearchCV object
                grid_search = GridSearchCV(
                    config['model'],
                    config['params'],
                    cv=5,
                    n_jobs=-1,
                    verbose=1
                )
                
                # Fit grid search
                grid_search.fit(X_train, y_train)
                
                # Get best model
                best_model = grid_search.best_estimator_
                
                # Make predictions
                y_pred = best_model.predict(X_test)
                
                # Calculate metrics
                report = classification_report(y_test, y_pred)
                
                self.results[f"{category_level}_{name}"] = {
                    'model': best_model,
                    'best_params': grid_search.best_params_,
                    'report': report,
                    'predictions': y_pred
                }
                
                print(f"\n{name} Results:")
                print(f"Best parameters: {grid_search.best_params_}")
                print(report)
                
            except Exception as e:
                print(f"Error with {name}: {str(e)}")

# Usage example
def main():
    # Load and preprocess your data
    # X = ... # Your feature matrix
    # y1, y2, y3 = ... # Your target variables
    
    # Basic model comparison
    print("Running basic model comparison...")
    results = compare_models(X, y1, y2, y3)
    
    # Tuned model comparison
    print("\nRunning tuned model comparison...")
    tuned_classifier = TunedMultiModelClassifier()
    
    # Split data
    X_train, X_test, y1_train, y1_test, y2_train, y2_test, y3_train, y3_test = train_test_split(
        X, y1, y2, y3, test_size=0.2, random_state=42
    )
    
    # Tune and evaluate for each level
    tuned_classifier.tune_and_evaluate(X_train, X_test, y1_train, y1_test, "Level1")
    tuned_classifier.tune_and_evaluate(X_train, X_test, y2_train, y2_test, "Level2")
    tuned_classifier.tune_and_evaluate(X_train, X_test, y3_train, y3_test, "Level3")
    
    return results, tuned_classifier.results

# Analysis of results
def analyze_results(basic_results, tuned_results):
    """Analyze and compare model performances"""
    performance_summary = pd.DataFrame(columns=[
        'Model', 'Level', 'Accuracy', 'Macro F1', 'Weighted F1'
    ])
    
    # Extract metrics from results
    for results in [basic_results, tuned_results]:
        for key, value in results.items():
            level = key.split('_')[0]
            model = '_'.join(key.split('_')[1:])
            
            # Parse classification report
            report_lines = value['report'].split('\n')
            accuracy = float(report_lines[-2].split()[-1])
            macro_f1 = float(report_lines[-3].split()[-2])
            weighted_f1 = float(report_lines[-2].split()[-2])
            
            performance_summary = performance_summary.append({
                'Model': model,
                'Level': level,
                'Accuracy': accuracy,
                'Macro F1': macro_f1,
                'Weighted F1': weighted_f1
            }, ignore_index=True)
    
    return performance_summary

if __name__ == "__main__":
    basic_results, tuned_results = main()
    performance_summary = analyze_results(basic_results, tuned_results)
    
    # Print summary
    print("\nPerformance Summary:")
    print(performance_summary.to_string())


Running basic model comparison...

Evaluating models for Level1

Training random_forest...

random_forest Results:
                      precision    recall  f1-score   support

       baby products       0.80      0.59      0.68       128
              beauty       0.78      0.79      0.79       421
grocery gourmet food       0.87      0.73      0.80       157
health personal care       0.70      0.79      0.74       618
        pet supplies       0.92      0.82      0.87       317
          toys games       0.76      0.78      0.77       344

            accuracy                           0.77      1985
           macro avg       0.80      0.75      0.77      1985
        weighted avg       0.78      0.77      0.78      1985


Training xgboost...
Error with xgboost: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3 4 5], got ['baby products' 'beauty' 'grocery gourmet food' 'health personal care'
 'pet supplies' 'toys games']

Training lightgbm...

lightgbm Resul

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



random_forest Results:
                            precision    recall  f1-score   support

       Other_baby products       0.29      0.20      0.24        10
Other_grocery gourmet food       0.00      0.00      0.00         2
        action toy figures       0.83      0.42      0.56        24
               arts crafts       0.50      0.29      0.36        21
           baby child care       1.00      0.86      0.92         7
         baby toddler toys       0.37      0.48      0.42        23
                 bath body       0.70      0.50      0.58        28
         bathing skin care       0.00      0.00      0.00         4
                 beverages       0.87      0.83      0.85        41
                     birds       0.75      0.43      0.55        14
             breads bakery       0.00      0.00      0.00         3
           breakfast foods       0.50      0.17      0.25        12
             building toys       0.79      0.85      0.81        26
      bunny rabbit cent

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



lightgbm Results:
                            precision    recall  f1-score   support

       Other_baby products       0.75      0.30      0.43        10
Other_grocery gourmet food       0.00      0.00      0.00         2
        action toy figures       0.75      0.38      0.50        24
               arts crafts       0.38      0.24      0.29        21
           baby child care       1.00      0.86      0.92         7
         baby toddler toys       0.42      0.48      0.45        23
                 bath body       0.78      0.50      0.61        28
         bathing skin care       0.00      0.00      0.00         4
                 beverages       0.86      0.78      0.82        41
                     birds       1.00      0.21      0.35        14
             breads bakery       0.00      0.00      0.00         3
           breakfast foods       0.00      0.00      0.00        12
             building toys       0.93      0.54      0.68        26
      bunny rabbit central  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



random_forest Results:
                                                       precision    recall  f1-score   support

                Other_baby products_bathing skin care       0.00      0.00      0.00         4
                        Other_baby products_diapering       1.00      0.71      0.83         7
                          Other_baby products_feeding       0.00      0.00      0.00         3
                            Other_baby products_gifts       1.00      0.40      0.57         5
                          Other_baby products_nursery       1.00      0.82      0.90        11
                           Other_baby products_safety       0.75      0.55      0.63        11
                        Other_baby products_strollers       0.75      0.75      0.75         4
                               Other_beauty_bath body       1.00      0.40      0.57         5
                               Other_beauty_fragrance       0.50      0.09      0.15        11
                         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



lightgbm Results:
                                                       precision    recall  f1-score   support

                Other_baby products_bathing skin care       0.00      0.00      0.00         4
                        Other_baby products_diapering       0.00      0.00      0.00         7
                          Other_baby products_feeding       0.00      0.00      0.00         3
                            Other_baby products_gifts       0.00      0.00      0.00         5
                          Other_baby products_nursery       0.00      0.00      0.00        11
                           Other_baby products_safety       0.00      0.00      0.00        11
                        Other_baby products_strollers       0.00      0.00      0.00         4
                               Other_beauty_bath body       0.00      0.00      0.00         5
                               Other_beauty_fragrance       0.00      0.00      0.00        11
                              

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



random_forest Results:
Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
                      precision    recall  f1-score   support

       baby products       0.79      0.61      0.69       128
              beauty       0.77      0.79      0.78       421
grocery gourmet food       0.81      0.77      0.79       157
health personal care       0.71      0.76      0.73       618
        pet supplies       0.92      0.81      0.87       317
          toys games       0.74      0.78      0.76       344

            accuracy                           0.77      1985
           macro avg       0.79      0.75      0.77      1985
        weighted avg       0.78      0.77      0.77      1985


Tuning xgboost...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Error with xgboost: 
All the 180 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below




random_forest Results:
Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
                            precision    recall  f1-score   support

       Other_baby products       0.29      0.20      0.24        10
Other_grocery gourmet food       0.00      0.00      0.00         2
        action toy figures       1.00      0.38      0.55        24
               arts crafts       0.60      0.43      0.50        21
           baby child care       1.00      0.86      0.92         7
         baby toddler toys       0.37      0.57      0.45        23
                 bath body       0.68      0.54      0.60        28
         bathing skin care       0.00      0.00      0.00         4
                 beverages       0.88      0.85      0.86        41
                     birds       0.70      0.50      0.58        14
             breads bakery       0.00      0.00      0.00         3
           breakfast foods       0.67      0.17      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



lightgbm Results:
Best parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'num_leaves': 31}
                            precision    recall  f1-score   support

       Other_baby products       0.00      0.00      0.00        10
Other_grocery gourmet food       0.00      0.00      0.00         2
        action toy figures       0.00      0.00      0.00        24
               arts crafts       0.00      0.00      0.00        21
           baby child care       0.00      0.00      0.00         7
         baby toddler toys       0.00      0.00      0.00        23
                 bath body       0.00      0.00      0.00        28
         bathing skin care       0.00      0.00      0.00         4
                 beverages       0.25      0.02      0.04        41
                     birds       0.00      0.00      0.00        14
             breads bakery       0.00      0.00      0.00         3
           breakfast foods       0.00      0.00      0.00        12
 



KeyboardInterrupt: 

In [39]:
X.shape

(9923, 2000)

In [11]:
df

Unnamed: 0,productId,Title,userId,Time,Text,Cat1,Cat2,Cat3
0,B0002AQK70,PetSafe Staywell Pet Door with Clear Hard Flap,A2L6QTQQI13LZG,1344211200,We've only had it installed about 2 weeks. So ...,pet supplies,cats,cat flaps
1,B0002DK8OI,"Kaytee Timothy Cubes, 1-Pound",A2HJUOZ9R9K4F,1344211200,My bunny had a hard time eating this because t...,pet supplies,bunny rabbit central,food
2,B0006VJ6TO,Body Back Buddy,A14PK96LL78NN3,1344211200,would never in a million years have guessed th...,health personal care,health care,massage relaxation
3,B000EZSFXA,SnackMasters California Style Turkey Jerky,A2UW73HU9UMOTY,1344211200,"Being the jerky fanatic I am, snackmasters han...",grocery gourmet food,snack food,jerky dried meats
4,B000KV61FC,Premier Busy Buddy Tug-a-Jug Treat Dispensing ...,A1Q99RNV0TKW8R,1344211200,Wondered how quick my dog would catch on to th...,pet supplies,dogs,toys
...,...,...,...,...,...,...,...,...
9995,B000FGDDI0,Sunbeam 732-500 King Size Heating Pad with Ult...,A3RUBUKF0YX4C7,1362182400,Stays on continuously without shutting off! It...,health personal care,health care,pain relievers
9996,B000FVC78C,Reef One Biorb Easy Plants,A1O9H18FJG81FS,1362182400,these look great in our 10 gallon tank- colors...,pet supplies,fish aquatic pets,aquarium d cor
9997,B000ICJ8DA,Snoozer Lookout II Pet Car Seat,A3D96MTZP9C1Y,1362182400,"This works great, but needs a better way to at...",pet supplies,dogs,carriers travel products
9998,B000Q7AH3W,Omega Paw Tricky Treat Ball,A37L6DBOH234BC,1362182400,she absolutely LOVES this thing. I dice up gre...,pet supplies,dogs,toys


In [None]:
df = pd.read_csv('data.csv')
df=df[['Title', 'Text', 'Cat1', 'Cat2','Cat3']]

In [16]:
df.describe()

Unnamed: 0,Title,Text,Cat1,Cat2,Cat3
count,9995,10000,10000,10000,10000
unique,6512,9854,6,64,377
top,China Glaze Nail Lacquer with Hardeners,Very good product! This smells good and feels ...,health personal care,nutrition wellness,vitamins supplements
freq,89,11,2992,904,665


In [4]:
import xgboost