In [1]:
from imblearn.over_sampling import SMOTE  # Import SMOTE instead of ADASYN

def train_model(train_data):
    # Vectorize the 'entity_name' feature
    vectorizer = TfidfVectorizer(dtype=np.float32, max_features=500)
    X = vectorizer.fit_transform(train_data['entity_name'])
    y = train_data['group_id']
    
    # Check class distribution
    class_counts = y.value_counts()
    print(f"Class distribution: {class_counts}")
    
    # Filter out classes with very few samples
    min_samples_per_class = 10
    valid_classes = class_counts[class_counts >= min_samples_per_class].index
    X_filtered = X[y.isin(valid_classes)]
    y_filtered = y[y.isin(valid_classes)]
    
    # Apply SMOTE instead of ADASYN
    try:
        smote = SMOTE(random_state=42)
        X_resampled, y_resampled = smote.fit_resample(X_filtered, y_filtered)
        print("Applied SMOTE resampling.")
    except ValueError as e:
        print(f"Resampling failed: {e}. Proceeding without resampling.")
        X_resampled, y_resampled = X_filtered, y_filtered
    
    # Limit the size of the data to manage memory usage
    max_samples = 5000
    if X_resampled.shape[0] > max_samples:
        X_resampled, _, y_resampled, _ = train_test_split(X_resampled, y_resampled, train_size=max_samples, random_state=42)
    
    # Split the data into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.1, random_state=42)
    
    # Classifiers to try
    classifiers = {
        'RandomForest': RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1),
        'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
        'LogisticRegression': LogisticRegression(solver='liblinear', max_iter=100),
        'SVC': SVC(kernel='linear', probability=True, random_state=42)
    }
    
    best_clf = None
    best_accuracy = 0
    best_f1_score = 0
    best_clf_name = None
    
    for clf_name, clf in classifiers.items():
        print(f"Training with {clf_name}...")
        clf.fit(X_train, y_train)
        val_predictions = clf.predict(X_val)
        val_accuracy = accuracy_score(y_val, val_predictions)
        val_f1_score = f1_score(y_val, val_predictions, average='macro')
        print(f"{clf_name} - Validation Accuracy: {val_accuracy:.2f}, F1 Score: {val_f1_score:.2f}")
        
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            best_f1_score = val_f1_score
            best_clf = clf
            best_clf_name = clf_name
    
    print(f"Best Classifier: {best_clf_name} with Accuracy: {best_accuracy:.2f} and F1 Score: {best_f1_score:.2f}")
    return best_clf, vectorizer


In [6]:
# Train and evaluate each classifier
for clf_name, clf in classifiers.items():
    print(f"Training with {clf_name}...")
    
    # Check if classifier is XGBoost (which requires dense input)
    if clf_name == 'XGBoost':
        X_train_dense = X_train.toarray()  # Convert to dense format for XGBoost
        X_val_dense = X_val.toarray()      # Convert to dense format for XGBoost
        clf.fit(X_train_dense, y_train)
        val_predictions = clf.predict(X_val_dense)
    else:
        clf.fit(X_train, y_train)
        val_predictions = clf.predict(X_val)
        
    val_accuracy = accuracy_score(y_val, val_predictions)
    val_f1_score = f1_score(y_val, val_predictions, average='macro')
    print(f"{clf_name} - Validation Accuracy: {val_accuracy:.2f}, F1 Score: {val_f1_score:.2f}")
    
    # Update best classifier based on accuracy
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        best_f1_score = val_f1_score
        best_clf = clf
        best_clf_name = clf_name


NameError: name 'classifiers' is not defined

In [10]:
import os
from imblearn.over_sampling import ADASYN, SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import numpy as np

# Function to train the model using different classifiers
def train_model(train_data):
    # Vectorize the 'entity_name' feature
    vectorizer = TfidfVectorizer(dtype=np.float32, max_features=500)
    X = vectorizer.fit_transform(train_data['entity_name'])
    y = train_data['group_id']
    
    # Check class distribution
    class_counts = y.value_counts()
    print(f"Class distribution: {class_counts}")
    
    # Filter out classes with very few samples
    min_samples_per_class = 10
    valid_classes = class_counts[class_counts >= min_samples_per_class].index
    X_filtered = X[y.isin(valid_classes)]
    y_filtered = y[y.isin(valid_classes)]
    
    # Check the filtered class distribution
    filtered_class_counts = y_filtered.value_counts()
    print(f"Filtered class distribution:\n{filtered_class_counts}")
    
    # Resample if necessary
    if len(filtered_class_counts) > 1 and filtered_class_counts.min() >= min_samples_per_class:
        print("Class distribution is balanced, skipping resampling.")
        X_resampled, y_resampled = X_filtered, y_filtered
    else:
        # Use ADASYN for resampling (with sampling_strategy parameter adjustment)
        try:
            adasyn = ADASYN(sampling_strategy=0.5, random_state=42)
            X_resampled, y_resampled = adasyn.fit_resample(X_filtered, y_filtered)
            print("Applied ADASYN resampling.")
        except ValueError as e:
            print(f"ADASYN failed: {e}")
            print("Switching to SMOTE for resampling.")
            smote = SMOTE(random_state=42)
            X_resampled, y_resampled = smote.fit_resample(X_filtered, y_filtered)
    
    # Limit the size of the data to manage memory usage
    max_samples = 5000  # Further reduced number of samples
    if X_resampled.shape[0] > max_samples:
        X_resampled, _, y_resampled, _ = train_test_split(X_resampled, y_resampled, train_size=max_samples, random_state=42)
    
    # Split the data into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.1, random_state=42)
    
    # Classifiers to try
    classifiers = {
        'RandomForest': RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1),
        'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
        'LogisticRegression': LogisticRegression(solver='liblinear', max_iter=100),
        'SVC': SVC(kernel='linear', probability=True, random_state=42)
    }
    
    # Track the best model
    best_clf = None
    best_accuracy = 0
    best_f1_score = 0
    best_clf_name = None
    
    # Train and evaluate each classifier
    for clf_name, clf in classifiers.items():
        print(f"Training with {clf_name}...")

        # Convert to dense matrix if classifier is XGBoost
        if clf_name == 'XGBoost':
            X_train_dense = X_train.toarray()  # Convert sparse matrix to dense
            X_val_dense = X_val.toarray()      # Convert validation set to dense
            clf.fit(X_train_dense, y_train)
            val_predictions = clf.predict(X_val_dense)
        else:
            clf.fit(X_train, y_train)  # Other classifiers work with sparse data
            val_predictions = clf.predict(X_val)
        
        val_accuracy = accuracy_score(y_val, val_predictions)
        val_f1_score = f1_score(y_val, val_predictions, average='macro')
        print(f"{clf_name} - Validation Accuracy: {val_accuracy:.2f}, F1 Score: {val_f1_score:.2f}")
        
        # Update best classifier based on accuracy
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            best_f1_score = val_f1_score
            best_clf = clf
            best_clf_name = clf_name
    
    print(f"Best Classifier: {best_clf_name} with Accuracy: {best_accuracy:.2f} and F1 Score: {best_f1_score:.2f}")
    return best_clf, vectorizer

# Function to make predictions on the test data and save them in the required format
def predict_and_save_output(clf, vectorizer, test_data, output_path, constants):
    # Vectorize the 'entity_name' feature from test data
    X_test = vectorizer.transform(test_data['entity_name'])
    
    # Convert to dense format if XGBoost classifier
    if isinstance(clf, XGBClassifier):
        X_test = X_test.toarray()

    # Predict the group_id for the test data
    test_predictions = clf.predict(X_test)
    
    # Create predictions in the required format
    output = []
    for index, pred in zip(test_data['index'], test_predictions):
        # Map the group_id back to an entity name and format the entity value with units
        entity_name = test_data.loc[test_data['index'] == index, 'entity_name'].values[0]
        # Handle the units for predictions
        unit = constants.get(entity_name, 'unit')
        prediction = f"{pred} {unit}"
        output.append([index, prediction])
    
    # Convert the list to a DataFrame and save it to CSV
    output_df = pd.DataFrame(output, columns=['index', 'prediction'])
    output_df.to_csv(output_path, index=False)
    print(f"Predictions saved to {output_path}")

# Load the dataset
DATASET_FOLDER = r'C:\Users\DELL\OneDrive\Desktop\Amazon_ml_2024\Amazon ML\Amazon-ML-Challenge-2024\dataset'
train_data = pd.read_csv(os.path.join(DATASET_FOLDER, 'train.csv'))
test_data = pd.read_csv(os.path.join(DATASET_FOLDER, 'test.csv'))

# Load allowed units and constants
constants = {
    "item_weight": "gram",
    "item_height": "centimetre",
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'maximum_weight_recommendation': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
}

# Train the model
best_clf, vectorizer = train_model(train_data)

# Predict on the test dataset and save the output
output_path = os.path.join(DATASET_FOLDER, 'test_out.csv')
predict_and_save_output(best_clf, vectorizer, test_data, output_path, constants)


Class distribution: group_id
459516    2678
281678    1871
308856    1738
993359    1602
731432    1545
          ... 
362818       1
494658       1
679049       1
522504       1
220662       1
Name: count, Length: 235, dtype: int64
Filtered class distribution:
group_id
459516    2678
281678    1871
308856    1738
993359    1602
731432    1545
          ... 
737148      13
306956      11
594224      11
152339      11
686198      10
Name: count, Length: 201, dtype: int64
Class distribution is balanced, skipping resampling.
Training with RandomForest...
RandomForest - Validation Accuracy: 0.11, F1 Score: 0.01
Training with XGBoost...


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195], got [107694 120219 120569 124643 130591 132401 140266 145452 149159 150535
 150913 152057 152339 156839 171418 178778 178958 179080 180410 181357
 186035 192132 204245 208023 211213 221399 225091 226504 237000 241608
 245959 249638 252585 252782 254046 254449 267182 267482 268003 271537
 273748 275506 276700 281678 288715 296366 296953 297918 299791 306956
 308671 308856 311997 318770 329793 334327 347404 348551 355666 359859
 365637 369753 373285 375816 386873 396159 397856 407921 411423 412008
 416664 418636 426261 433914 442321 446789 449021 449805 453674 459271
 459516 462757 469317 479564 483370 486636 487566 488883 489118 501250
 507467 507619 507848 507988 518578 519155 522832 523149 524117 524635
 529606 549052 550840 557758 558374 558832 563130 564709 569206 569657
 589105 593600 599772 601746 609802 611510 625310 625842 628971 630390
 630869 639090 639475 639508 641642 648011 654649 658003 666046 675317
 681445 686198 701880 704724 709627 730429 731252 731432 737148 746096
 748919 749917 750220 751532 752266 767202 776058 788365 794161 801829
 801837 802198 804621 810266 825239 844474 847223 858439 860821 866516
 866950 872083 881883 884560 885644 892291 893692 898898 907907 908443
 916768 917343 918474 922709 926285 928606 929999 932012 933453 934747
 939129 939587 952353 953031 955292 957050 957185 965518 969033 978900
 983323 986984 991868 993359 997176 998545]