# Implementing the first pipeline for processing text using GradientBoost
* Implemented a baseline text-based pipeline to predict product prices using the training data.
* Cleaned product descriptions to remove newlines, extra spaces, and standardize text.
* Extracted numeric `Value` and `Unit` from product descriptions to use as features.
* Applied one-hot encoding to `Unit` and TF-IDF vectorization to the cleaned text for numerical representation.
* Combined numeric, categorical, and text features into a single dataset for modeling.
* Trained a **Gradient Boosting Regressor** on this combined dataset.
* Predicted prices on a subset of the data and evaluated using **SMAPE**.
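* Achieved **Subset SMAPE: 36.68%**, serving as a baseline for model performance on the initial pipeline. Note that this score is computed on the same training subset the model was fitted on (no hold-out split), so it is an in-sample figure and likely optimistic.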
## Next steps:
* We will try XGBoost instead of Gradient Boosting in the next notebook
* Reason: XGBoost is often faster and can give better accuracy for tabular data.
* We will reuse the same cleaned and processed text + numeric + one-hot features.

In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack

In [None]:
# Load the training data from 'train.csv' in the working directory.
# Parser options are collected in one place so they are easy to inspect:
train_csv_options = {
    'sep': ',',              # comma-separated fields
    'quotechar': '"',        # fields may be wrapped in double quotes
    'engine': 'python',      # pandas' Python parser (chosen by the author for multiline text)
    'on_bad_lines': 'skip',  # malformed lines are dropped instead of raising
}
train = pd.read_csv('train.csv', **train_csv_options)

In [None]:
# Load the test data from 'test.csv' with the same parser settings used for the
# training file: comma separator, double-quote quoting, the Python parsing
# engine, and malformed lines skipped rather than raising.
test_csv_options = {
    'sep': ',',
    'quotechar': '"',
    'engine': 'python',
    'on_bad_lines': 'skip',
}
test = pd.read_csv('test.csv', **test_csv_options)

In [None]:
# Work on a small random subset so the baseline pipeline runs quickly.
# The requested size is clamped to the number of rows actually loaded, because
# DataFrame.sample raises ValueError when n is larger than the frame (sampling
# without replacement). For frames with at least SUBSET_SIZE rows the result is
# the same as sampling exactly SUBSET_SIZE rows.
SUBSET_SIZE = 1000  # rows drawn from each of train and test, change as needed
SUBSET_SEED = 42    # fixed seed so the same rows are drawn on every run

train_small = train.sample(n=min(SUBSET_SIZE, len(train)), random_state=SUBSET_SEED)
test_small = test.sample(n=min(SUBSET_SIZE, len(test)), random_state=SUBSET_SEED)

In [None]:
# Sanity check of the sampled frames: dimensions of each subset first,
# then the first rows of the training subset.
for label, subset in (
    ("Subset Train shape:", train_small),
    ("Subset Test shape:", test_small),
):
    print(label, subset.shape)
print(train_small.head())

Subset Train shape: (1000, 4)
Subset Test shape: (1000, 3)
      sample_id                                    catalog_content  \
282      293950  Item Name: Yakami Orchard Japanese Yuzu Marmal...   
2008      80692  Item Name: Sweet Sue Chunk White Chicken in Wa...   
1713      65727  Item Name: Octonuts Dry Roasted Maple Almond N...   
1666        256  Item Name: POSHI Marinated French Green Bean S...   
2066     139960  Item Name: Mars SNICKERS, TWIX, MILKY WAY & 3 ...   

                                             image_link  price  
282   https://m.media-amazon.com/images/I/41S7nuYFld...  19.95  
2008  https://m.media-amazon.com/images/I/31kSwbxAI7...  10.32  
1713  https://m.media-amazon.com/images/I/51BD1O+4mR...   9.75  
1666  https://m.media-amazon.com/images/I/71h2JMdbvj...   2.15  
2066  https://m.media-amazon.com/images/I/61irQQmkZn...  10.00  


In [None]:
import re

def clean_text(text):
    """
    Normalize a product description for text vectorization.

    The text is lowercased, every run of whitespace (spaces, tabs, newlines)
    is collapsed to a single space, and leading/trailing whitespace is removed.

    Args:
        text: Raw description. Missing values (None or float NaN) are treated
            as empty text; any other non-string value is converted with str().

    Returns:
        str: The cleaned text, or '' for missing input.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself, so `text != text`
        # detects it without needing an extra import.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)
    # r'\s+' already matches '\n', so a single substitution covers both the
    # newline replacement and the removal of extra spaces.
    return re.sub(r'\s+', ' ', text.lower()).strip()

In [None]:
# Add a normalized 'clean_text' column, derived from 'catalog_content',
# to both the training and the test subset.
for subset in (train_small, test_small):
    subset['clean_text'] = subset['catalog_content'].apply(clean_text)

In [None]:
# Show raw vs. cleaned text side by side for the first two training rows
preview_columns = ['catalog_content', 'clean_text']
print(train_small[preview_columns].head(2))

                                        catalog_content  \
282   Item Name: Yakami Orchard Japanese Yuzu Marmal...   
2008  Item Name: Sweet Sue Chunk White Chicken in Wa...   

                                             clean_text  
282   item name: yakami orchard japanese yuzu marmal...  
2008  item name: sweet sue chunk white chicken in wa...  


In [None]:
import re

def extract_value_unit(text):
    """
    Extract numeric Value and Unit from catalog_content.

    Both lookups are case-insensitive and use the first occurrence found
    in the text.

    Args:
        text: Raw catalog text. Non-string input (e.g. None or NaN for a
            missing cell) yields (None, None) instead of raising.

    Returns:
        value (float) or None
        unit (str) or None
    """
    value, unit = None, None
    if not isinstance(text, str):
        # re.search would raise TypeError on non-string input
        return value, unit

    # Look for "Value: <number>" pattern (allow spaces and decimals)
    value_match = re.search(r'Value[: ]+\s*([\d\.]+)', text, re.IGNORECASE)
    if value_match:
        try:
            value = float(value_match.group(1))
        except ValueError:
            # The character class also accepts captures such as "." or
            # "1.2.3", which are not valid floats; treat them as missing.
            value = None

    # Look for "Unit: <word>" pattern (letters only; the unit is lowercased)
    unit_match = re.search(r'Unit[: ]+\s*([a-zA-Z]+)', text, re.IGNORECASE)
    if unit_match:
        unit = unit_match.group(1).lower()

    return value, unit

def _value_unit_as_series(content):
    """Wrap the (value, unit) pair from extract_value_unit in a pandas Series."""
    return pd.Series(extract_value_unit(content))

# Add the 'value_num' and 'unit' columns to both subsets; each row's
# catalog_content is parsed independently.
for subset in (train_small, test_small):
    subset[['value_num', 'unit']] = subset['catalog_content'].apply(_value_unit_as_series)

# Inspect the extracted fields next to the raw text for the first five training rows
print(train_small[['catalog_content', 'value_num', 'unit']].head(5))


                                        catalog_content  value_num   unit
282   Item Name: Yakami Orchard Japanese Yuzu Marmal...       10.0  ounce
2008  Item Name: Sweet Sue Chunk White Chicken in Wa...        6.0  count
1713  Item Name: Octonuts Dry Roasted Maple Almond N...       16.0  ounce
1666  Item Name: POSHI Marinated French Green Bean S...       17.6  ounce
2066  Item Name: Mars SNICKERS, TWIX, MILKY WAY & 3 ...       16.0  ounce


In [None]:
# One-hot encode the categorical 'unit' column. dummy_na=True adds an extra
# indicator column for rows where no unit was extracted.
train_units = pd.get_dummies(train_small['unit'], prefix='unit', dummy_na=True)
test_units = pd.get_dummies(test_small['unit'], prefix='unit', dummy_na=True)

# Align test columns to train: units that only occur in test are dropped and
# units missing from test are added as all-zero columns.
test_units = test_units.reindex(columns=train_units.columns, fill_value=0)

# Append the one-hot columns to each frame (the original 'unit' column is kept).
# Indicator columns left over from an earlier run of this cell are dropped
# first, so re-running the cell does not pile up duplicate columns.
train_small = pd.concat(
    [train_small.drop(columns=train_units.columns, errors='ignore'), train_units],
    axis=1,
)
test_small = pd.concat(
    [test_small.drop(columns=train_units.columns, errors='ignore'), test_units],
    axis=1,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with the vocabulary capped at 2000 features for speed
vectorizer = TfidfVectorizer(max_features=2000)

# The vectorizer is fitted on the training text only; the test text is
# transformed with that fitted vectorizer so both matrices share columns.
train_corpus = train_small['clean_text']
test_corpus = test_small['clean_text']
X_train_text = vectorizer.fit_transform(train_corpus)
X_test_text = vectorizer.transform(test_corpus)

In [None]:
# Numeric feature: the extracted value as float64; rows where no value was
# extracted are filled with 0.
X_train_num = train_small[['value_num']].fillna(0).astype(np.float64).values
X_test_num = test_small[['value_num']].fillna(0).astype(np.float64).values

# One-hot unit indicators as float64. Both splits are indexed with the TRAIN
# column list so the two matrices have the same columns in the same order.
unit_columns = train_units.columns
X_train_cat = train_small[unit_columns].astype(np.float64).values
X_test_cat = test_small[unit_columns].astype(np.float64).values

# Stack text, numeric and categorical features column-wise into one sparse
# matrix per split; CSR is requested so the result supports row slicing.
X_train = hstack([X_train_text, X_train_num, X_train_cat], format='csr')
X_test = hstack([X_test_text, X_test_num, X_test_cat], format='csr')

# The model is fitted on X_train and applied to X_test, so the column counts must agree.
assert X_train.shape[1] == X_test.shape[1], "train/test feature columns are misaligned"

In [None]:
# Regression target: the 'price' column of the training subset as a NumPy array
y_train = train_small['price'].to_numpy()

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Baseline model: gradient boosting with 200 trees of depth 5 and a fixed seed
gbr_settings = {
    'n_estimators': 200,
    'max_depth': 5,
    'random_state': 42,
}
model = GradientBoostingRegressor(**gbr_settings)

# Fit on the training subset, then predict prices for the test subset
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Prediction table: the sample_id of every test-subset row together with its
# predicted price (y_pred is in the same row order as test_small).
output = test_small[['sample_id']].assign(price=y_pred)

print(output.head())

       sample_id      price
10906     259458  14.245655
10658      48563  39.111912
9500      148194  14.404646
16799     185297   9.869420
10324     232138  19.148389


In [None]:
# Only for subset with actual price (train_small for testing)
from sklearn.metrics import mean_absolute_error

def smape(y_true, y_pred):
    """
    Symmetric mean absolute percentage error, in percent.

    Each term is |y_pred - y_true| / ((|y_true| + |y_pred|) / 2); the result
    is 100 times the mean of those terms.

    Args:
        y_true: Array-like of actual values (list, ndarray or Series).
        y_pred: Array-like of predicted values, same length as y_true.

    Returns:
        SMAPE in percent. A pair where both the actual and the predicted
        value are 0 contributes zero error instead of NaN.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    abs_error = np.abs(y_pred - y_true)
    scale = (np.abs(y_true) + np.abs(y_pred)) / 2
    # scale is 0 only when both values are 0, in which case the error is 0 as
    # well; skip the division there so the term is 0 rather than 0/0 = NaN.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return 100 * np.mean(ratio)

# Score the model on the training subset. These are the same rows the model
# was fitted on, so this is an in-sample score rather than a hold-out estimate.
train_pred = model.predict(X_train)
smape_score = smape(train_small['price'], train_pred)
print(f"Subset SMAPE: {smape_score:.2f}%")

Subset SMAPE: 36.68%
