In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load original dataset
df = pd.read_csv("../data/ecommerce_furniture_dataset_2024.csv")

# Clean price and drop originalPrice
df['price'] = df['price'].replace(r'[$,]', '', regex=True).astype(float)
df = df.drop(columns=['originalPrice'], errors='ignore')
df['tagText'] = df['tagText'].fillna("Unknown")


In [2]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['tagText'] = le.fit_transform(df['tagText'])


In [3]:
df['title_lower'] = df['productTitle'].str.lower()

df['has_discount_words'] = df['title_lower'].apply(
    lambda x: int(any(word in x for word in ['off', 'discount', 'save', 'sale']))
)

df.drop('title_lower', axis=1, inplace=True)


In [4]:
df['has_discount_words'].value_counts()


has_discount_words
0    1672
1     328
Name: count, dtype: int64

In [5]:
df['has_discount_words'].value_counts()


has_discount_words
0    1672
1     328
Name: count, dtype: int64

In [6]:
tfidf = TfidfVectorizer(max_features=100, stop_words='english')
productTitle_tfidf = tfidf.fit_transform(df['productTitle'])

tfidf_df = pd.DataFrame(productTitle_tfidf.toarray(), columns=tfidf.get_feature_names_out())
df_final = pd.concat([df.drop('productTitle', axis=1).reset_index(drop=True), tfidf_df], axis=1)


In [7]:
df_final.shape


(2000, 104)

In [8]:
print("Final dataset shape:", df_final.shape)
df_final.head()


Final dataset shape: (2000, 104)


Unnamed: 0,price,sold,tagText,has_discount_words,adjustable,bed,bedroom,bedside,bench,black,...,tables,tv,upholstered,vanity,velvet,wardrobe,white,wicker,wood,wooden
0,46.79,600,99,0,0.0,0.0,0.209944,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.332654,0.0,0.0,0.0,0.0
1,169.72,0,99,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.300086,0.0,0.0
2,39.46,7,99,0,0.0,0.0,0.224799,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,111.99,0,99,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.344964,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,21.37,1,99,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.551425
