In [6]:
# Imported in this cell so it also runs on a fresh kernel (Restart & Run All);
# no earlier cell in the notebook imports nltk.
import nltk

# Resources used by the preprocessing code below: the English stop-word list,
# the Punkt tokenizer models (nltk.word_tokenize) and WordNet (WordNetLemmatizer).
# NOTE(review): newer NLTK releases may load the tokenizer from 'punkt_tab'
# instead of 'punkt' - confirm against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Francis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Francis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Francis\AppData\Roaming\nltk_data...


True

In [None]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

# Toy corpus: (product phrase, category) pairs, one example per category.
training_data = [
    ("Red shoes for men", "Shoes"),
    ("Blue dress for women", "Clothing"),
    ("Portable Bluetooth speaker", "Electronics"),
]

# Preprocessing resources shared by preprocess_text: a WordNet lemmatizer and
# the English stop-word list held as a set for fast membership tests.
lemmatizer = nltk.stem.WordNetLemmatizer()
english_stop_word_list = nltk.corpus.stopwords.words('english')
stop_words = set(english_stop_word_list)

def preprocess_text(text):
    """Normalise a product phrase into a space-separated string of lemmas.

    The text is lower-cased and tokenised with ``nltk.word_tokenize``; tokens
    that are not purely alphanumeric or that appear in ``stop_words`` are
    dropped, and the remaining tokens are lemmatised with ``lemmatizer``.

    Args:
        text: The phrase to clean. ``None`` and float NaN (how pandas
            represents a missing cell) are treated as empty text; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` for missing input.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    tokens = nltk.word_tokenize(str(text).lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum() and token not in stop_words]
    return ' '.join(tokens)

# Split the labelled pairs into cleaned phrases and their target categories.
X_train = [preprocess_text(phrase) for phrase, _ in training_data]
y_train = [category for _, category in training_data]

# Feature Extraction: learn the TF-IDF vocabulary and encode the training phrases.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)

# Model Training: linear-kernel SVM (fit returns the fitted estimator itself).
classifier = SVC(kernel='linear').fit(X_train_vectorized, y_train)

# Example prediction: the new phrase goes through the same cleaning and the
# already-fitted vectorizer before being classified.
new_phrase = "Black leather jacket"
X_new = vectorizer.transform([preprocess_text(new_phrase)])
predicted_category = classifier.predict(X_new)[0]
print("Predicted category:", predicted_category)

In [10]:
# Consolidate the Amazon data set into one csv (first 500 rows per category).

import os
import pandas as pd


directory = r'C:\Users\Francis\Desktop\MSDS\Term2\DMW\FinalProject\archive_amazon'
# The combined file lives in the same directory as the inputs.
output_name = 'result.csv'
output_file = os.path.join(directory, output_name)

dfs = []
# Sorted so the row order of the combined file is reproducible; os.listdir
# returns entries in arbitrary order.
for filename in sorted(os.listdir(directory)):
    # Skip the previous output: it also ends in ".csv", so without this check
    # a re-run would read it back in and duplicate every row.
    if not filename.endswith(".csv") or filename == output_name:
        continue
    file_path = os.path.join(directory, filename)
    # Only the three columns needed downstream, first 500 rows of each file.
    df = pd.read_csv(file_path, usecols=['name', 'main_category', 'sub_category'], nrows=500)
    dfs.append(df)

# pd.concat raises a less helpful ValueError on an empty list; fail with a clear message.
if not dfs:
    raise ValueError(f"No input CSV files found in {directory}")

combined_df = pd.concat(dfs, ignore_index=True)
combined_df.to_csv(output_file, index=False)

print(f"Combined DataFrame saved to {output_file}")

Combined DataFrame saved to C:\Users\Francis\Desktop\MSDS\Term2\DMW\FinalProject\archive_amazon\result.csv


In [1]:
# Model training via NLTK and SVM.

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import pandas as pd
import numpy as np

file = r'C:\Users\Francis\Desktop\MSDS\Term2\DMW\FinalProject\archive_data\result_v0501.csv' # updated the file
# Rows without a product name or a label cannot be used for training: a missing
# name is a float NaN (no .lower() in preprocess_text) and a missing label is
# not a usable class, so drop them up front.
df = (
    pd.read_csv(file, usecols=['name', 'main_category', 'sub_category'])
    .dropna(subset=['name', 'main_category'])
)
x = df['name'].tolist()
y = df['main_category'].tolist()
# (product name, main category) pairs consumed by the training step.
training_data = list(zip(x, y))


# Preprocessing resources shared by preprocess_text: the English stop-word
# list (as a set for fast membership tests) and a WordNet lemmatizer.
stop_words = set(nltk.corpus.stopwords.words('english'))
lemmatizer = nltk.stem.WordNetLemmatizer()

def preprocess_text(text):
    """Normalise a product phrase into a space-separated string of lemmas.

    The text is lower-cased and tokenised with ``nltk.word_tokenize``; tokens
    that are not purely alphanumeric or that appear in ``stop_words`` are
    dropped, and the remaining tokens are lemmatised with ``lemmatizer``.

    Args:
        text: The phrase to clean. ``None`` and float NaN (how pandas
            represents a missing cell) are treated as empty text; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` for missing input.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    tokens = nltk.word_tokenize(str(text).lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum() and token not in stop_words]
    return ' '.join(tokens)

# Cleaned product names and their matching category labels, in the same order.
X_train = [preprocess_text(name) for name, _ in training_data]
y_train = [label for _, label in training_data]

# Feature Extraction: fit the TF-IDF vocabulary on the cleaned names and encode them.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)

# Model Training: linear-kernel SVM (fit returns the fitted estimator itself).
classifier = SVC(kernel='linear').fit(X_train_vectorized, y_train)

# Example prediction: clean the phrase, encode it with the fitted vectorizer,
# then take the single predicted label.
new_phrase = "Black leather jacket"
X_new = vectorizer.transform([preprocess_text(new_phrase)])
predicted_category = classifier.predict(X_new)[0]
print("Predicted category:", predicted_category)

Predicted category: sports & fitness


In [2]:
# Category prediction based on trained model.

def predict_cat(phrase):
    """Return the category the trained classifier assigns to ``phrase``.

    The phrase is cleaned with ``preprocess_text`` and encoded with the
    already-fitted ``vectorizer`` before ``classifier`` predicts on it.
    """
    cleaned = preprocess_text(phrase)
    features = vectorizer.transform([cleaned])
    # predict returns one label per input row; a single phrase was passed in.
    return classifier.predict(features)[0]

In [3]:
# Generate csv of predicted categories from our data (Product Description).

file = r'C:\Users\Francis\Desktop\MSDS\Term2\DMW\FinalProject\archive_data\fileN.csv'
df = pd.read_csv(file, usecols=['Product_Description', 'Product_Category'])
# Predict only for rows that have a description: a missing cell is a float NaN,
# which has no .lower() and would crash the text cleaning. The assignment aligns
# on the index, so rows without a description get NaN as Predicted_Category.
df['Predicted_Category'] = df['Product_Description'].dropna().apply(predict_cat)

output_file = r'C:\Users\Francis\Desktop\MSDS\Term2\DMW\FinalProject\archive_data\predictedN2.csv' # updated the output
df.to_csv(output_file, index=False)

print(f"Predicted categories saved to {output_file}")

Predicted categories saved to C:\Users\Francis\Desktop\MSDS\Term2\DMW\FinalProject\archive_data\predictedN2.csv
