In [3]:
# Import libraries

import pandas as pd
from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [95]:
#configurations

path = 'dataset/'
products = 'products-data*.tsv'
reviews = 'reviews-*.tsv'
products_column_names = ['id', 'category', 'product title']
review_column_names =['id', 'rating', 'review_text']
intersection_column = 'id'
test_size_ratio = 0.2
random_state_number = 42


In [46]:

# This function loads the data given 'path','name_of_file' and 'column_names'
def data_loader(path, name_of_file , column_names):
    current_items_list = []
    data_files = Path(path).glob(name_of_file)
    for file in data_files:
        current_items_list.append(pd.read_csv(file, sep = '\t', header = None, names = column_names))
    return pd.concat(current_items_list)


# This function merges the two data frames based on a common column
def data_merger(left_data, right_data, intersection_column):
    return pd.merge(left=left_data, right=right_data, on = intersection_column)


# This function replaces the value of the 'column' in the 'data_frame'
def replacer(column, replace_from, replace_to, data_frame):
    return data_frame[column].replace(replace_from, replace_to, inplace=True)

In [106]:
# Read in the product-data*.tsv data files.
df_product = data_loader(path,products,products_column_names)

# Read in the reviews*.tsv data files.
df_review = data_loader(path,reviews,review_column_names)

# Merge df_product and df_review into a new DataFrame on the 'id' column. 
# The 'full_dataset' DataFrame will contain all the data.
full_dataset = pd.merge(df_product, df_review, on='id')

print(full_dataset.columns)
full_dataset.head(3)


print('Check the "category" column for any missing or invalid values that need to be cleaned.')
print(full_dataset['category'].value_counts())

# full_dataset = replacer('category','Ktchen','Kitchen',full_dataset)
#Replace 'Ktchen' with 'Kitchen'
full_dataset['category'].replace('Ktchen', 'Kitchen', inplace=True)
print( )
print('Verify values in the "class" column after cleaning the data.')
print(full_dataset['category'].value_counts())

# Select 'product title', 'review_text' and 'rating' as input features.
# Select 'category' as output feature.
input_features = full_dataset[['product title', 'review_text', 'rating']]
output_features = full_dataset['category']


# Split the data into training and test datasets.
# Take a random 80/20 train/test split of the data by selecting 80% of rows for training and 20% for model testing.
X_train, X_test, y_train, y_test = train_test_split(input_features, output_features, test_size=test_size_ratio, random_state=random_state_number)



# Create a ColumnTransformer with three transformations:
# TfidfVectorizer for the "product title" column,
# TfidfVectorizer for the "review_text" column and 
# MinMaxScaler() for numerical feature, "rating".
column_transformer = ColumnTransformer(
    [
        ('tilte_vectorizer', TfidfVectorizer(ngram_range=(1,3)), 'product title'),
        ('review_vectorizer', TfidfVectorizer(ngram_range=(1,3)), 'review_text'),
        ("num_preprocess", MinMaxScaler(), ['rating'])
    ]
    )

# Construct a Pipeline with the ColumnTransformer and a LogisticRegression estimator.
pipeline_transformer_lr = Pipeline([
    ('featurizer', column_transformer),
    # ('feature_selection', SequentialFeatureSelector(estimator=MultinomialNB(), direction='backward', cv=2, n_jobs=-1)),
    ('lg', LogisticRegression())
    ]
    )

# Fit the pipeline on the training data.
pipeline_transformer_lr.fit(X_train, y_train)


# Evaluate the pipeline by scoring accuracy on the train and test sets.
print()
print('Train Accuracy:', pipeline_transformer_lr.score(X_train, y_train))
print('Test Accuracy:', pipeline_transformer_lr.score(X_test, y_test))
print()

# Identify incorrect predictions made by the model.
incorrect_dataframe = full_dataset[pipeline_transformer_lr.predict(full_dataset) != full_dataset['category']]
incorrect_dataframe

# Check the incorrectly predicted sample.
print(full_dataset.loc[254, 'product title'])
print()
print('"golden" and "steel" in "product_title" have caused a model error and predict that as kitchen value')

Index(['id', 'category', 'product title', 'rating', 'review_text'], dtype='object')
Check the "category" column for any missing or invalid values that need to be cleaned.
Jewelry    607
Kitchen    513
Ktchen     134
Name: category, dtype: int64

Verify values in the "class" column after cleaning the data.
Kitchen    647
Jewelry    607
Name: category, dtype: int64

Train Accuracy: 0.9990029910269193
Test Accuracy: 0.9800796812749004

Tovolo Groovy Ice Pop Molds, Spring Green set of 6, 2-pack

"golden" and "steel" in "product_title" have caused a model error and predict that as kitchen value
