In [3]:
# Import libraries

import pandas as pd
from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [95]:
# Configuration: values that the loading, merging and splitting cells read.

# Directory that is globbed for the input TSV files.
path = 'dataset/'
# Glob patterns (relative to `path`) selecting the product files and the review files.
products = 'products-data*.tsv'
reviews = 'reviews-*.tsv'
# Column names assigned on load; data_loader reads the files with header=None.
products_column_names = ['id', 'category', 'product title']
review_column_names =['id', 'rating', 'review_text']
# Column present in both column lists above; the join key for the two tables.
intersection_column = 'id'
# Fraction of rows passed to train_test_split as the held-out test share.
test_size_ratio = 0.2
# Seed passed to train_test_split as random_state.
random_state_number = 42






In [46]:
# Read every TSV file in a directory that matches a glob pattern
# (e.g. 'products-data*.tsv' or 'reviews-*.tsv') into a single DataFrame.

def data_loader(path, name_of_file , column_names):
    """Load all header-less TSV files matching a glob pattern into one DataFrame.

    Parameters
    ----------
    path : str or Path
        Directory to search.
    name_of_file : str
        Glob pattern, relative to ``path``, selecting the files to read.
    column_names : list of str
        Names assigned to the columns of every file (files are read with
        ``header=None``).

    Returns
    -------
    pandas.DataFrame
        Rows of all matching files stacked in sorted-filename order, with a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no file matches the pattern.
    """
    # Sort the matches: glob yields files in filesystem order, which would
    # make the row order (and anything derived from it) non-reproducible.
    data_files = sorted(Path(path).glob(name_of_file))
    if not data_files:
        # pd.concat([]) would also raise ValueError, but without saying which
        # path/pattern came up empty.
        raise ValueError(f"No files matching {name_of_file!r} found in {str(path)!r}")
    frames = [
        pd.read_csv(file, sep='\t', header=None, names=column_names)
        for file in data_files
    ]
    # ignore_index avoids duplicate index labels (each file starts at 0).
    return pd.concat(frames, ignore_index=True)

In [44]:
# Join two DataFrames on a shared key column so that each resulting row
# carries the columns of both inputs.

def data_merger(left_data, right_data, intersection_column):
    """Merge two DataFrames on ``intersection_column``.

    The join type is pandas' default for ``pd.merge`` (no ``how`` is passed).

    Parameters
    ----------
    left_data, right_data : pandas.DataFrame
        Tables to combine.
    intersection_column : str
        Name of the key column passed to ``pd.merge`` as ``on``.

    Returns
    -------
    pandas.DataFrame
        The merged table.
    """
    merged = pd.merge(left_data, right_data, on=intersection_column)
    return merged

In [68]:
def replacer(column, replace_from, replace_to, data_frame):
    """Replace a value in one column of ``data_frame`` and return the frame.

    The column is reassigned on ``data_frame`` itself, so the caller's frame
    is updated, and the same frame is returned so that
    ``df = replacer(...)`` works as well.

    Parameters
    ----------
    column : str
        Name of the column to clean.
    replace_from, replace_to
        Value to look for and the value to substitute, as accepted by
        ``Series.replace``.
    data_frame : pandas.DataFrame
        Frame holding ``column``.

    Returns
    -------
    pandas.DataFrame
        ``data_frame`` with the replacement applied.
    """
    # Assign the replaced column back instead of calling
    # data_frame[column].replace(..., inplace=True): that form returns None and,
    # being a chained assignment, does not update the frame under Copy-on-Write.
    data_frame[column] = data_frame[column].replace(replace_from, replace_to)
    return data_frame

In [47]:
# Load every file matching the product glob into one DataFrame.
df_product = data_loader(
    path=path,
    name_of_file=products,
    column_names=products_column_names,
)

In [48]:
# Load every file matching the review glob into one DataFrame.
df_review = data_loader(
    path=path,
    name_of_file=reviews,
    column_names=review_column_names,
)

In [77]:
# Combine products and reviews through the data_merger helper and the configured
# key column, rather than repeating the merge call with a hard-coded 'id'.
full_dataset = data_merger(df_product, df_review, intersection_column)

In [78]:
# Show the merged schema, then preview the first three rows.
merged_columns = full_dataset.columns
print(merged_columns)
full_dataset.head(n=3)

Index(['id', 'category', 'product title', 'rating', 'review_text'], dtype='object')


Unnamed: 0,id,category,product title,rating,review_text
0,daa54754-af9c-41c0-b542-fe5eabc5919c,Kitchen,Bodum Travel Press 12-Ounce French Press Coffe...,5,Great!
1,ab602aca-9bad-4aa9-bd42-6ce24cdf8680,Kitchen,RH Forschner by Victorinox BladeSafe for Knife...,5,It fit perfectly on my new<br />Victorinox Fib...
2,f98cd8d3-d6fe-4ee3-8c9f-a18c1690f7a6,Kitchen,Bellemain 12-Cup Nonstick Madeleine Pan,5,Excellent pan - my daughter just made madelein...


In [79]:
print('Check the "category" column for any missing or invalid values that need to be cleaned.')
# dropna=False: value_counts() omits NaN by default, which would hide exactly
# the missing values this check is meant to reveal.
print(full_dataset['category'].value_counts(dropna=False))

Check the "category" column for any missing or invalid values that need to be cleaned.
Jewelry    607
Kitchen    513
Ktchen     134
Name: category, dtype: int64


In [80]:
# Replace the misspelt 'Ktchen' label with 'Kitchen'.
# The cleaned column is assigned back through .assign instead of calling
# .replace(..., inplace=True) on a column selection: that chained form does not
# update the frame under pandas Copy-on-Write. Re-running this cell is harmless.
full_dataset = full_dataset.assign(
    category=full_dataset['category'].replace('Ktchen', 'Kitchen')
)

In [88]:
print()
# The cleaned column is "category"; the dataset has no "class" column.
print('Verify values in the "category" column after cleaning the data.')
# dropna=False keeps missing values visible in the tally.
print(full_dataset['category'].value_counts(dropna=False))


Verify values in the "class" column after cleaning the data.
Kitchen    647
Jewelry    607
Name: category, dtype: int64


In [90]:
# Inputs: product title, review text and rating.
# Target: the product category.

feature_columns = ['product title', 'review_text', 'rating']
input_features = full_dataset[feature_columns]
output_features = full_dataset['category']

In [96]:
# Hold out a random share of the rows for testing; the rest is used for training.
# test_size_ratio sets the held-out fraction and random_state_number seeds the shuffle.

X_train, X_test, y_train, y_test = train_test_split(
    input_features,
    output_features,
    test_size=test_size_ratio,
    random_state=random_state_number,
)


In [98]:
# Build a ColumnTransformer with three transformations:
#   - TfidfVectorizer (1- to 3-grams) for the "product title" column,
#   - TfidfVectorizer (1- to 3-grams) for the "review_text" column,
#   - MinMaxScaler for the numerical "rating" column.
# The text columns are given as plain strings and the numeric column as a list,
# matching how each transformer is wired in the original cell.
column_transformer = ColumnTransformer(
    [
        ('title_vectorizer', TfidfVectorizer(ngram_range=(1, 3)), 'product title'),
        ('review_vectorizer', TfidfVectorizer(ngram_range=(1, 3)), 'review_text'),
        ('num_preprocess', MinMaxScaler(), ['rating']),
    ]
)

# Chain the featurizer with a LogisticRegression estimator.
pipeline_transformer_lr = Pipeline(
    [
        ('featurizer', column_transformer),
        ('lg', LogisticRegression()),
    ]
)

# Fit the pipeline on the training data; as the cell's last expression,
# the fitted pipeline is also displayed.
pipeline_transformer_lr.fit(X_train, y_train)

Pipeline(steps=[('featurizer',
                 ColumnTransformer(transformers=[('tilte_vectorizer',
                                                  TfidfVectorizer(ngram_range=(1,
                                                                               3)),
                                                  'product title'),
                                                 ('review_vectorizer',
                                                  TfidfVectorizer(ngram_range=(1,
                                                                               3)),
                                                  'review_text'),
                                                 ('num_preprocess',
                                                  MinMaxScaler(),
                                                  ['rating'])])),
                ('lg', LogisticRegression())])

In [99]:
# Score the fitted pipeline on both splits, then report the two values.
train_accuracy = pipeline_transformer_lr.score(X_train, y_train)
test_accuracy = pipeline_transformer_lr.score(X_test, y_test)

print('Train Accuracy:', train_accuracy)
print()
print('Test Accuracy:', test_accuracy)

Train Accuracy: 0.9990029910269193

Test Accuracy: 0.9800796812749004


In [None]:
# Identify incorrect predictions made by the model on the held-out test set.
# Uses the names defined in this notebook (pipeline_transformer_lr, X_test, y_test);
# `df`, `clf` and a "class" column are not defined anywhere in it.
test_predictions = pipeline_transformer_lr.predict(X_test)
# Compare as plain arrays so the mask is positional and independent of index labels.
actual_categories = y_test.to_numpy()
is_incorrect = test_predictions != actual_categories
incorrect_df = X_test[is_incorrect].assign(
    actual_category=actual_categories[is_incorrect],
    predicted_category=test_predictions[is_incorrect],
)
incorrect_df