# Input Functions

#### import Modules 

In [2]:
pip install wordcloud

Collecting wordcloud
  Downloading wordcloud-1.9.3-cp38-cp38-macosx_10_9_x86_64.whl (172 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.8/172.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: wordcloud
Successfully installed wordcloud-1.9.3
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
from textblob import TextBlob
import re
import csv

## CHANGE PATH FOR YOU MACHINE

In [10]:
# Define the paths to the files
gift_card_path = '../data/extracted/amazon_reviews_us_Gift_Card_v1_00.tsv'
major_appliances_path = '../data/extracted/amazon_reviews_us_Major_Appliances_v1_00.tsv'
shoes_path = '../data/extracted/amazon_reviews_us_Shoes_v1_00.tsv'
electronics_path = '../data/extracted/amazon_reviews_us_Electronics_v1_00.tsv'

# Read the files into dataframes
df_gift_card = pd.read_csv(gift_card_path, sep='\t', on_bad_lines='skip')
df_major_appliances = pd.read_csv(major_appliances_path, sep='\t',  on_bad_lines='skip')
df_shoes = pd.read_csv(shoes_path, sep='\t',  on_bad_lines='skip')
df_electronics = pd.read_csv(electronics_path, sep='\t',  on_bad_lines='skip')


# Display the first few rows of each dataframe to verify
print("Gift Card Reviews DataFrame:")
display(df_gift_card.head(1))
print("\nMajor Appliances Reviews DataFrame:")
display(df_major_appliances.head(1))
print("Shoes Reviews DataFrame:")
display(df_shoes.head(1))
print("Electronics Reviews DataFrame:")
display(df_electronics.head(1))

Gift Card Reviews DataFrame:


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,24371595,R27ZP1F1CD0C3Y,B004LLIL5A,346014806,Amazon eGift Card - Celebrate,Gift Card,5,0,0,N,Y,Five Stars,Great birthday gift for a young adult.,2015-08-31



Major Appliances Reviews DataFrame:


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,16199106,R203HPW78Z7N4K,B0067WNSZY,633038551,"FGGF3032MW Gallery Series 30"" Wide Freestandin...",Major Appliances,5,0,0,N,Y,"If you need a new stove, this is a winner.",What a great stove. What a wonderful replacem...,2015-08-31


Shoes Reviews DataFrame:


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,18069663,R3P2HIOQCIN5ZU,B000XB31C0,265024781,Minnetonka Men's Double Deerskin Softsole Mocc...,Shoes,1,0,0,N,Y,.,Do not buy: really didn't start to wear them u...,2015-08-31


Electronics Reviews DataFrame:


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,41409413,R2MTG1GCZLR2DK,B00428R89M,112201306,yoomall 5M Antenna WIFI RP-SMA Female to Male ...,Electronics,5,0,0,N,Y,Five Stars,As described.,2015-08-31


# Read Data & Filter Based On User Inputs

In [12]:

def read_dataset(dataset_path):
    return pd.read_csv(dataset_path, sep='\t', error_bad_lines=False, warn_bad_lines=False)

def merge_datasets(selected_category):
    paths = {
        'Gift Card': gift_card_path,
        'Major Appliances': major_appliances_path,
        'Shoes': shoes_path,
        'Electronics': electronics_path
    }
    if selected_category in paths:
        return read_dataset(paths[selected_category])
    elif selected_category == 'all':
        dfs = [read_dataset(path) for path in paths.values()]
        return pd.concat(dfs, ignore_index=True)

def remove_specific_columns(df):
    df.drop(columns=["customer_id", "review_id", "product_id"], inplace=True)
    return df

def modify_review_date_to_year(df):
    df['review_date'] = pd.to_datetime(df['review_date'], errors='coerce').dt.year
    df.dropna(subset=['review_date'], inplace=True)
    df['review_date'] = df['review_date'].astype(int)
    return df

def categorize_votes(df, column_names):
    for column in column_names:
        category_col_name = f'{column}_category'
        df[category_col_name] = 'No Votes'  # Default category for 0 votes
        has_votes = df[column] > 0
        votes_data = df.loc[has_votes, column]
        
        unique_values = votes_data.unique()
        if len(unique_values) > 4:
            try:
                df.loc[has_votes, category_col_name] = pd.qcut(votes_data, q=4, labels=["Minimal Engagement", "Low Engagement", "Moderate Engagement", "High Engagement"])
            except ValueError:
                df.loc[has_votes, category_col_name] = pd.cut(votes_data, bins=4, labels=["Minimal Engagement", "Low Engagement", "Moderate Engagement", "High Engagement"])
        else:
            df.loc[has_votes, category_col_name] = pd.cut(votes_data, bins=4, labels=["Minimal Engagement", "Low Engagement", "Moderate Engagement", "High Engagement"])
    return df

def get_user_category_selection():
    print("Please select a product category: 'Gift Card', 'Major Appliances','Shoes', 'Electronics' or 'all'")
    category = input("Your choice: ").strip()
    if category not in ['Gift Card', 'Major Appliances', 'Shoes','Electronics', 'all']:
        print("Invalid selection. Defaulting to 'all'.")
        category = 'all'
    elif category == '':
        category = 'all'
    return category

def get_user_variable_selections():
    print("Available variables: marketplace, product_title, product_category, star_rating, vine, verified_purchase, review_headline, review_body, review_date, helpful_votes_category, total_votes_category")
    selections = input("Enter the variables you're interested in, separated by commas (e.g., product_category, star_rating): ").strip()
    if selections == '':
        return []
    return [var.strip() for var in selections.split(',')]

def display_and_select_unique_values(df, selected_variables):
    filters = {}
    for variable in selected_variables:
        if variable in df.columns:
            unique_values = df[variable].dropna().unique()
            print(f"Unique values for {variable}: {', '.join(unique_values.astype(str))}")
            selected_values = input(f"Enter the values you're interested in for {variable}, separated by commas (leave blank to skip): ")
            if selected_values:
                filters[variable] = [value.strip() for value in selected_values.split(',')]
    return filters

def filter_dataset(df, filters):
    for variable, selected_values in filters.items():
        if variable in df.columns:
            df = df[df[variable].isin(selected_values)]
    return df

if __name__ == "__main__":
    selected_category = get_user_category_selection()
    df = merge_datasets(selected_category)
    print(f"Dataset for '{selected_category}' category loaded.")
    
    df = remove_specific_columns(df)
    print("Removed specific columns: customer_id, review_id, product_id")
    
    df = modify_review_date_to_year(df)
    print("Modified 'review_date' to retain the year only.")
    
    df = categorize_votes(df, ['helpful_votes', 'total_votes'])
    print("Categorized 'helpful_votes' and 'total_votes' into engagement levels.")

    #selected_variables = get_user_variable_selections()
    #if selected_variables:
    #    filters = display_and_select_unique_values(df, selected_variables)
    #    print(f"Selected filters: {filters}")
#
    #    filtered_df = filter_dataset(df, filters)
    #    print("Filtered dataset based on your selections:")
    #    print(filtered_df.head(1))
    #else:
    #    print("No variable selections made. Displaying first entries of the dataset.")
    #    print(df.head(1))


Please select a product category: 'Gift Card', 'Major Appliances','Shoes', 'Electronics' or 'all'




  return pd.read_csv(dataset_path, sep='\t', error_bad_lines=False, warn_bad_lines=False)


  return pd.read_csv(dataset_path, sep='\t', error_bad_lines=False, warn_bad_lines=False)


  return pd.read_csv(dataset_path, sep='\t', error_bad_lines=False, warn_bad_lines=False)


  return pd.read_csv(dataset_path, sep='\t', error_bad_lines=False, warn_bad_lines=False)


  return pd.read_csv(dataset_path, sep='\t', error_bad_lines=False, warn_bad_lines=False)


  return pd.read_csv(dataset_path, sep='\t', error_bad_lines=False, warn_bad_lines=False)


  return pd.read_csv(dataset_path, sep='\t', error_bad_lines=False, warn_bad_lines=False)


  return pd.read_csv(dataset_path, sep='\t', error_bad_lines=False, warn_bad_lines=False)


Dataset for 'all' category loaded.
Removed specific columns: customer_id, review_id, product_id
Modified 'review_date' to retain the year only.
Categorized 'helpful_votes' and 'total_votes' into engagement levels.


In [13]:
df.shape

(7694941, 14)

In [14]:
df.columns

Index(['marketplace', 'product_parent', 'product_title', 'product_category',
       'star_rating', 'helpful_votes', 'total_votes', 'vine',
       'verified_purchase', 'review_headline', 'review_body', 'review_date',
       'helpful_votes_category', 'total_votes_category'],
      dtype='object')

In [17]:
processed=df
processed.to_csv('../data/extracted/merged_4_product_lines.csv')