# Input Functions

#### import Modules 

In [1]:
import pandas as pd
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
from textblob import TextBlob
import re
import csv

## CHANGE PATH FOR YOU MACHINE

In [None]:
# Define the paths to the files
# gift_card_path = 'Sentiment-analysis-with-LLM/raw_data/amazon_reviews_us_Gift_Card_v1_00.tsv'
# major_appliances_path = 'Sentiment-analysis-with-LLM/raw_data/amazon_reviews_us_Major_Appliances_v1_00.tsv'

gift_card_path = 'raw_data/amazon_reviews_us_Gift_Card_v1_00.tsv'
major_appliances_path = 'raw_data/amazon_reviews_us_Major_Appliances_v1_00.tsv'
apparel_path = 'raw_data/amazon_reviews_us_Apparel_v1_00.tsv'
beauty_path = 'raw_data/amazon_reviews_us_Beauty_v1_00.tsv'
shoes_path = 'raw_data/amazon_reviews_us_Shoes_v1_00.tsv'

# Read the files into dataframes
df_gift_card = pd.read_csv(gift_card_path, sep='\t', error_bad_lines=False)
df_major_appliances = pd.read_csv(major_appliances_path, sep='\t', error_bad_lines=False)
df_apparel = pd.read_csv(apparel_path, sep='\t', error_bad_lines=False)
df_beauty = pd.read_csv(beauty_path, sep='\t', error_bad_lines=False)
df_shoes = pd.read_csv(shoes_path, sep='\t', error_bad_lines=False)

# Display the first few rows of each dataframe to verify
print("Gift Card Reviews DataFrame:")
display(df_gift_card.head(1))
print("\nMajor Appliances Reviews DataFrame:")
display(df_major_appliances.head(1))
print("Apparel Reviews DataFrame:")
display(df_apparel.head(1))
print("Beauty Reviews DataFrame:")
display(df_beauty.head(1))
print("Shoes Reviews DataFrame:")
display(df_shoes.head(1))



In [None]:
df_beauty = pd.read_csv(beauty_path, sep='\t', error_bad_lines=False)

In [None]:
print(df_beauty.columns)
print(df_beauty.shape)
df_beauty.info()
df_beauty.star_rating.unique()

In [5]:
# Define the file paths for the datasets

#gift_card_path = 'raw_data/amazon_reviews_us_Gift_Card_v1_00.tsv'
#major_appliances_path = 'raw_data/amazon_reviews_us_Major_Appliances_v1_00.tsv'

gift_card_path = 'raw_data/amazon_reviews_us_Gift_Card_v1_00.tsv'
major_appliances_path = 'raw_data/amazon_reviews_us_Major_Appliances_v1_00.tsv'
apparel_path = 'raw_data/amazon_reviews_us_Apparel_v1_00.tsv'
beauty_path = 'raw_data/amazon_reviews_us_Beauty_v1_00.tsv'
shoes_path = 'raw_data/amazon_reviews_us_Shoes_v1_00.tsv'


# Function to read a specific dataset
# def read_dataset(dataset_path):
#     return pd.read_csv(dataset_path, sep='\t', error_bad_lines=False)

def read_dataset(dataset_path):
    return pd.read_csv(dataset_path, sep='\t', error_bad_lines=False, warn_bad_lines=True)


# def read_dataset(dataset_path):
#     return pd.read_csv(
#         dataset_path, 
#         sep='\t', 
#         error_bad_lines=False, 
#         warn_bad_lines=True, 
#         quoting=csv.QUOTE_MINIMAL,  # Or csv.QUOTE_ALL if every field is quoted
#         escapechar='\\'  # Assumes backslash is used as escape character
#     )



# Function to merge datasets based on user selection
def merge_datasets(selected_category):
    if selected_category == 'Gift Card':
        return read_dataset(gift_card_path)
    elif selected_category == 'Major Appliances':
        return read_dataset(major_appliances_path)
    elif selected_category == 'Apparel':
        return read_dataset(apparel_path)
    elif selected_category == 'Beauty':
        return read_dataset(beauty_path)
    elif selected_category == 'Shoes':
        return read_dataset(shoes_path) 
    elif selected_category == 'all':
        df_gift_card = read_dataset(gift_card_path)
        df_major_appliances = read_dataset(major_appliances_path)
        return pd.concat([df_gift_card, df_major_appliances], ignore_index=True)

# Function to filter the dataset based on selected filters
def filter_dataset(df, filters):
    for variable, selected_values in filters.items():
        if variable in ['star_rating', 'total_votes']:  # Convert to integer for numeric filtering
            df = df[df[variable].astype(int).isin([int(value) for value in selected_values])]
        else:  # For strings, use the values as-is
            df = df[df[variable].isin(selected_values)]
    return df

# Ask user for the product category they're interested in
def get_user_category_selection():
    print("Please select a product category: 'Gift Card', 'Major Appliances','Apparel','Beauty','Shoes' or 'all'")
    category = input("Your choice: ").strip()
    # Default to "all" if the input is empty or not in the expected options
    if category not in ['Gift Card', 'Major Appliances','Apparel' ,'Beauty', 'Shoes','all']:
        print("Invalid selection. Defaulting to 'all'.")
        category = 'all'
    elif category == '':
        category = 'all'
    return category

# Ask user for their variable selections
def get_user_variable_selections():
    print("Available variables: marketplace, product_title, product_category, star_rating, helpful_votes, total_votes, verified_purchase, review_headline, review_body, review_date")
    selections = input("Enter the variables you're interested in, separated by commas (e.g., marketplace, product_title): ").strip()
    
    # List of all available variables
    all_variables = ['marketplace', 'product_title', 'product_category', 'star_rating', 'helpful_votes', 'total_votes', 'verified_purchase', 'review_headline', 'review_body', 'review_date']
    
    # If the user provides no input, default to all variables without further prompting for details
    if selections == '':
        return []
    
    selected_variables = [var.strip() for var in selections.split(',')]
    
    return selected_variables

# Display unique values for selected variables and collect user choices
def display_and_select_unique_values(df, selected_variables):
    # If no variables were selected (indicating the user pressed enter without selection),
    # skip prompting for details and return an empty filter set.
    if not selected_variables:
        return {}
    
    filters = {}
    for variable in selected_variables:
        unique_values = df[variable].dropna().unique()
        print(f"Unique values for {variable}: {', '.join(unique_values.astype(str))}")
        selected_values = input(f"Enter the values you're interested in for {variable}, separated by commas (leave blank to skip): ")
        if selected_values:
            filters[variable] = [value.strip() for value in selected_values.split(',')]
    return filters

# Main script
if __name__ == "__main__":
    selected_category = get_user_category_selection()
    df = merge_datasets(selected_category)
    print(f"Dataset for '{selected_category}' category loaded.")

    selected_variables = get_user_variable_selections()
    # Skip prompting for details if no variables were selected
    if selected_variables:
        filters = display_and_select_unique_values(df, selected_variables)
        print(f"Selected filters: {filters}")

        # Apply the filters to the dataset
        filtered_df = filter_dataset(df, filters)
        print("Filtered dataset based on your selections.")
        print(filtered_df.head(1))
    else:
        print("No variable selections made. Displaying first entries of the dataset.")
        print(df.head(1))
    # Display or process the filtered_df as needed


Please select a product category: 'Gift Card', 'Major Appliances','Apparel','Beauty','Shoes' or 'all'
Your choice: Shoes




  return pd.read_csv(dataset_path, sep='\t', error_bad_lines=False, warn_bad_lines=True)


  return pd.read_csv(dataset_path, sep='\t', error_bad_lines=False, warn_bad_lines=True)
Skipping line 54101: expected 15 fields, saw 22
Skipping line 55857: expected 15 fields, saw 22
Skipping line 60448: expected 15 fields, saw 22

Skipping line 76918: expected 15 fields, saw 22
Skipping line 87925: expected 15 fields, saw 22
Skipping line 88500: expected 15 fields, saw 22
Skipping line 114276: expected 15 fields, saw 22
Skipping line 128751: expected 15 fields, saw 22

Skipping line 136095: expected 15 fields, saw 22
Skipping line 140007: expected 15 fields, saw 22
Skipping line 177148: expected 15 fields, saw 22
Skipping line 180087: expected 15 fields, saw 22
Skipping line 183010: expected 15 fields, saw 22
Skipping line 183949: expected 15 fields, saw 22
Skipping line 192879: expected 15 fields, saw 22

Skipping line 223261: expected 15 fields, saw 22
Skipping line 240588: expected 15 fie

Dataset for 'Shoes' category loaded.
Available variables: marketplace, product_title, product_category, star_rating, helpful_votes, total_votes, verified_purchase, review_headline, review_body, review_date
Enter the variables you're interested in, separated by commas (e.g., marketplace, product_title): star_rating, verified_purchase
Unique values for star_rating: 1, 5, 4, 3, 2
Enter the values you're interested in for star_rating, separated by commas (leave blank to skip): 5
Unique values for verified_purchase: Y, N
Enter the values you're interested in for verified_purchase, separated by commas (leave blank to skip): Y
Selected filters: {'star_rating': ['5'], 'verified_purchase': ['Y']}
Filtered dataset based on your selections.
  marketplace  customer_id       review_id  product_id  product_parent  \
1          US     16251825  R12VVR0WH5Q24V  B00CFYZH5W       259035853   

                 product_title product_category  star_rating  helpful_votes  \
1  Teva Men's Pajaro Flip-Flop  