In [9]:
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
import nltk

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Updated file paths
file_paths = [
    '/content/Amazon REVIEW.csv',
    '/content/HelloFresh REVIEW.csv',
    '/content/new_retail_data (1).csv'
]

# Step 1: Load all datasets
datasets = []
for file_path in file_paths:
    try:
        df = pd.read_csv(file_path)
        print(f"Columns in {file_path}: {list(df.columns)}")
        datasets.append(df)
        print(f"Successfully loaded {file_path}")
    except FileNotFoundError as e:
        print(f"File not found: {file_path}")
        print(e)
    except Exception as e:
        print(f"Error loading file: {file_path}")
        print(e)

# Step 2: Standardize column names
for i, df in enumerate(datasets):
    # Standardize the review column
    if 'Review' in df.columns:
        df.rename(columns={'Review': 'Review_Text'}, inplace=True)
    elif 'Comments' in df.columns:
        df.rename(columns={'Comments': 'Review_Text'}, inplace=True)
    else:
        print(f"No review column found in dataset {i + 1}. Check the column names.")
        df['Review_Text'] = ""  # Create an empty Review_Text column if missing

# Step 3: Combine datasets
combined_data = pd.concat(datasets, ignore_index=True)

# Step 4: Handle missing values
combined_data['Review_Text'] = combined_data['Review_Text'].fillna("")

# Step 5: Save the cleaned data
cleaned_file_path = '/content/Cleaned_Combined_Reviews.csv'
combined_data.to_csv(cleaned_file_path, index=False)

print(f"Cleaned data saved to {cleaned_file_path}")
print(f"Total rows after cleaning: {len(combined_data)}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Columns in /content/Amazon REVIEW.csv: ['Unnamed: 0', 'Title', 'Review', 'Stars', 'Date_of_Experience']
Successfully loaded /content/Amazon REVIEW.csv
Columns in /content/HelloFresh REVIEW.csv: ['Unnamed: 0', 'Title', 'Review', 'Stars']
Successfully loaded /content/HelloFresh REVIEW.csv
Columns in /content/new_retail_data (1).csv: ['Transaction_ID', 'Customer_ID', 'Name', 'Email', 'Phone', 'Address', 'City', 'State', 'Zipcode', 'Country', 'Age', 'Gender', 'Income', 'Customer_Segment', 'Date', 'Year', 'Month', 'Time', 'Total_Purchases', 'Amount', 'Total_Amount', 'Product_Category', 'Product_Brand', 'Product_Type', 'Feedback', 'Shipping_Method', 'Payment_Method', 'Order_Status', 'Ratings', 'products']
Successfully loaded /content/new_retail_data (1).csv
No review column found in dataset 3. Check the column names.
Cleaned data saved to /content/Cleaned_Combined_Reviews.csv
Total rows after cleaning: 310050
