In [1]:
# Announce the setup step and summarize, one line per library, what each
# dependency is used for. The empty entry reproduces the blank line that
# precedes the library list.
setup_messages = [
    "Getting everything set up...",
    "",
    "We need these libraries to create and analyze our data:",
    "numpy - Helps us with numerical operations and random number generation",
    "random - Useful for picking random elements and generating random numbers",
    "datetime - Manages dates and times for us",
    "pandas - For handling and analyzing data in a structured way",
    "textblob - Assists with text processing and sentiment analysis",
]
print("\n".join(setup_messages))

import numpy as np
import random
from datetime import datetime, timedelta  
import pandas as pd 
from textblob import TextBlob
import os
import csv

Getting everything set up...

We need these libraries to create and analyze our data:
numpy - Helps us with numerical operations and random number generation
random - Useful for picking random elements and generating random numbers
datetime - Manages dates and times for us
pandas - For handling and analyzing data in a structured way
textblob - Assists with text processing and sentiment analysis


In [2]:
print("Defining the number of feedback records to create...")
num_records = 10000
print(f"Number of records: {num_records}")

print("Generating unique customer IDs and product IDs...")
# Both ID ranges are inclusive at both ends. np.arange excludes its stop value,
# hence the `+ 1`; the resulting pools therefore hold 1001 customer IDs and
# 3001 product IDs (not 1000 / 3000). The bounds are named once so the printed
# range can never drift from the values actually generated.
first_customer_id, last_customer_id = 10000, 11000
customer_ids = np.arange(first_customer_id, last_customer_id + 1)
print(f"Customer IDs range: {first_customer_id} to {last_customer_id} ({len(customer_ids)} IDs)")
first_product_id, last_product_id = 3000, 6000
product_ids = np.arange(first_product_id, last_product_id + 1)
print(f"Product IDs range: {first_product_id} to {last_product_id} ({len(product_ids)} IDs)")

print("Setting the range for feedback dates...")
# The window is anchored on the moment the cell runs, so the generated dates
# (and the printed range) change from one execution day to the next.
end_date = datetime.today()  # Current local date and time
start_date = end_date - timedelta(days=365)  # Exactly 365 days earlier
print(f"Feedback date range: from {start_date.date()} to {end_date.date()}")

print("Defining the possible ratings (from 1 to 5)...")
ratings_distribution = np.array([1, 2, 3, 4, 5])
print(f"Possible ratings: {ratings_distribution}")

Defining the number of feedback records to create...
Number of records: 10000
Generating unique customer IDs and product IDs...
Customer IDs range: 10000 to 11000 (1001 IDs)
Product IDs range: 3000 to 6000 (3001 IDs)
Setting the range for feedback dates...
Feedback date range: from 2023-08-12 to 2024-08-11
Defining the possible ratings (from 1 to 5)...
Possible ratings: [1 2 3 4 5]


In [3]:
print("Creating an empty dictionary to store comments for each rating...")
# Maps an integer rating to the list of comment templates read for it.
comments = {}
csv_file_path = 'form/rating_comments.csv'  # Path to the CSV file

print(f"Opening the CSV file at '{csv_file_path}' to read comment templates...")
# newline='' hands newline handling to the csv module, as its documentation
# requires; without it, newlines embedded in quoted fields can be misread.
with open(csv_file_path, mode='r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file)
    next(reader, None)  # Skip the header row; the default avoids StopIteration on an empty file

    print("Reading comments from the file...")
    for row in reader:
        if not row:
            continue  # csv.reader yields an empty list for a blank line
        rating = int(row[0])
        comment_template = row[1].strip()
        # Group templates by rating, creating the list on first sight of a rating.
        comments.setdefault(rating, []).append(comment_template)

print("Organizing comments by rating:")
print("{")
for key in sorted(comments):
    print(f"    {key}: [")
    for comment in comments[key]:
        print(f"        '{comment}',")
    print("    ],")
print("}")

Creating an empty dictionary to store comments for each rating...
Opening the CSV file at 'form/rating_comments.csv' to read comment templates...
Reading comments from the file...
Organizing comments by rating:
{
    1: [
        'The product is {adjective} and does not meet my expectations.',
        'Service was {adjective} and unresponsive.',
        'Very {adjective} experience with the delivery.',
        'The quality of the product is {adjective}.',
        'I received a {adjective} item, and customer service was unhelpful.',
        'The product didn’t meet my expectations at all.',
        'Very {adjective} experience; will not be buying again.',
        'The service was {adjective} and {adjective}.',
        'The product broke after a few uses.',
        'Extremely {adjective} with the purchase and service.',
    ],
    2: [
        'The product is {adjective}; nothing special about it.',
        'Service was {adjective}; it was neither good nor bad.',
        'It was an {adje

In [4]:
print("Setting up a dictionary to hold adjectives for each rating...")
# Maps an integer rating to the list of adjectives read for it.
adjectives = {}
csv_file_path = 'form/rating_adjectives.csv' 

print(f"Opening the CSV file at '{csv_file_path}' to read adjectives...")
# newline='' hands newline handling to the csv module, as its documentation
# requires; without it, newlines embedded in quoted fields can be misread.
with open(csv_file_path, mode='r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file)
    next(reader, None)  # Skip the header row; the default avoids StopIteration on an empty file

    print("Reading adjectives from the file...")
    for row in reader:
        if not row:
            continue  # csv.reader yields an empty list for a blank line
        rating = int(row[0])
        adjective = row[1].strip()
        # Group adjectives by rating, creating the list on first sight of a rating.
        adjectives.setdefault(rating, []).append(adjective)

print("Adjectives organized by rating in CSV format:")
print("Rating,Adjective")
for key in sorted(adjectives):
    for adj in adjectives[key]:
        print(f"{key},{adj}")

Setting up a dictionary to hold adjectives for each rating...
Opening the CSV file at 'form/rating_adjectives.csv' to read adjectives...
Reading adjectives from the file...
Adjectives organized by rating in CSV format:
Rating,Adjective
1,bad
1,poor
1,disappointing
1,very bad
1,defective
1,unhappy
1,slow
1,frustrating
1,awful
1,lousy
1,subpar
1,mediocre
1,unpleasant
1,insufficient
1,substandard
1,terrible
1,worthless
1,dismal
1,ineffective
1,lamentable
1,unfortunate
1,inadequate
1,underwhelming
1,pathetic
1,deficient
1,suboptimal
1,displeasing
1,below average
1,shoddy
1,rough
1,regrettable
1,poor quality
2,average
2,mediocre
2,okay
2,acceptable
2,satisfactory
2,decent
2,typical
2,ordinary
2,fair
2,passable
2,neutral
2,moderate
2,unremarkable
2,unexceptional
2,not bad
2,run-of-the-mill
2,standard
2,commonplace
2,average quality
2,undistinguished
2,middling
2,expected
2,unimpressive
2,unobjectionable
2,normal
2,conventional
2,adequate
2,regular
2,unspectacular
2,acceptable quality
2,medio

In [5]:
print("Defining a function to generate a random date between the given start and end dates...")
def random_date(start: datetime, end: datetime):
    """Draw a random calendar date between ``start`` and ``end``.

    The whole-day offset from ``start`` is sampled from a triangular
    distribution over ``[0, span]`` whose mode is the midpoint of the span,
    then truncated to an integer number of days.

    Args:
        start: Beginning of the window (must support ``.date()``).
        end: End of the window; must not be earlier than ``start``.

    Returns:
        A ``datetime.date`` obtained by adding the sampled offset to ``start``.

    Raises:
        ValueError: If ``end`` is at least one whole day earlier than ``start``
            according to ``(end - start).days``.
    """
    span_days = (end - start).days
    if span_days < 0:
        raise ValueError("end must not be earlier than start")
    if span_days == 0:
        # np.random.triangular rejects a degenerate interval (left == right),
        # so a window shorter than one day simply yields the start date.
        return start.date()
    random_days = np.random.triangular(0, span_days * 0.5, span_days)
    return (start + timedelta(days=int(random_days))).date()

print("Defining a function to determine the sentiment of a comment...")
def infer_sentiment(comment):
    """Label a comment from the polarity score TextBlob assigns to it.

    Returns "positive" when the polarity is above 0.1, "negative" when it is
    below -0.1, and "neutral" for anything in between (bounds included).
    """
    polarity = TextBlob(comment).sentiment.polarity
    if polarity > 0.1:
        return "positive"
    if polarity < -0.1:
        return "negative"
    return "neutral"

Defining a function to generate a random date between the given start and end dates...
Defining a function to determine the sentiment of a comment...


In [6]:
print("Setting up counters to track the number of each rating and sentiment...")
# Running tallies, all starting at zero: one per star rating 1-5 and one per
# sentiment label.
rating_counts = dict.fromkeys(range(1, 6), 0)
sentiment_counts = dict.fromkeys(("positive", "negative", "neutral"), 0)

print("Defining our targets for how many records of each rating and sentiment we aim to generate...")
# Per-category quotas: an even split of num_records, rounded up whenever the
# split leaves a remainder so the quotas together can hold at least
# num_records records.
ratings_target, leftover_records = divmod(num_records, 5)  # 5 rating values
if leftover_records > 0:
    ratings_target += 1

sentiments_target, leftover_sentiments = divmod(num_records, 3)  # 3 sentiment labels
if leftover_sentiments > 0:
    sentiments_target += 1


Setting up counters to track the number of each rating and sentiment...
Defining our targets for how many records of each rating and sentiment we aim to generate...


In [7]:
# Make sure the output folder exists, creating it when it is missing, and
# report which of the two cases applied.
output_dir = "data"
folder_missing = not os.path.exists(output_dir)
if folder_missing:
    os.mkdir(output_dir)
print(
    f"The folder '{output_dir}' has been created."
    if folder_missing
    else f"The folder '{output_dir}' already exists."
)

The folder 'data' already exists.


In [8]:

print("Generating records...")
records = []
record_counter = 1  # 1-based index of the next record to be generated

# Safety valve: if this many candidate comments in a row are rejected for one
# rating, every sentiment that rating can produce has presumably hit its quota
# and the loop could otherwise spin forever.
max_consecutive_rejections = 10_000

# Fill each rating's quota in turn, never exceeding num_records overall.
for rating in ratings_distribution:
    consecutive_rejections = 0
    while rating_counts[rating] < ratings_target and record_counter <= num_records:
        # Build a candidate comment: a random template for this rating with
        # every "{adjective}" placeholder replaced by one random adjective.
        comment_template = np.random.choice(comments[rating])
        adjective = np.random.choice(adjectives[rating])
        comment = comment_template.replace("{adjective}", adjective)

        # The stored sentiment label is whatever the analyzer infers.
        inferred_sentiment = infer_sentiment(comment)

        # Reject candidates whose sentiment quota is already full so the
        # sentiment distribution stays balanced.
        if sentiment_counts[inferred_sentiment] >= sentiments_target:
            consecutive_rejections += 1
            if consecutive_rejections >= max_consecutive_rejections:
                print(
                    f"Warning: stopped generating rating {rating} after "
                    f"{max_consecutive_rejections} consecutive rejected comments "
                    f"({rating_counts[rating]} of {ratings_target} records created)."
                )
                break
            continue

        consecutive_rejections = 0
        customer_id = np.random.choice(customer_ids)
        product_id = np.random.choice(product_ids)
        feedback_date = random_date(start_date, end_date)
        records.append([customer_id, product_id, feedback_date, rating, comment, inferred_sentiment])
        rating_counts[rating] += 1
        sentiment_counts[inferred_sentiment] += 1
        # Advance the counter so the num_records cap in the loop condition
        # actually takes effect.
        record_counter += 1

# Shuffle records to randomize their order
print("\nShuffling records...")
np.random.shuffle(records)


Generating records...

Shuffling records...


In [9]:

# Persist the generated records as a CSV file via a DataFrame.
output_file_path = 'data/customer_feedback_dataset.csv'
print(f"Writing records to '{output_file_path}'...")
column_names = ['customer_id', 'product_id', 'feedback_date', 'rating', 'comment', 'sentiment']
df = pd.DataFrame(records, columns=column_names)
df.to_csv(output_file_path, index=False, encoding='utf-8')

print("Data generation complete!")

# Report how many records carry each sentiment label (from the running tallies).
print("Checking distribution of sentiments...")
sentiment_summary = {label: sentiment_counts[label] for label in ('positive', 'negative', 'neutral')}
print(sentiment_summary)

# Report how many records carry each rating, recounted from the records
# themselves (the rating is the fourth field of every record).
print("Checking distribution of ratings...")
observed_ratings = [rec[3] for rec in records]
rating_summary = {star: observed_ratings.count(star) for star in range(1, 6)}
print(rating_summary)

Writing records to 'data/customer_feedback_dataset.csv'...
Data generation complete!
Checking distribution of sentiments...
{'positive': 3334, 'negative': 3332, 'neutral': 3334}
Checking distribution of ratings...
{1: 2000, 2: 2000, 3: 2000, 4: 2000, 5: 2000}
