<a href="https://colab.research.google.com/github/Ranger3560/SentimentAnalysis/blob/main/Futuristic_Enhanced_Sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install faker pandas plotly

# Import Libraries
import random
import pandas as pd
from datetime import datetime, timedelta
from faker import Faker
import uuid # For UserID

# Initialize Faker for realistic text generation
fake = Faker()

# Set random seed for reproducibility
random.seed(42)

# Constants from sample data
PRODUCT_ID = "B08T68ZMGL"

# Review proportions for rating generation
POSITIVE_PCT = 0.73  # 4-5 stars
NEGATIVE_PCT = 0.18  # 1-2 stars
NEUTRAL_PCT = 0.09   # 3 stars
VERIFIED_PCT = 0.82  # Verified purchases

# Date range
START_DATE = datetime(2021, 1, 1)
END_DATE = datetime(2025, 12, 31)

# Plant types for more realistic reviews
PLANTS = [
    "rose plants", "hibiscus", "vegetables", "fruit trees", "indoor plants",
    "terrace garden", "flowering plants", "bonsai", "lawn", "orchids",
    "succulents", "herb garden", "money plants", "tulsi", "marigold"
]

# Positive review components
POSITIVE_BASE = [
    "Excellent results with my {plant} - saw improvement in just {days} days",
    "My {plant} are thriving with this product, {result}",
    "Wonderful product! My {plant} {improvement}",
    "Visible difference in my {plant} after {weeks} weeks",
    "{plant} responded beautifully to this treatment",
    "This product saved my dying {plant} - now they're {result}",
    "Significant improvement in {plant} health and {aspect}",
    "My {plant} have never looked better - {result}",
    "Works exactly as described on my {plant} - {result}",
    "The {plant} in my garden are now {result} thanks to this product",
    "My plants flower abundantly. Thanks!"
]

POSITIVE_RESULTS = [
    "producing more flowers", "growing vigorously", "looking healthier",
    "fruiting abundantly", "greener than ever", "pest-free now",
    "blooming continuously", "recovered completely", "growing faster",
    "more vibrant colors", "100% guarantee"
]

# Negative review components
NEGATIVE_BASE = [
    "No improvement in my {plant} even after {weeks} weeks",
    "Complete waste of money for my {plant} - {issue}",
    "Did nothing for my {plant}, very disappointed",
    "False claims - my {plant} show no difference",
    "Expected better results with my {plant} but {issue}",
    "Overpriced and ineffective for {plant}",
    "My {plant} actually got worse after using this",
    "No visible change in my {plant} condition",
    "Would not recommend for {plant} - {issue}",
    "This product damaged my {plant} - now they're {issue}",
    "The product is duplicate, waste of money."
]

NEGATIVE_ISSUES = [
    "still struggling", "showing no growth", "leaves turning yellow",
    "flowers stopped coming", "showing signs of stress", "not responding",
    "getting infected", "growth stunted", "dropping leaves","Waiting for results",
    "not recovering", "damaged product", "Cheating, no use", "Where is my money?"
]

# Neutral review components
NEUTRAL_BASE = [
    "Some improvement in my {plant} but expected more",
    "Moderate results with my {plant} - {observation}",
    "My {plant} are doing okay but not exceptional",
    "Takes too long to show results on {plant}",
    "Average performance with my {plant} - {observation}",
    "Not bad but not great for my {plant}",
    "My {plant} showed {result} but it took {time}",
    "Works somewhat for {plant} but needs improvement"
]

NEUTRAL_OBSERVATIONS = [
    "slight growth", "some new leaves", "a few flowers",
    "minor improvement", "little change", "slow progress",
    "inconsistent results", "temporary effect"
]

# New Constants for Additional Columns
MARKETING_CAMPAIGNS = [
    "Farmer Connect Program", "Monsoon Harvest Boost 2024",
    "Organic Growth Initiative", "Pest Control Solutions Drive",
    "YieldMax Pro Launch", "Sustainable Farming Initiative",
    "Digital Agriculture Summit"
]

CHANNELS = [
    "Direct Traffic", "Social Media", "Search Ads", "Referral",
    "Email Marketing", "Display Ads", "Organic Search"
]

CUSTOMER_SEGMENTS = [
    "Large Farm - New", "Large Farm - Returning",
    "Retailer - Tier 1", "Retailer - Tier 2",
    "Small Farm - New", "Small Farm - Returning"
]

PRODUCT_CATEGORIES = [
    "Herbicides", "Digital Tools", "Seeds", "Fertilizers",
    "Pesticides", "Crop Protection", "Farm Equipment"
]

# Helper Functions for New Columns

# Generate random datetime within range (including time)
def random_datetime(start, end):
    delta = end - start
    random_days = random.randrange(delta.days)
    random_seconds = random.randrange(24 * 60 * 60) # Random seconds in a day
    return start + timedelta(days=random_days, seconds=random_seconds)

# Generate matching title based on review content
def generate_title(review_text, rating):
    if rating >= 4:
        if any(word in review_text.lower() for word in ["excellent", "wonderful", "thriving", "saved"]):
            return random.choice(["Amazing results!", "Life saver for plants", "Highly recommended"])
        elif "improvement" in review_text.lower():
            return random.choice(["Visible improvement", "Works well", "Good product"])
        else:
            return random.choice(["Great buy", "Happy with purchase", "Effective product"])
    elif rating <= 2:
        if "waste" in review_text.lower() or "false" in review_text.lower():
            return random.choice(["Total waste", "False claims", "Don't buy"])
        elif "damage" in review_text.lower():
            return random.choice(["Damaged my plants", "Harmful product"])
        else:
            return random.choice(["No results", "Disappointed", "Not effective"])
    else:
        return random.choice(["Average product", "Okay results", "Moderate effect"])

# Generate review text based on rating
def generate_review_text(rating):
    plant = random.choice(PLANTS)
    if rating >= 4:
        template = random.choice(POSITIVE_BASE)
        days = random.randint(7, 21)
        weeks = random.randint(2, 4)
        result = random.choice(POSITIVE_RESULTS)
        aspect = random.choice(["growth", "flowering", "leaf color", "fruit production"])
        return template.format(
            plant=plant,
            days=days,
            weeks=weeks,
            result=result,
            aspect=aspect,
            improvement=result
        )
    elif rating <= 2:
        template = random.choice(NEGATIVE_BASE)
        weeks = random.randint(3, 6)
        issue = random.choice(NEGATIVE_ISSUES)
        return template.format(
            plant=plant,
            weeks=weeks,
            issue=issue
        )
    else:
        template = random.choice(NEUTRAL_BASE)
        time_str = random.choice(["4 weeks", "a month", "several applications"])
        observation = random.choice(NEUTRAL_OBSERVATIONS)
        result = random.choice(["some improvement", "slight change", "moderate growth"])
        return template.format(
            plant=plant,
            observation=observation,
            result=result,
            time=time_str # Use time_str here
        )

# Generate Review Reaction based on Rating
def generate_review_reaction(rating):
    if rating == 5:
        return random.randint(80, 100)
    elif rating == 4:
        return random.randint(60, 80)
    elif rating == 3:
        return random.randint(40, 60)
    elif rating == 2:
        return random.randint(20, 40)
    else: # Rating 1
        return random.randint(0, 20)

# Generate Order Value based on Customer Segment
def generate_order_value(customer_segment):
    if "Large Farm" in customer_segment:
        return round(random.uniform(3000, 6000), 2)
    elif "Retailer" in customer_segment:
        return round(random.uniform(1000, 4000), 2)
    else: # Small Farm
        return round(random.uniform(500, 2500), 2)

# Generate Impressions and Clicks
def generate_impressions_clicks():
    impressions = random.randint(10000, 100000)
    clicks = random.randint(int(impressions * 0.005), int(impressions * 0.1)) # Clicks 0.5% to 10% of impressions
    return impressions, clicks

# Generate Sentiment Label and Score based on Rating
def generate_sentiment(rating):
    if rating >= 4:
        return "POSITIVE", round(random.uniform(0.7, 1.0), 9)
    elif rating <= 2:
        return "NEGATIVE", round(random.uniform(0.0, 0.3), 9)
    else: # Rating 3
        return "NEUTRAL", round(random.uniform(0.4, 0.6), 9)

# Main function to generate reviews
def generate_reviews(num_reviews=1000):
    reviews = []
    for i in range(num_reviews):
        # Determine review type and rating
        review_type_prob = random.random()
        if review_type_prob < POSITIVE_PCT:
            rating = random.choice([4, 5])
        elif review_type_prob < POSITIVE_PCT + NEGATIVE_PCT:
            rating = random.choice([1, 2])
        else:
            rating = 3

        # Generate core review data
        review_text = generate_review_text(rating)
        # Title is now generated after review_text, as it depends on it
        title = generate_title(review_text, rating)
        verified = random.random() < VERIFIED_PCT
        review_id = "R" + ''.join(random.choices("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ", k=12))

        # Generate date and time using the new random_datetime function
        review_datetime_obj = random_datetime(START_DATE, END_DATE)
        date = review_datetime_obj.strftime("%d-%m-%Y")
        time = review_datetime_obj.strftime("%H:%M:%S")

        # Generate new columns
        review_reaction = generate_review_reaction(rating)
        user_id = str(uuid.uuid4())
        marketing_campaign = random.choice(MARKETING_CAMPAIGNS)
        customer_segment = random.choice(CUSTOMER_SEGMENTS)
        order_value = generate_order_value(customer_segment)
        impressions, clicks = generate_impressions_clicks()
        conversion_rate = round(clicks / impressions, 4) if impressions > 0 else 0.0
        channel = random.choice(CHANNELS)
        product_category = random.choice(PRODUCT_CATEGORIES)
        sentiment_label, sentiment_score = generate_sentiment(rating)

        # Create review row
        review = [
            date, time, verified, PRODUCT_ID, rating, title,
            review_text, review_id, review_reaction, user_id, marketing_campaign,
            order_value, impressions, clicks, conversion_rate,
            channel, customer_segment, product_category,
            sentiment_label, sentiment_score
        ]
        reviews.append(review)

    return reviews

# Generate 1000 reviews
num_records = 1000 # You can adjust the number of records
df = pd.DataFrame(generate_reviews(num_records))

# Define columns in the correct order as requested, including "Title"
columns = [
    "Date", "Time", "Verified", "ProductID", "Rating", "Title",
    "ReviewText", "ReviewID", "ReviewReaction", "UserID",
    "MarketingCampaign", "OrderValue", "Impressions", "Clicks",
    "ConversionRate", "Channel", "CustomerSegment", "ProductCategory",
    "SentimentLabel", "SentimentScore"
]
df.columns = columns # Assign column names

# Save to CSV
df.to_csv("Corteva_Extrapolated_Dataset_Full.csv", index=False)

# Display sample
print("Sample of generated data:")
display(df.head(20))

# Download link (for Google Colab environment)
try:
    from google.colab import files
    files.download("Corteva_Extrapolated_Dataset_Full.csv")
except ImportError:
    print("\n'google.colab.files' not found. Skipping download link.")
    print("The file 'Corteva_Extrapolated_Dataset_Full.csv' has been saved locally.")


Sample of generated data:


Unnamed: 0,Date,Time,Verified,ProductID,Rating,Title,ReviewText,ReviewID,ReviewReaction,UserID,MarketingCampaign,OrderValue,Impressions,Clicks,ConversionRate,Channel,CustomerSegment,ProductCategory,SentimentLabel,SentimentScore
0,16-11-2025,00:14:11,True,B08T68ZMGL,4,Effective product,herb garden responded beautifully to this trea...,RJL138LKPPFGA,65,11608ecf-3d4d-49e7-8d26-f038b84bd883,Sustainable Farming Initiative,2020.75,30379,1032,0.034,Organic Search,Retailer - Tier 2,Seeds,POSITIVE,0.730663
1,10-11-2025,03:40:38,True,B08T68ZMGL,5,Highly recommended,This product saved my dying tulsi - now they'r...,RX2AMVD62NRZU,92,23b9bd5a-28cc-4248-b090-b32e1e5621be,Organic Growth Initiative,2907.05,57819,1621,0.028,Search Ads,Retailer - Tier 2,Seeds,POSITIVE,0.762852
2,30-11-2025,11:27:25,True,B08T68ZMGL,4,Happy with purchase,My plants flower abundantly. Thanks!,RZNKOUR81B97X,66,15300571-0a9d-4096-b808-7317cc9200ec,Sustainable Farming Initiative,2186.9,94259,7988,0.0847,Social Media,Retailer - Tier 2,Seeds,POSITIVE,0.741889
3,30-05-2022,20:08:32,True,B08T68ZMGL,2,Harmful product,This product damaged my herb garden - now they...,RZ4HRU55OLDLG,20,72e278f9-3f05-43cd-9933-bf4b6e9d60bc,Sustainable Farming Initiative,729.1,80381,6552,0.0815,Search Ads,Small Farm - Returning,Farm Equipment,NEGATIVE,0.192289
4,14-01-2023,11:11:46,False,B08T68ZMGL,5,Effective product,My vegetables have never looked better - green...,RVANL5RJSJ0B0,87,d81920a9-71ad-435a-9b2e-11dfc842083c,Farmer Connect Program,5634.03,20322,276,0.0136,Display Ads,Large Farm - Returning,Fertilizers,POSITIVE,0.944807
5,06-04-2022,02:19:52,True,B08T68ZMGL,3,Average product,My lawn are doing okay but not exceptional,RVF7JQ7BZNFI4,50,c47f5d41-d539-48c1-9a2d-aef379be0222,Farmer Connect Program,1607.84,87128,4043,0.0464,Direct Traffic,Small Farm - New,Herbicides,NEUTRAL,0.541568
6,02-02-2025,01:58:20,True,B08T68ZMGL,4,Life saver for plants,Excellent results with my marigold - saw impro...,R74XKHST63FFG,80,bbd8fdb3-18aa-418b-9d92-29ec3f99b655,Sustainable Farming Initiative,3181.84,54473,1167,0.0214,Social Media,Large Farm - New,Digital Tools,POSITIVE,0.757063
7,11-03-2023,09:39:20,True,B08T68ZMGL,5,Effective product,vegetables responded beautifully to this treat...,RNJ03R8EHVW50,94,adceba01-ca52-4d64-9c94-26054fb8a529,Organic Growth Initiative,3089.77,82845,5836,0.0704,Display Ads,Retailer - Tier 2,Fertilizers,POSITIVE,0.746439
8,02-05-2024,01:26:49,True,B08T68ZMGL,4,Great buy,Works exactly as described on my orchids - pes...,RIU5Y26LO84W8,79,99a60918-64c9-4740-ae7f-fc562bd3dd6e,Farmer Connect Program,2972.12,84085,4702,0.0559,Search Ads,Retailer - Tier 2,Seeds,POSITIVE,0.761278
9,20-11-2021,15:57:13,True,B08T68ZMGL,4,Works well,Significant improvement in indoor plants healt...,RR2GZZ279XVVD,77,15672765-2a35-4284-a263-2577c7bd53f4,Sustainable Farming Initiative,2835.03,95717,9144,0.0955,Direct Traffic,Retailer - Tier 1,Crop Protection,POSITIVE,0.945131


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>