<a href="https://colab.research.google.com/github/Ranger3560/SentimentAnalysis/blob/main/Data_Extrapolation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Extrapolation of Corteva Sample Dataset

# Install Required Packages
!pip install faker

# Import libraries
import random
import pandas as pd
from datetime import datetime, timedelta
from faker import Faker

# Initialize Faker for realistic text generation
# NOTE(review): `fake` is not referenced by any code visible in this cell; the
# review text is assembled from the template lists defined below instead.
fake = Faker()

# Set random seed for reproducibility
# This seeds the stdlib `random` module, which is what the helper functions in
# this cell use for every random draw.
random.seed(42)

# Product identifier written into every generated row (from the sample data)
PRODUCT_ID = "B08T68ZMGL"

# Sentiment mix and verified-purchase rate used when sampling reviews
POSITIVE_PCT = 0.73  # share of 4-5 star reviews
NEGATIVE_PCT = 0.18  # share of 1-2 star reviews
NEUTRAL_PCT = 0.09   # share of 3 star reviews
VERIFIED_PCT = 0.82  # share flagged as verified purchases

# Window from which review dates are drawn
START_DATE = datetime(year=2022, month=1, day=1)
END_DATE = datetime(year=2025, month=12, day=31)

# Plant names substituted into the "{plant}" placeholder of the templates
PLANTS = [
    "rose plants", "hibiscus", "vegetables",
    "fruit trees", "indoor plants", "terrace garden",
    "flowering plants", "bonsai", "lawn",
    "orchids", "succulents", "herb garden",
    "money plants", "tulsi", "marigold",
]

# Templates for 4-5 star reviews; placeholders are filled with str.format
POSITIVE_BASE = [
    "Excellent results with my {plant} - saw improvement in just {days} days",
    "My {plant} are thriving with this product, {result}",
    "Wonderful product! My {plant} {improvement}",
    "Visible difference in my {plant} after {weeks} weeks",
    "{plant} responded beautifully to this treatment",
    "This product saved my dying {plant} - now they're {result}",
    "Significant improvement in {plant} health and {aspect}",
    "My {plant} have never looked better - {result}",
    "Works exactly as described on my {plant} - {result}",
    "The {plant} in my garden are now {result} thanks to this product",
    "My plants flower abundantly. Thanks!",
]

# Phrases for the "{result}" / "{improvement}" placeholders of positive reviews
POSITIVE_RESULTS = [
    "producing more flowers", "growing vigorously",
    "looking healthier", "fruiting abundantly",
    "greener than ever", "pest-free now",
    "blooming continuously", "recovered completely",
    "growing faster", "more vibrant colors",
    "100% guarantee",
]

# Templates for 1-2 star reviews
NEGATIVE_BASE = [
    "No improvement in my {plant} even after {weeks} weeks",
    "Complete waste of money for my {plant} - {issue}",
    "Did nothing for my {plant}, very disappointed",
    "False claims - my {plant} show no difference",
    "Expected better results with my {plant} but {issue}",
    "Overpriced and ineffective for {plant}",
    "My {plant} actually got worse after using this",
    "No visible change in my {plant} condition",
    "Would not recommend for {plant} - {issue}",
    "This product damaged my {plant} - now they're {issue}",
    "The product is duplicate, waste of money.",
]

# Phrases for the "{issue}" placeholder of negative reviews
NEGATIVE_ISSUES = [
    "still struggling", "showing no growth",
    "leaves turning yellow", "flowers stopped coming",
    "showing signs of stress", "not responding",
    "getting infected", "growth stunted",
    "dropping leaves", "Waiting for results",
    "not recovering", "damaged product",
    "Cheating, no use", "Where is my money?",
]

# Templates for 3 star reviews
NEUTRAL_BASE = [
    "Some improvement in my {plant} but expected more",
    "Moderate results with my {plant} - {observation}",
    "My {plant} are doing okay but not exceptional",
    "Takes too long to show results on {plant}",
    "Average performance with my {plant} - {observation}",
    "Not bad but not great for my {plant}",
    "My {plant} showed {result} but it took {time}",
    "Works somewhat for {plant} but needs improvement",
]

# Phrases for the "{observation}" placeholder of neutral reviews
NEUTRAL_OBSERVATIONS = [
    "slight growth", "some new leaves",
    "a few flowers", "minor improvement",
    "little change", "slow progress",
    "inconsistent results", "temporary effect",
]

# Generate random dates within range
def random_date(start, end):
    """Return a random date between ``start`` and ``end``, both inclusive.

    The result is ``start`` shifted forward by a whole number of days drawn
    from the module-level ``random`` generator, so it keeps ``start``'s
    time-of-day.

    Args:
        start: Earliest ``datetime`` that may be returned.
        end: Latest ``datetime`` that may be returned.

    Returns:
        A ``datetime`` in the closed range ``[start, end]``.

    Raises:
        ValueError: If ``end`` is earlier than ``start``.
    """
    span_days = (end - start).days
    if span_days < 0:
        raise ValueError("end must not be earlier than start")
    # randint is inclusive on both bounds: the final day can be drawn, and a
    # zero-length range (start == end) simply yields ``start``.
    return start + timedelta(days=random.randint(0, span_days))

# Pick a headline that agrees with the review body and its star rating
def generate_title(review_text, rating):
    """Return a short title consistent with ``review_text`` and ``rating``.

    Ratings of 4 and above use positive titles, ratings of 2 and below use
    negative titles, and anything else uses neutral titles. Within the positive
    and negative groups, keywords found in the lower-cased review text narrow
    the pool; the first matching keyword group wins.

    Args:
        review_text: Review body to scan for keywords.
        rating: Numeric star rating.

    Returns:
        One title chosen with ``random.choice`` from the selected pool.
    """
    if rating >= 4:
        body = review_text.lower()
        enthusiastic_markers = ("excellent", "wonderful", "thriving", "saved")
        if any(marker in body for marker in enthusiastic_markers):
            candidates = ["Amazing results!", "Life saver for plants", "Highly recommended"]
        elif "improvement" in body:
            candidates = ["Visible improvement", "Works well", "Good product"]
        else:
            candidates = ["Great buy", "Happy with purchase", "Effective product"]
    elif rating <= 2:
        body = review_text.lower()
        if "waste" in body or "false" in body:
            candidates = ["Total waste", "False claims", "Don't buy"]
        elif "damage" in body:
            candidates = ["Damaged my plants", "Harmful product"]
        else:
            candidates = ["No results", "Disappointed", "Not effective"]
    else:
        # Neutral reviews are titled without looking at the text.
        candidates = ["Average product", "Okay results", "Moderate effect"]

    # Exactly one draw per call, whichever pool was selected.
    return random.choice(candidates)

# Build the review body for a given star rating
def generate_review_text(rating):
    """Return a review sentence whose tone matches ``rating``.

    A plant name is drawn first, then a template from the positive
    (rating >= 4), negative (rating <= 2) or neutral pool, then the values for
    that pool's placeholders. Placeholders a template does not use are ignored
    by ``str.format``.

    Args:
        rating: Numeric star rating.

    Returns:
        The chosen template with its placeholders filled in.
    """
    plant = random.choice(PLANTS)

    # Dict values are evaluated top to bottom, which keeps the sequence of
    # random draws fixed for a given seed.
    if rating >= 4:
        template = random.choice(POSITIVE_BASE)
        fields = {
            "days": random.randint(7, 21),
            "weeks": random.randint(2, 4),
            "result": random.choice(POSITIVE_RESULTS),
            "aspect": random.choice(["growth", "flowering", "leaf color", "fruit production"]),
        }
        # "{improvement}" reuses the phrase already drawn for "{result}".
        fields["improvement"] = fields["result"]
    elif rating <= 2:
        template = random.choice(NEGATIVE_BASE)
        fields = {
            "weeks": random.randint(3, 6),
            "issue": random.choice(NEGATIVE_ISSUES),
        }
    else:
        template = random.choice(NEUTRAL_BASE)
        fields = {
            "time": random.choice(["4 weeks", "a month", "several applications"]),
            "observation": random.choice(NEUTRAL_OBSERVATIONS),
            "result": random.choice(["some improvement", "slight change", "moderate growth"]),
        }

    return template.format(plant=plant, **fields)

# Assemble the synthetic review rows
def generate_reviews(num_reviews=1000):
    """Generate ``num_reviews`` synthetic review rows.

    Each row is a list of strings in the order: date (``dd-mm-YYYY``),
    verified flag (``"TRUE"``/``"FALSE"``), product ID, rating, title,
    review text, review ID.

    Args:
        num_reviews: Number of rows to produce.

    Returns:
        A list of rows, one per review.
    """
    id_alphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    rows = []

    for _ in range(num_reviews):
        # One uniform draw decides the sentiment bucket.
        roll = random.random()
        if roll < POSITIVE_PCT:
            stars = random.choice([4, 5])
        elif roll < POSITIVE_PCT + NEGATIVE_PCT:
            stars = random.choice([1, 2])
        else:
            stars = 3

        body = generate_review_text(stars)
        headline = generate_title(body, stars)
        is_verified = random.random() < VERIFIED_PCT
        posted_on = random_date(START_DATE, END_DATE)
        # Review ID: "R" followed by 12 random characters from 0-9 and A-Z.
        id_suffix = random.choices(id_alphabet, k=12)

        rows.append([
            posted_on.strftime("%d-%m-%Y"),
            str(is_verified).upper(),
            PRODUCT_ID,
            str(stars),
            headline,
            body,
            "R" + "".join(id_suffix),
        ])

    return rows

# Build the 1000-row synthetic Corteva review set
reviews = generate_reviews(num_reviews=1000)

# Wrap the rows in a DataFrame, one named column per row field
columns = ["Date", "Verified", "ProductID", "Rating", "Title", "ReviewText", "ReviewID"]
df = pd.DataFrame(data=reviews, columns=columns)

# Write the dataset to disk without the DataFrame index
df.to_csv("Corteva_Extrapolated_Dataset.csv", index=False)

# Preview the first 20 rows in the notebook output
print("Sample of generated data:")
display(df.head(n=20))

# Offer the CSV as a download (needs the google.colab package)
from google.colab import files
files.download("Corteva_Extrapolated_Dataset.csv")

Sample of generated data:


Unnamed: 0,Date,Verified,ProductID,Rating,Title,ReviewText,ReviewID
0,21-01-2025,True,B08T68ZMGL,4,Effective product,herb garden responded beautifully to this trea...,R3F17I07NJ7LT
1,18-02-2024,True,B08T68ZMGL,4,Works well,Significant improvement in herb garden health ...,R3ULTQJZDJTMV
2,17-07-2024,True,B08T68ZMGL,4,Great buy,My plants flower abundantly. Thanks!,RMDD79XNL6Q5D
3,09-10-2023,False,B08T68ZMGL,3,Average product,Takes too long to show results on succulents,RE2WKP7HVN55Q
4,07-04-2022,True,B08T68ZMGL,5,Effective product,The marigold in my garden are now fruiting abu...,RU55OLDLG9JXO
5,24-06-2023,True,B08T68ZMGL,5,Great buy,My plants flower abundantly. Thanks!,RYRI3MUI7D5YX
6,22-09-2023,False,B08T68ZMGL,3,Okay results,Not bad but not great for my orchids,R88K2QTZJ4NY5
7,23-08-2025,False,B08T68ZMGL,5,Happy with purchase,Works exactly as described on my fruit trees -...,RDWG820JL0P22
8,15-03-2025,True,B08T68ZMGL,5,Great buy,Works exactly as described on my hibiscus - 10...,RK8HE3NCEV1NN
9,25-07-2023,True,B08T68ZMGL,5,Amazing results!,My money plants are thriving with this product...,RGV2TU3NJ03R8


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [3]:
# Extrapolation of Syngenta Sample Dataset

# Install Required Packages
!pip install faker

# Import libraries
import random
import pandas as pd
from datetime import datetime, timedelta
from faker import Faker

# Initialize Faker for realistic text generation
# NOTE(review): `fake` is not referenced by any code visible in this cell; the
# review text is assembled from the template lists defined below instead.
fake = Faker()

# Set random seed for reproducibility
# This seeds the stdlib `random` module, which is what the helper functions in
# this cell use for every random draw.
random.seed(42)

# Product identifier written into every generated row (from the sample data)
PRODUCT_ID = "B08NK5Y8QM"

# Sentiment mix and verified-purchase rate used when sampling reviews
POSITIVE_PCT = 0.40  # share of 4-5 star reviews
NEGATIVE_PCT = 0.38  # share of 1-2 star reviews
NEUTRAL_PCT = 0.22   # share of 3 star reviews
VERIFIED_PCT = 0.90  # share flagged as verified purchases

# Window from which review dates are drawn
START_DATE = datetime(year=2022, month=1, day=1)
END_DATE = datetime(year=2025, month=12, day=31)

# Plant names substituted into the "{plant}" placeholder of the templates
PLANTS = [
    "rose plants", "hibiscus", "vegetables",
    "fruit trees", "indoor plants", "terrace garden",
    "flowering plants", "bonsai", "lawn",
    "orchids", "succulents", "herb garden",
    "money plants", "tulsi", "marigold",
]

# Templates for 4-5 star reviews; placeholders are filled with str.format
POSITIVE_BASE = [
    "Excellent results with my {plant} - saw improvement in just {days} days",
    "My {plant} are thriving with this product, {result}",
    "Wonderful product! My {plant} {improvement}",
    "Visible difference in my {plant} after {weeks} weeks",
    "{plant} responded beautifully to this treatment",
    "This product saved my dying {plant} - now they're {result}",
    "Significant improvement in {plant} health and {aspect}",
    "My {plant} have never looked better - {result}",
    "Works exactly as described on my {plant} - {result}",
    "The {plant} in my garden are now {result} thanks to this product",
    "My plants flower abundantly. Thanks!",
]

# Phrases for the "{result}" / "{improvement}" placeholders of positive reviews
POSITIVE_RESULTS = [
    "producing more flowers", "growing vigorously",
    "looking healthier", "fruiting abundantly",
    "greener than ever", "pest-free now",
    "blooming continuously", "recovered completely",
    "growing faster", "more vibrant colors",
    "100% guarantee",
]

# Templates for 1-2 star reviews
NEGATIVE_BASE = [
    "No improvement in my {plant} even after {weeks} weeks",
    "Complete waste of money for my {plant} - {issue}",
    "Did nothing for my {plant}, very disappointed",
    "False claims - my {plant} show no difference",
    "Expected better results with my {plant} but {issue}",
    "Overpriced and ineffective for {plant}",
    "My {plant} actually got worse after using this",
    "No visible change in my {plant} condition",
    "Would not recommend for {plant} - {issue}",
    "This product damaged my {plant} - now they're {issue}",
    "The product is duplicate, waste of money.",
]

# Phrases for the "{issue}" placeholder of negative reviews
NEGATIVE_ISSUES = [
    "still struggling", "showing no growth",
    "leaves turning yellow", "flowers stopped coming",
    "showing signs of stress", "not responding",
    "getting infected", "growth stunted",
    "dropping leaves", "Waiting for results",
    "not recovering", "damaged product",
    "Cheating, no use", "Where is my money?",
]

# Templates for 3 star reviews
NEUTRAL_BASE = [
    "Some improvement in my {plant} but expected more",
    "Moderate results with my {plant} - {observation}",
    "My {plant} are doing okay but not exceptional",
    "Takes too long to show results on {plant}",
    "Average performance with my {plant} - {observation}",
    "Not bad but not great for my {plant}",
    "My {plant} showed {result} but it took {time}",
    "Works somewhat for {plant} but needs improvement",
]

# Phrases for the "{observation}" placeholder of neutral reviews
NEUTRAL_OBSERVATIONS = [
    "slight growth", "some new leaves",
    "a few flowers", "minor improvement",
    "little change", "slow progress",
    "inconsistent results", "temporary effect",
]

# Generate random dates within range
def random_date(start, end):
    """Return a random date between ``start`` and ``end``, both inclusive.

    The result is ``start`` shifted forward by a whole number of days drawn
    from the module-level ``random`` generator, so it keeps ``start``'s
    time-of-day.

    Args:
        start: Earliest ``datetime`` that may be returned.
        end: Latest ``datetime`` that may be returned.

    Returns:
        A ``datetime`` in the closed range ``[start, end]``.

    Raises:
        ValueError: If ``end`` is earlier than ``start``.
    """
    span_days = (end - start).days
    if span_days < 0:
        raise ValueError("end must not be earlier than start")
    # randint is inclusive on both bounds: the final day can be drawn, and a
    # zero-length range (start == end) simply yields ``start``.
    return start + timedelta(days=random.randint(0, span_days))

# Pick a headline that agrees with the review body and its star rating
def generate_title(review_text, rating):
    """Return a short title consistent with ``review_text`` and ``rating``.

    Ratings of 4 and above use positive titles, ratings of 2 and below use
    negative titles, and anything else uses neutral titles. Within the positive
    and negative groups, keywords found in the lower-cased review text narrow
    the pool; the first matching keyword group wins.

    Args:
        review_text: Review body to scan for keywords.
        rating: Numeric star rating.

    Returns:
        One title chosen with ``random.choice`` from the selected pool.
    """
    if rating >= 4:
        body = review_text.lower()
        enthusiastic_markers = ("excellent", "wonderful", "thriving", "saved")
        if any(marker in body for marker in enthusiastic_markers):
            candidates = ["Amazing results!", "Life saver for plants", "Highly recommended"]
        elif "improvement" in body:
            candidates = ["Visible improvement", "Works well", "Good product"]
        else:
            candidates = ["Great buy", "Happy with purchase", "Effective product"]
    elif rating <= 2:
        body = review_text.lower()
        if "waste" in body or "false" in body:
            candidates = ["Total waste", "False claims", "Don't buy"]
        elif "damage" in body:
            candidates = ["Damaged my plants", "Harmful product"]
        else:
            candidates = ["No results", "Disappointed", "Not effective"]
    else:
        # Neutral reviews are titled without looking at the text.
        candidates = ["Average product", "Okay results", "Moderate effect"]

    # Exactly one draw per call, whichever pool was selected.
    return random.choice(candidates)

# Build the review body for a given star rating
def generate_review_text(rating):
    """Return a review sentence whose tone matches ``rating``.

    A plant name is drawn first, then a template from the positive
    (rating >= 4), negative (rating <= 2) or neutral pool, then the values for
    that pool's placeholders. Placeholders a template does not use are ignored
    by ``str.format``.

    Args:
        rating: Numeric star rating.

    Returns:
        The chosen template with its placeholders filled in.
    """
    plant = random.choice(PLANTS)

    # Dict values are evaluated top to bottom, which keeps the sequence of
    # random draws fixed for a given seed.
    if rating >= 4:
        template = random.choice(POSITIVE_BASE)
        fields = {
            "days": random.randint(7, 21),
            "weeks": random.randint(2, 4),
            "result": random.choice(POSITIVE_RESULTS),
            "aspect": random.choice(["growth", "flowering", "leaf color", "fruit production"]),
        }
        # "{improvement}" reuses the phrase already drawn for "{result}".
        fields["improvement"] = fields["result"]
    elif rating <= 2:
        template = random.choice(NEGATIVE_BASE)
        fields = {
            "weeks": random.randint(3, 6),
            "issue": random.choice(NEGATIVE_ISSUES),
        }
    else:
        template = random.choice(NEUTRAL_BASE)
        fields = {
            "time": random.choice(["4 weeks", "a month", "several applications"]),
            "observation": random.choice(NEUTRAL_OBSERVATIONS),
            "result": random.choice(["some improvement", "slight change", "moderate growth"]),
        }

    return template.format(plant=plant, **fields)

# Assemble the synthetic review rows
def generate_reviews(num_reviews=1000):
    """Generate ``num_reviews`` synthetic review rows.

    Each row is a list of strings in the order: date (``dd-mm-YYYY``),
    verified flag (``"TRUE"``/``"FALSE"``), product ID, rating, title,
    review text, review ID.

    Args:
        num_reviews: Number of rows to produce.

    Returns:
        A list of rows, one per review.
    """
    id_alphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    rows = []

    for _ in range(num_reviews):
        # One uniform draw decides the sentiment bucket.
        roll = random.random()
        if roll < POSITIVE_PCT:
            stars = random.choice([4, 5])
        elif roll < POSITIVE_PCT + NEGATIVE_PCT:
            stars = random.choice([1, 2])
        else:
            stars = 3

        body = generate_review_text(stars)
        headline = generate_title(body, stars)
        is_verified = random.random() < VERIFIED_PCT
        posted_on = random_date(START_DATE, END_DATE)
        # Review ID: "R" followed by 12 random characters from 0-9 and A-Z.
        id_suffix = random.choices(id_alphabet, k=12)

        rows.append([
            posted_on.strftime("%d-%m-%Y"),
            str(is_verified).upper(),
            PRODUCT_ID,
            str(stars),
            headline,
            body,
            "R" + "".join(id_suffix),
        ])

    return rows

# Build the 1000-row synthetic Syngenta review set
reviews = generate_reviews(num_reviews=1000)

# Wrap the rows in a DataFrame, one named column per row field
columns = ["Date", "Verified", "ProductID", "Rating", "Title", "ReviewText", "ReviewID"]
df = pd.DataFrame(data=reviews, columns=columns)

# Write the dataset to disk without the DataFrame index
df.to_csv("Syngenta_Extrapolated_Dataset.csv", index=False)

# Preview the first 20 rows in the notebook output
print("Sample of generated data:")
display(df.head(n=20))

# Offer the CSV as a download (needs the google.colab package)
from google.colab import files
files.download("Syngenta_Extrapolated_Dataset.csv")

Sample of generated data:


Unnamed: 0,Date,Verified,ProductID,Rating,Title,ReviewText,ReviewID
0,17-10-2025,True,B08NK5Y8QM,1,No results,Expected better results with my herb garden bu...,RQJL138LKPPFG
1,21-11-2023,True,B08NK5Y8QM,4,Amazing results!,Wonderful product! My money plants pest-free now,R3DCC91G4X2AM
2,16-08-2023,True,B08NK5Y8QM,3,Moderate effect,Takes too long to show results on terrace garden,RZUVDGU5COPO2
3,26-03-2023,True,B08NK5Y8QM,1,Not effective,No visible change in my vegetables condition,ROUR81B97XVBN
4,10-04-2025,True,B08NK5Y8QM,5,Effective product,vegetables responded beautifully to this treat...,RE7ZI313MSF2D
5,04-01-2025,True,B08NK5Y8QM,3,Moderate effect,Average performance with my lawn - slight growth,RRRCA50PPYRI3
6,11-05-2025,False,B08NK5Y8QM,1,Not effective,Overpriced and ineffective for vegetables,RB0XVTB2VY3H2
7,27-05-2025,True,B08NK5Y8QM,1,Total waste,"The product is duplicate, waste of money.",RF7JQ7BZNFI48
8,06-03-2022,True,B08NK5Y8QM,4,Great buy,Visible difference in my orchids after 2 weeks,RU28O74XKHST6
9,26-11-2023,True,B08NK5Y8QM,5,Visible improvement,Significant improvement in terrace garden heal...,RS36J56GV2TU3


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>