In [2]:
import pandas as pd
import random

# Template sentences for the synthetic feedback generator, grouped by the
# sentiment label they are emitted under. Each list holds five sentences.

# Sentences used for rows labelled "Positive".
positive = [
    "Excellent service and fast delivery.",
    "Very satisfied with the product quality.",
    "Great support team, solved my issue quickly.",
    "Loved the packaging and product performance.",
    "Happy with the purchase, will buy again."
]
# Sentences used for rows labelled "Negative".
negative = [
    "Delivery was delayed and the product arrived damaged.",
    "Very disappointed with the customer service.",
    "Product quality is poor, not as described.",
    "I want a refund, the device stopped working in a week.",
    "Wrong item delivered and support is unresponsive."
]
# Sentences used for rows labelled "Neutral".
neutral = [
    "Product is okay, nothing exceptional.",
    "Average experience, expected better for the price.",
    "Received the order, will test soon.",
    "Not sure about the features yet.",
    "It is acceptable for daily use."
]

def make_feedback(n=1200, seed=None, templates=None):
    """Build a synthetic customer-feedback DataFrame.

    Each row gets a sentiment label drawn with weights 0.45 / 0.35 / 0.20
    (Positive / Negative / Neutral), a sentence picked at random from that
    label's templates and, with probability 0.3, one short trailing suffix.

    Parameters
    ----------
    n : int
        Number of rows to generate. ``0`` yields an empty frame that still
        carries the three columns.
    seed : int or None
        When given, all draws come from a private ``random.Random(seed)``, so
        the result is reproducible and the global RNG state is not touched.
        When ``None`` (default) the module-level ``random`` functions are
        used, exactly as before.
    templates : dict or None
        Mapping with the keys ``"Positive"``, ``"Negative"`` and ``"Neutral"``,
        each holding a non-empty sequence of candidate sentences. Defaults to
        the notebook's ``positive``, ``negative`` and ``neutral`` lists.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` (1..n), ``feedback`` and ``label``.

    Notes
    -----
    With the default lists (15 sentences) and the five suffix variants
    (no suffix plus four suffixes) at most 75 distinct ``feedback`` strings
    can occur, however large ``n`` is.
    """
    labels = ["Positive", "Negative", "Neutral"]
    weights = [0.45, 0.35, 0.20]
    suffixes = ["Please help", "Thanks", "- user", "(see order #123)"]
    columns = ["id", "feedback", "label"]

    # The `random` module and a `random.Random` instance expose the same
    # choices/choice/random methods, so either can serve as the source.
    rng = random if seed is None else random.Random(seed)
    if templates is None:
        templates = {"Positive": positive, "Negative": negative, "Neutral": neutral}

    rows = []
    for i in range(n):
        # Draw order (label, sentence, coin flip, optional suffix) is kept
        # fixed so a given RNG state always yields the same rows.
        label = rng.choices(labels, weights=weights)[0]
        text = rng.choice(templates[label])
        noise = "" if rng.random() > 0.3 else " " + rng.choice(suffixes)
        rows.append({"id": i + 1, "feedback": text + noise, "label": label})

    # Explicit columns keep the schema stable even when no rows were generated.
    return pd.DataFrame(rows, columns=columns)

# Seed the global RNG first so the generated rows, and therefore the CSV
# written below, are identical on every run of this cell.
random.seed(42)

# Generate 1200 synthetic feedback rows and persist them without the index.
df = make_feedback(1200)
df.to_csv("customer_feedback.csv", index=False)
print("Saved customer_feedback.csv with", len(df), "rows")


Saved customer_feedback.csv with 1200 rows


In [5]:
import pandas as py
import re
import nltk
# Fetch the NLTK resources used by the tokenizer, lemmatizer and stop-word list.
nltk.download('punkt')
# Newer NLTK releases load the tokenizer tables from `punkt_tab` instead of
# `punkt`; fetch both so word_tokenize works on old and new installs alike.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# This cell's import block binds pandas as `py`, so `pd` only resolves here if
# an earlier cell already imported it. Import it explicitly so this cell does
# not depend on that leftover state.
import pandas as pd

# Reload the generated feedback from disk.
df = pd.read_csv("customer_feedback.csv")
# Keep a single row per distinct feedback text and renumber the index.
# The generator can only emit a small set of distinct strings, so this shrinks
# the frame considerably (the recorded run ends with 75 rows).
df = df.drop_duplicates(subset=["feedback"]).reset_index(drop=True)

def clean_text(s):
    """Normalise one raw feedback value to lowercase ASCII alphanumerics.

    Steps, in order: lowercase; drop anything starting with ``http`` or
    ``www`` up to the next whitespace; replace every character that is not
    ``a-z``, ``0-9`` or whitespace with a space; collapse whitespace runs to a
    single space and trim both ends.

    Parameters
    ----------
    s : object
        Raw value. Non-string values are converted with ``str()``.
        ``None`` and float NaN are treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``""`` for a missing value.
    """
    # Without this guard a missing value would be stringified into the literal
    # word "none" / "nan" and slip through as if it were real feedback.
    if s is None or (isinstance(s, float) and s != s):
        return ""
    s = str(s).lower()
    s = re.sub(r"http\S+|www\S+", "", s)
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    return re.sub(r"\s+", " ", s).strip()

# Helpers shared by the token-level step: a WordNet lemmatizer and the
# English stop-word list held as a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Character-level normalisation of every feedback string.
df["clean"] = df["feedback"].apply(clean_text)

def preprocess_tokens(s):
    """Tokenise ``s``, discard stop words and one-character tokens, lemmatise
    what is left and return the result joined by single spaces."""
    kept = []
    for token in word_tokenize(s):
        # Skip tokens that carry no signal: single characters and stop words.
        if len(token) <= 1 or token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# Token-level pass over the normalised text.
df["processed"] = df["clean"].apply(preprocess_tokens)

# Drop rows whose processed text is blank, then renumber the index.
has_tokens = df["processed"].str.strip() != ""
df = df[has_tokens].reset_index(drop=True)

# Persist the cleaned frame (without the index) and report its size.
df.to_csv("customer_feedback_clean.csv", index=False)
print("Cleaned dataset saved as customer_feedback_clean.csv; rows:", len(df))


Cleaned dataset saved as customer_feedback_clean.csv; rows: 75


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\manip\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\manip\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\manip\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
