In [1]:
import pandas as pd
import json


In [7]:
with open("customer_feedback.json", "r") as f:
    data = json.load(f)

df = pd.DataFrame(data)

print("Total rows:", len(df))
print("Sample rows:", df.sample(5))

Total rows: 100
Sample rows:     user_id     name  language                         feedback
5    user_5   User 5     Tamil             bad packaging!! thnx
39  user_39  User 39     Tamil         Fast delivery 😊!!! thnx👍
68  user_68  User 68   English              not happy bro<html>
77  user_77  User 77  Hinglish  damaged item 😡 fast pls macha\n
45  user_45  User 45  Hinglish          Great quality!!!!<html>


In [8]:
print("Columns:", df.columns.tolist())

Columns: ['user_id', 'name', 'language', 'feedback']


In [9]:
print("Data types:\n", df.dtypes)


Data types:
 user_id     object
name        object
language    object
feedback    object
dtype: object


In [10]:
print("Missing values:\n", df.isnull().sum())

Missing values:
 user_id      0
name        15
language     0
feedback     0
dtype: int64


In [11]:
import re

def clean_text(text):
    if not isinstance(text, str):
        return ""

    # Remove emojis and special symbols
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove hashtags, mentions
    text = re.sub(r'[@#]\w+', '', text)

    # Remove extra whitespace
    text = ' '.join(text.split())

    # Lowercase everything
    return text.lower()

# Apply cleaning
df["cleaned_feedback"] = df["feedback"].apply(clean_text)

#Cleaned feedback
print(df[["feedback", "cleaned_feedback"]].sample(5))

                       feedback       cleaned_feedback
55            Nice product 😂              nice product
84                 not happy!!👍            not happy!!
72        Loved it! fast pls        loved it! fast pls
1   bad packaging!!!! bro<html>  bad packaging!!!! bro
93       damaged item 😡 macha\n     damaged item macha


In [12]:
#Replacing missing values
df["name"].fillna("Anonymous", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["name"].fillna("Anonymous", inplace=True)


In [16]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize VADER
analyzer = SentimentIntensityAnalyzer()

# Function to get VADER sentiment score
def get_vader_sentiment(text):
    scores = analyzer.polarity_scores(text)
    return scores['compound']

# Apply to cleaned feedback
df["vader_score"] = df["cleaned_feedback"].apply(get_vader_sentiment)

# Classify sentiment
def classify_vader_sentiment(score):
    if score >= 0.05:
        return "Positive"
    elif score <= -0.05:
        return "Negative"
    else:
        return "Neutral"

df["vader_sentiment"] = df["vader_score"].apply(classify_vader_sentiment)

# Show sentiment breakdown
print("\n📊 VADER Sentiment Distribution:")
print(df["vader_sentiment"].value_counts())

# View samples
print("\n🔍 VADER Positive Sample:")
print(df[df["vader_sentiment"] == "Positive"].sample(1)["cleaned_feedback"].values[0])

print("\n🔍 VADER Negative Sample:")
print(df[df["vader_sentiment"] == "Negative"].sample(1)["cleaned_feedback"].values[0])


📊 VADER Sentiment Distribution:
vader_sentiment
Negative    50
Positive    39
Neutral     11
Name: count, dtype: int64

🔍 VADER Positive Sample:
great quality!!!!

🔍 VADER Negative Sample:
damaged item bro


In [17]:
from collections import Counter
import string

def tokenize(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize and return
    return text.lower().split()


In [18]:
# Tokenize positive and negative feedback
positive_tokens = df[df["vader_sentiment"] == "Positive"]["cleaned_feedback"].apply(tokenize).sum()
negative_tokens = df[df["vader_sentiment"] == "Negative"]["cleaned_feedback"].apply(tokenize).sum()

# Count frequencies
positive_freq = Counter(positive_tokens)
negative_freq = Counter(negative_tokens)

# Remove common filler words (basic stopwords)
stopwords = set([
    'the', 'is', 'and', 'to', 'it', 'of', 'on', 'in', 'for', 'with', 'pls', 'bro', 'macha', 'thnx'
])

positive_keywords = {word: count for word, count in positive_freq.items() if word not in stopwords}
negative_keywords = {word: count for word, count in negative_freq.items() if word not in stopwords}

# Show top 5
print("\n✨ Top Compliments:")
print(Counter(positive_keywords).most_common(5))

print("\n🔥 Top Complaints:")
print(Counter(negative_keywords).most_common(5))



✨ Top Compliments:
[('nice', 11), ('product', 11), ('great', 11), ('quality', 11), ('loved', 9)]

🔥 Top Complaints:
[('bad', 13), ('packaging', 13), ('damaged', 13), ('item', 13), ('worst', 13)]
