# 1. Import libraries & define global variables

In [13]:
import csv
import math
import re
from collections import Counter

import numpy as np
import pandas as pd

CSV_FILE = "foodly_feedback.csv"

# 2. Create helper classes & functions

In [14]:
def parse_line(s):
    """
    Parse a single line from the CSV file with proper handling of quoted fields.

    This function handles CSV lines that contain commas within quoted fields,
    properly extracting the id, channel, rating, and text while preserving
    internal punctuation and formatting.

    Parameters:
    -----------
    s : str
        A single line from the CSV file

    Returns:
    --------
    tuple
        (doc_id (int), channel (str), rating (int), text (str))

    Example:
    --------
    >>> parse_line('1,app_store,5,"Love this app!"')
    (1, 'app_store', 5, 'Love this app!')

    Notes:
    ------
    - Handles double quotes within text by replacing "" with "
    - Splits on first 3 commas only to preserve commas in text field
    """
    s = s.strip().strip('"')
    doc_id, channel, rating, text = s.split(",", 3)
    text = text.strip().strip('"').replace('""','"')
    return int(doc_id), channel, int(rating), text

def clean_text(t):
    """
    Normalize raw text into a lowercase, letters-only string for NLP use.

    The input is converted with ``str()`` and lowercased, then three
    substitutions are applied in order, each replacing its match with a
    single space:

    1. URLs (anything starting with ``http`` or ``www.``)
    2. Every character that is not a-z, an apostrophe, or whitespace
    3. Runs of whitespace (collapsed to one space)

    Leading and trailing whitespace is stripped from the result.

    Parameters:
    -----------
    t : str
        Raw text to clean (non-string values are converted with ``str()``)

    Returns:
    --------
    str
        Cleaned and normalized text

    Example:
    --------
    >>> clean_text("I LOVE this app! Check www.example.com")
    'i love this app check'

    Notes:
    ------
    Apostrophes survive cleaning, so contractions such as "didn't" stay intact.
    """
    text = str(t).lower()
    # Order matters: URLs must go before punctuation is stripped out of them.
    for pattern in (r"http\S+|www\.\S+", r"[^a-z'\s]", r"\s+"):
        text = re.sub(pattern, " ", text)
    return text.strip()

def tokenize(t):
    """
    Split text into a list of cleaned word tokens.

    The text is first normalized with clean_text(), then split on whitespace;
    empty strings are discarded.

    Parameters:
    -----------
    t : str
        Text string to tokenize

    Returns:
    --------
    list of str
        The word tokens, in their original order

    Example:
    --------
    >>> tokenize("Love this app! It's great.")
    ['love', 'this', 'app', "it's", 'great']

    Notes:
    ------
    Every returned token is non-empty.
    """
    words = clean_text(t).split()
    return [word for word in words if word]

# 3. Import data

In [15]:
# Read the non-blank lines of CSV_FILE ("foodly_feedback.csv").
# The context manager closes the file handle once the lines are collected.
with open(CSV_FILE) as fh:
    lines = [l for l in fh if l.strip()]

# The first line is skipped (presumably the header row); every other line is
# parsed into a (doc_id, channel, rating, text) tuple.
rows = [parse_line(l) for l in lines[1:]]
df = pd.DataFrame(rows, columns=["doc_id","channel","rating","raw_text"])
print(df.head())

   doc_id    channel  rating  \
0       1  app_store       5   
1       2     in_app       1   
2       3     in_app       2   
3       4      email       3   
4       5      email       1   

                                            raw_text  
0  Love this app! use it every day for ordering p...  
1  I HATE the new update...crashes   every time I...  
2  delivery was late late late. driver said 'syst...  
3  Great UI, but why do you need my phone # again???  
4  Support didn't reply to my email (john.doe@exa...  


# 4. Curate dataset

In [16]:
# Keep only the columns used downstream, then drop rows with missing values.
keep_cols = ["doc_id","rating","raw_text"]
df = df.loc[:, keep_cols].dropna()
print(df.head())

   doc_id  rating                                           raw_text
0       1       5  Love this app! use it every day for ordering p...
1       2       1  I HATE the new update...crashes   every time I...
2       3       2  delivery was late late late. driver said 'syst...
3       4       3  Great UI, but why do you need my phone # again???
4       5       1  Support didn't reply to my email (john.doe@exa...


# 5. Extract docs

In [17]:
# Pull the raw feedback strings out as a plain Python list.
docs = list(df["raw_text"])
# Show the document count and the first 80 characters of the first document.
print(len(docs), docs[0][:80])

30 Love this app! use it every day for ordering pizza.  super   fast delivery!!!


# 6. Clean text

In [18]:
# Apply clean_text() to every raw document and store the result alongside it.
df["clean_text"] = [clean_text(raw) for raw in df["raw_text"]]
print(df.loc[:, ["raw_text","clean_text"]].head())

                                            raw_text  \
0  Love this app! use it every day for ordering p...   
1  I HATE the new update...crashes   every time I...   
2  delivery was late late late. driver said 'syst...   
3  Great UI, but why do you need my phone # again???   
4  Support didn't reply to my email (john.doe@exa...   

                                          clean_text  
0  love this app use it every day for ordering pi...  
1  i hate the new update crashes every time i try...  
2  delivery was late late late driver said 'syste...  
3        great ui but why do you need my phone again  
4  support didn't reply to my email john doe exam...  


# 7. Tokenize

In [19]:
# Use the shared tokenize() helper rather than an ad-hoc split so the notebook
# has a single tokenisation rule. tokenize() applies clean_text() itself, so
# running it on raw_text yields the same tokens as splitting clean_text.
df["tokens"] = df["raw_text"].map(tokenize)
print(df["tokens"].head())

0    [love, this, app, use, it, every, day, for, or...
1    [i, hate, the, new, update, crashes, every, ti...
2    [delivery, was, late, late, late, driver, said...
3    [great, ui, but, why, do, you, need, my, phone...
4    [support, didn't, reply, to, my, email, john, ...
Name: tokens, dtype: object


# 8. Calculate term frequency

In [20]:
# One Counter per document: token -> number of occurrences in that document.
tf = [Counter(doc_tokens) for doc_tokens in df["tokens"]]
print(tf[0].most_common(10))

[('love', 1), ('this', 1), ('app', 1), ('use', 1), ('it', 1), ('every', 1), ('day', 1), ('for', 1), ('ordering', 1), ('pizza', 1)]


# 9. Calculate document frequency

In [21]:
# Document frequency: in how many documents each token appears.
# set() collapses repeats so a token counts at most once per document.
dfreq = Counter(
    token
    for doc_tokens in df["tokens"]
    for token in set(doc_tokens)
)
print(len(dfreq))

229


# 10. Calculate inverse document frequency

In [22]:
# Smoothed inverse document frequency: log((N + 1) / (df + 1)) + 1.
N = len(df)
idf = {}
for token, doc_count in dfreq.items():
    idf[token] = math.log((N + 1) / (doc_count + 1)) + 1
print(list(idf.items())[:5])

[('delivery', 3.0476928433652555), ('app', 2.35454566280531), ('love', 3.0476928433652555), ('use', 3.740840023925201), ('pizza', 3.740840023925201)]


# 11. Create dataframe: doc_id|rating|token|score

In [23]:
# Build one (doc_id, rating, token, score) record per distinct token per
# document, where score = (count / document length) * idf.
recs = []
for doc_id, rating, toks, cnts in zip(df["doc_id"], df["rating"], df["tokens"], tf):
    # Guard against division by zero for documents with no tokens.
    n_tokens = len(toks) or 1
    recs.extend(
        (doc_id, rating, token, (count / n_tokens) * idf[token])
        for token, count in cnts.items()
    )

score_columns = ["doc_id","rating","token","score"]
scores = pd.DataFrame(recs, columns=score_columns)
print(scores.head())

   doc_id  rating token     score
0       1       5  love  0.234438
1       1       5  this  0.256567
2       1       5   app  0.181119
3       1       5   use  0.287757
4       1       5    it  0.287757


# 12. Get the highest-scoring tokens for high-rated and low-rated comments

In [24]:
def top(d, n=20):
    """
    Rank tokens by their mean score, highest first.

    Parameters:
    -----------
    d : pandas.DataFrame
        Frame with a "token" column and a "score" column
    n : int, default 20
        Maximum number of tokens to return

    Returns:
    --------
    pandas.Series
        Mean score per token, indexed by token, sorted in descending order
        and truncated to the first n entries
    """
    return d.groupby("token")["score"].mean().sort_values(ascending=False).head(n)

# Split the scores by rating band: 4-5 stars vs 1-2 stars.
# Rows rated 3 fall in neither band.
is_high = scores["rating"] >= 4
is_low = scores["rating"] <= 2

high_top = top(scores[is_high])
low_top = top(scores[is_low])

for heading, ranking in (
    ("\nTop tokens for HIGH ratings (4-5 stars):", high_top),
    ("\nTop tokens for LOW ratings (1-2 stars):", low_top),
):
    print(heading)
    print(ranking)


Top tokens for HIGH ratings (4-5 stars):
token
pay           0.667075
was           0.555896
hurt          0.415649
dark          0.415649
mode          0.415649
feature       0.415649
eyes          0.415649
request       0.415649
at            0.415649
night         0.415649
add           0.374084
convenient    0.374084
apple         0.374084
seem          0.374084
than          0.374084
remember      0.374084
paypal        0.374084
website       0.374084
very          0.374084
higher        0.374084
Name: score, dtype: float64

Top tokens for LOW ratings (1-2 stars):
token
late           1.020229
fee            0.831298
number         0.534406
arrived        0.534406
never          0.534406
refund         0.534406
sus            0.467605
fake           0.467605
all            0.467605
still          0.467605
stars          0.467605
restaurant     0.467605
reinstalled    0.467605
ratings        0.467605
feel           0.467605
loading        0.467605
stuck          0.467605
uninstall