In [1]:
# The following code is only for Google Colab.
# If you are running this notebook locally, you should not run this cell.

# %pip install google-colab

# from google.colab import drive
# drive.mount('/content/drive/')

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import zipfile
from pathlib import Path
from nltk.tokenize import RegexpTokenizer

In [2]:
# `__file__` only exists when this code runs as a script; inside a notebook
# kernel it is missing, so fall back to the current working directory.
current_directory = Path.cwd() if '__file__' not in locals() else Path(__file__).parent
print(f"Current directory: {current_directory}")

# Unpack the bundled data.zip into that same directory.
archive_path = f"{current_directory}/data.zip"
with zipfile.ZipFile(archive_path, 'r') as archive:
    archive.extractall(current_directory)

Current directory: d:\RMIT\SEM B - 2025\Data\ASM3\ASM3-APDS


In [3]:
# Load the raw review dataset; the path is relative to the working directory.
df = pd.read_csv('data/assignment3.csv')
# Bare last expression: the notebook renders the frame as the cell output.
df

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
1,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
2,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses
3,1080,49,Not for the very petite,"I love tracy reese dresses, but this one is no...",2,0,4,General,Dresses,Dresses
4,858,39,Cagrcoal shimmer fun,I aded this in my basket at hte last mintue to...,5,1,1,General Petite,Tops,Knits
...,...,...,...,...,...,...,...,...,...,...
19657,1104,34,Great dress for many occasions,I was very happy to snag this dress at such a ...,5,1,0,General Petite,Dresses,Dresses
19658,862,48,Wish it was made of cotton,"It reminds me of maternity clothes. soft, stre...",3,1,0,General Petite,Tops,Knits
19659,1104,31,"Cute, but see through","This fit well, but the top was very see throug...",3,0,1,General Petite,Dresses,Dresses
19660,1084,28,"Very cute dress, perfect for summer parties an...",I bought this dress for a wedding i have this ...,3,1,2,General,Dresses,Dresses


In [4]:
# 2/3/4. Tokenizing clothing review

# A token is a run of ASCII letters, optionally followed by ONE hyphen or
# apostrophe plus another run of letters (so "it's" stays a single token).
tokenizer = RegexpTokenizer(r"[a-zA-Z]+(?:[-'][a-zA-Z]+)?")


def _tokenize_review(text):
    """Split one review into lowercase tokens, dropping tokens of length 1."""
    kept = []
    for token in tokenizer.tokenize(text):
        if len(token) > 1:
            kept.append(token.lower())
    return kept


# Missing reviews become empty strings, which tokenize to an empty list.
review_text = df["Review Text"].fillna("").astype(str)
df["tokens"] = review_text.apply(_tokenize_review)

# quick spot-checks (delete when done)
print(df["tokens"].head(3).tolist())
assert isinstance(df["tokens"].iloc[0], list)

[['had', 'such', 'high', 'hopes', 'for', 'this', 'dress', 'and', 'really', 'wanted', 'it', 'to', 'work', 'for', 'me', 'initially', 'ordered', 'the', 'petite', 'small', 'my', 'usual', 'size', 'but', 'found', 'this', 'to', 'be', 'outrageously', 'small', 'so', 'small', 'in', 'fact', 'that', 'could', 'not', 'zip', 'it', 'up', 'reordered', 'it', 'in', 'petite', 'medium', 'which', 'was', 'just', 'ok', 'overall', 'the', 'top', 'half', 'was', 'comfortable', 'and', 'fit', 'nicely', 'but', 'the', 'bottom', 'half', 'had', 'very', 'tight', 'under', 'layer', 'and', 'several', 'somewhat', 'cheap', 'net', 'over', 'layers', 'imo', 'major', 'design', 'flaw', 'was', 'the', 'net', 'over', 'layer', 'sewn', 'directly', 'into', 'the', 'zipper', 'it'], ['love', 'love', 'love', 'this', 'jumpsuit', "it's", 'fun', 'flirty', 'and', 'fabulous', 'every', 'time', 'wear', 'it', 'get', 'nothing', 'but', 'great', 'compliments'], ['this', 'shirt', 'is', 'very', 'flattering', 'to', 'all', 'due', 'to', 'the', 'adjustable

In [5]:
# 5. Remove stopwords using the provided stop words list (i.e., stopwords_en.txt).
# It is located inside the same downloaded folder.

# Look for the stop-word file in the working directory first, then under data/.
stop_candidates = [Path("stopwords_en.txt"), Path("data/stopwords_en.txt")]
stop_path = None
for candidate in stop_candidates:
    if candidate.exists():
        stop_path = candidate
        break
assert stop_path is not None, "stopwords_en.txt not found. Please put it in project root or inside data/."

# Every non-blank line (whitespace-stripped) becomes one entry of the set.
with open(stop_path, "r", encoding="utf-8") as stop_file:
    stripped_lines = (raw_line.strip() for raw_line in stop_file)
    STOPWORDS = {entry for entry in stripped_lines if entry}

print(f"[Q5] Loaded {len(STOPWORDS)} stopwords from {stop_path}")

# Stop-word filter: each token is lowercased only for the membership test.
def remove_stopwords(tokens, stopset):
    """Return the tokens whose lowercase form is not in `stopset`.

    The original order and the original casing of the kept tokens are preserved.
    """
    kept = []
    for token in tokens:
        if token.lower() in stopset:
            continue
        kept.append(token)
    return kept

# Filter every review's token list; `stopset` is forwarded to remove_stopwords
# as a keyword argument by Series.apply.
df["tokens"] = df["tokens"].apply(remove_stopwords, stopset=STOPWORDS)

# Print to check
print(df[["tokens"]].head(2))
print("df['tokens'] ready for Step 6 & 7.")

[Q5] Loaded 570 stopwords from data\stopwords_en.txt
                                              tokens
0  [high, hopes, dress, wanted, work, initially, ...
1  [love, love, love, jumpsuit, fun, flirty, fabu...
df['tokens'] ready for Step 6 & 7.


In [6]:
# 6. Remove the word that appears only once in the document collection, based on term frequency.
# Term frequency here is the total number of occurrences of a word across all reviews.
term_freq = pd.Series(np.concatenate(df["tokens"].values)).value_counts()

# Collect the words that occur more than once into a set ONCE, so the per-token
# filter is a cheap set lookup instead of a pandas Series lookup per token.
frequent_words = set(term_freq.index[term_freq > 1])
df["tokens"] = df["tokens"].apply(lambda tokens: [token for token in tokens if token in frequent_words])

# Double check: recompute on the filtered tokens; no remaining count should be 1.
term_freq = pd.Series(np.concatenate(df["tokens"].values)).value_counts()
term_freq

dress          9334
size           7860
love           7722
fit            6582
top            6542
               ... 
theatre           2
flatttering       2
cutie             2
exacerbated       2
thoughtful        2
Name: count, Length: 7549, dtype: int64

In [7]:

# 7. Remove the top 20 most frequent words based on document frequency.
# Document frequency is the number of reviews a word appears in, so each word
# must be counted at most once per review. dict.fromkeys de-duplicates a token
# list while keeping first-seen order, so the input order does not depend on
# set hashing.
unique_tokens_per_doc = df["tokens"].apply(lambda tokens: list(dict.fromkeys(tokens)))
doc_freq = pd.Series(np.concatenate(unique_tokens_per_doc.values)).value_counts()

# Use a set for the membership test inside the per-token filter below.
top_20_words = set(doc_freq.nlargest(20).index)
df["tokens"] = df["tokens"].apply(lambda tokens: [token for token in tokens if token not in top_20_words])

# Double check: term frequencies of what is left after the removal.
term_freq = pd.Series(np.concatenate(df["tokens"].values)).value_counts()
term_freq

fits          2541
beautiful     2491
large         2485
material      2438
length        2366
              ... 
suspected        2
rhinestone       2
names            2
themed           2
film             2
Name: count, Length: 7529, dtype: int64

In [8]:
# Preview the cleaned token lists as the cell output.
df["tokens"]

0        [high, hopes, wanted, work, initially, petite,...
1        [jumpsuit, fun, flirty, fabulous, time, compli...
2        [shirt, due, adjustable, front, tie, length, l...
3        [tracy, reese, dresses, petite, feet, tall, br...
4        [basket, hte, person, store, pick, teh, pale, ...
                               ...                        
19657         [happy, snag, price, easy, slip, cut, combo]
19658    [reminds, maternity, clothes, stretchy, shiny,...
19659                 [worked, glad, store, order, online]
19660    [wedding, summer, medium, fits, waist, perfect...
19661    [lovely, feminine, fits, perfectly, easy, comf...
Name: tokens, Length: 19662, dtype: object

In [9]:
# Save the processed frame without the index column. The `tokens` column holds
# Python lists, which CSV stores as text; the Task 2 cell that re-reads
# processed.csv parses that text back into lists with ast.literal_eval.
df.to_csv("processed.csv", index=False)

In [10]:
# 9. Build a vocabulary of the cleaned/processed reviews, and save it in a txt file (please refer to the
# Required Output section);

# Unique words over all reviews, in Python's default string sort order; each
# word's index is its position in that sorted list.
vocabulary = sorted(set(np.concatenate(df["tokens"].values)))
vocab_dict = dict(zip(vocabulary, range(len(vocabulary))))

# One "word:index" entry per line.
with open("vocabulary.txt", "w", encoding="utf-8") as vocab_file:
    vocab_file.writelines(f"{word}:{index}\n" for word, index in vocab_dict.items())


### Task 2: Generating Feature Representations for Clothing Reviews


##### Bag-of-words model


In [None]:
# === Task 2: Bag-of-Words ===
from pathlib import Path

#  — Load vocabulary (from Task 1) —
VOCAB_PATHS = [
    Path("vocabulary.txt"),
]
# Take the first candidate path that exists on disk.
VOCAB_PATH = None
for candidate in VOCAB_PATHS:
    if candidate.exists():
        VOCAB_PATH = candidate
        break
# The assertion message is Vietnamese for "vocab not found".
assert VOCAB_PATH is not None, "Không tìm thấy vocab (vocab.txt / vocabulary.txt)."

# Each non-blank line is "word:index"; split on the LAST colon so a word that
# itself contains ":" would still parse.
with open(VOCAB_PATH, "r", encoding="utf-8") as vocab_file:
    entries = [raw_line.strip() for raw_line in vocab_file]
word2idx = {
    word: int(index_text)
    for word, index_text in (entry.rsplit(":", 1) for entry in entries if entry)
}

print(f"[Vocab] Loaded {len(word2idx)} words from {VOCAB_PATH}")


[Vocab] Loaded 7529 words from vocabulary.txt


In [18]:
# === Task 2: Choose clean ===
# Reuse the in-memory `df` when it exists; otherwise fall back to processed.csv,
# and fail loudly when neither is available.
if "df" not in globals():
    if not Path("processed.csv").exists():
        raise RuntimeError("df is not defined. Run task1 cell first.")
    df = pd.read_csv("processed.csv")

# Set name tokens: use the first candidate column that is present in df.
CANDIDATE_TOKEN_COLS = ["tokens"]
SOURCE_COL = None
for column_name in CANDIDATE_TOKEN_COLS:
    if column_name in df.columns:
        SOURCE_COL = column_name
        break
# The assertion message is Vietnamese for "tokens column not found; expected one of: ...".
assert SOURCE_COL is not None, f"Không thấy cột tokens. Kỳ vọng một trong: {CANDIDATE_TOKEN_COLS}"

# make sure tokens are a list
def _ensure_list(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        try:
            import ast
            val = ast.literal_eval(x)
            return val if isinstance(val, list) else [x]
        except Exception:
            return x.split() 
    return []

# Coerce every cell of the token column to a list, then renumber the rows 0..n-1.
df[SOURCE_COL] = df[SOURCE_COL].map(_ensure_list)
df = df.reset_index(drop=True)

print(f"[BOW] Using tokens from column: {SOURCE_COL}")
print(df[SOURCE_COL].head(2).tolist())


[BOW] Using tokens from column: tokens
[['high', 'hopes', 'wanted', 'work', 'initially', 'petite', 'usual', 'found', 'outrageously', 'fact', 'zip', 'reordered', 'petite', 'medium', 'half', 'nicely', 'bottom', 'half', 'tight', 'layer', 'cheap', 'net', 'layers', 'imo', 'major', 'design', 'flaw', 'net', 'layer', 'sewn', 'directly', 'zipper'], ['jumpsuit', 'fun', 'flirty', 'fabulous', 'time', 'compliments']]


In [None]:
# === Task 2: Generate sparse Count Vectors -> count_vectors.txt ===
from collections import Counter

out_path = Path("count_vectors.txt")

def tokens_to_sparse_counts(tokens, word2idx):
    """Encode one token list as a sparse count string "idx:count,idx:count,...".

    Non-string items are ignored, tokens are lowercased before lookup, and
    tokens missing from `word2idx` are skipped. Entries are ordered by
    ascending index. Returns "" when no token maps to a vocabulary index.
    """
    lowered = (tok.lower() for tok in tokens if isinstance(tok, str))
    indices = (word2idx.get(tok) for tok in lowered)
    # Compare against None explicitly: index 0 is a valid vocabulary index.
    tally = Counter(idx for idx in indices if idx is not None)
    return ",".join(f"{idx}:{count}" for idx, count in sorted(tally.items()))

# One line per review: "#<row number>,<sparse counts>". The counts part is
# empty when none of the review's tokens are in the vocabulary.
with open(out_path, "w", encoding="utf-8") as fout:
    fout.writelines(
        f"#{row_no},{tokens_to_sparse_counts(doc_tokens, word2idx)}\n"
        for row_no, doc_tokens in enumerate(df[SOURCE_COL])
    )

print(f"[BOW] Wrote {len(df)} lines to {out_path.resolve()}")
# Print to check some first rows
with open(out_path, "r", encoding="utf-8") as f:
    preview = [f.readline().rstrip() for _ in range(3)]
for preview_line in preview:
    print(preview_line)


[BOW] Wrote 19662 lines to D:\RMIT\SEM B - 2025\Data\ASM3\ASM3-APDS\count_vectors.txt
#0,686:1,1027:1,1715:1,1791:1,2288:1,2481:1,2602:1,2892:2,3010:1,3087:1,3193:1,3258:1,3549:2,3552:1,3832:1,3934:1,4224:2,4234:1,4427:1,4639:2,5260:1,5668:1,6726:1,7092:1,7207:1,7406:1,7520:1,7522:1
#1,1286:1,2283:1,2502:1,2667:1,3403:1,6739:1
#2,86:1,924:1,1987:1,2646:1,3584:1,3595:1,4506:1,5736:2,5924:1,6716:1


##### Sanity checks

In [None]:
# === Optional sanity checks ===
# 1) vocab index must be from 0..N-1
N = len(word2idx)
ok_index_set = set(word2idx.values()) == set(range(N))
print("[Check] Vocab indices contiguous 0..N-1:", ok_index_set)

# 2) Every index in count_vectors.txt < N
def _line_has_invalid_entry(line, vocab_size):
    """Return True if any "idx:count" entry on the line has idx >= vocab_size or count <= 0.

    Everything up to the first comma (the "#<row>" prefix) is ignored; a line
    with nothing after that comma has no entries and is considered valid.
    """
    _, _, payload = line.strip().partition(",")
    if not payload:
        return False
    for entry in payload.split(","):
        index_text, count_text = entry.split(":")
        if int(index_text) >= vocab_size or int(count_text) <= 0:
            return True
    return False

# any() stops reading at the first offending line.
with open("count_vectors.txt", "r", encoding="utf-8") as vectors_file:
    bad = any(_line_has_invalid_entry(row, N) for row in vectors_file)
print("[Check] All indices valid and counts > 0:", not bad)


[Check] Vocab indices contiguous 0..N-1: True
[Check] All indices valid and counts > 0: True
