## Data loading

In [2]:
import pandas as pd
import re

In [3]:
# Load the development-set reviews; lines=True makes pandas read one JSON record per line.
df = pd.read_json('Assignment_1_Assets/reviews_devset.json', lines=True)
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,category
0,A2VNYWOPJ13AFP,0981850006,"Amazon Customer ""carringt0n""","[6, 7]",This was a gift for my other husband. He's ma...,5,Delish,1259798400,"12 3, 2009",Patio_Lawn_and_Garde
1,A2E5XXXC07AGA7,B00002N66D,James,"[1, 1]",This is a very nice spreader. It feels very s...,5,Nice spreader,1354492800,"12 3, 2012",Patio_Lawn_and_Garde
2,A16PX63WZIEQ13,B00002N67U,Finaldx,"[0, 1]",The metal base with the hose attachments is ve...,1,Terrible spike base,1218585600,"08 13, 2008",Patio_Lawn_and_Garde
3,A2OSWM3522VARA,B00002N6AN,"Wayne Allen ""Motoring Patriot""","[0, 0]",For the most part this works pretty good. I bo...,4,gets the job done,1254355200,"10 1, 2009",Patio_Lawn_and_Garde
4,A2SX9YPPGEUADI,B00002N8K3,"HappyCamper ""Happy Housewife""","[4, 5]",This hose is supposed to be flexible. Its har...,1,The worst,1373673600,"07 13, 2013",Patio_Lawn_and_Garde


We can observe the following columns and content:
* reviewerID - string - the ID of the author of the review
* asin - string - unique product identifier
* reviewerName - string - name of the reviewer
* helpful - array of two integers [a,b] - helpfulness rating of the review: a out of b customers found the review helpful
* reviewText - string - the content of the review; this is the text to be processed
* overall - float - rating given to product asin by reviewer reviewerID
* summary - string - the title of the review
* unixReviewTime - integer - timestamp of when review was created in UNIX format
* reviewTime - string - date when review was created in human readable format
* category - string - the category that the product belongs to


### Tokenization
Now, we will tokenize all words using the following delimiters:
* whitespaces
* tabs
* digits
* characters ()[]{}.!?,;:+=-_"'`~#@&*%€$§\/

We will use a function that converts the text to lowercase and splits it into tokens at these delimiters.

In [4]:
# Work on a copy so that token columns can be added without modifying the loaded frame `df`.
df_tokenized = df.copy()

# Delimiter characters listed in the Tokenization section, written literally;
# re.escape makes each one safe inside the character class built below.
_TOKEN_DELIMITERS = "()[]{}.!?,;:+=-_\"'`~#@&*%€$§\\/"

# Compiled once: any run of whitespace (including tabs), digits or delimiters.
_TOKEN_SPLIT_RE = re.compile(rf"[\s\d{re.escape(_TOKEN_DELIMITERS)}]+")


def tokenize_text(text):
    """Lowercase ``text`` and split it into tokens at the specified delimiters.

    A token boundary is any run of whitespace, digits, or one of the
    characters in ``_TOKEN_DELIMITERS``. Hyphens, underscores, apostrophes
    and digits therefore separate tokens instead of being kept inside them.

    Parameters
    ----------
    text : str or missing value
        Raw text of one review field. Missing values (None / NaN) are allowed.

    Returns
    -------
    list of str
        Lowercase, non-empty tokens in their order of appearance; an empty
        list for missing or empty input.
    """
    if pd.isna(text):
        return []

    # re.split yields '' when the text starts or ends with a delimiter run;
    # those empty pieces are dropped.
    return [token for token in _TOKEN_SPLIT_RE.split(str(text).lower()) if token]

# Add a `<column>_tokens` column for every free-text field of the reviews.
text_columns = ['reviewText', 'summary']
for col in text_columns:
    df_tokenized[f'{col}_tokens'] = df[col].apply(tokenize_text)

In [5]:
# Show the first two rows, including the *_tokens columns added to df_tokenized.
df_tokenized.head(2)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,category,reviewText_tokens,summary_tokens
0,A2VNYWOPJ13AFP,0981850006,"Amazon Customer ""carringt0n""","[6, 7]",This was a gift for my other husband. He's ma...,5,Delish,1259798400,"12 3, 2009",Patio_Lawn_and_Garde,"[this, was, a, gift, for, my, other, husband, ...",[delish]
1,A2E5XXXC07AGA7,B00002N66D,James,"[1, 1]",This is a very nice spreader. It feels very s...,5,Nice spreader,1354492800,"12 3, 2012",Patio_Lawn_and_Garde,"[this, is, a, very, nice, spreader, it, feels,...","[nice, spreader]"


### Stopwords filtering

Now, we will filter out stopwords that are contained in the stopwords.txt file. Further, we will filter out all tokens consisting of one character.

In [6]:
df_tokenized = df.copy()

# Load stopwords (one per line) into a set for fast membership tests.
# The explicit UTF-8 encoding keeps the result independent of the platform's
# default encoding; blank lines are skipped so '' never enters the set.
with open('Assignment_1_Assets/stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = {word for word in (line.strip() for line in f) if word}

# Improved tokenization function with filtering
# Delimiter characters from the assignment text, written literally (no manual
# regex escaping); re.escape below makes them safe inside a character class.
_FILTER_DELIMITER_CHARS = "()[]{}.!?,;:+=-_\"'`~#@&*%€$§\\/"

# Compiled once instead of on every row: any run of whitespace, digits or
# delimiter characters acts as a single separator.
_FILTER_SPLIT_RE = re.compile(rf"[\s\d{re.escape(_FILTER_DELIMITER_CHARS)}]+")


def tokenize_and_filter(text, stopword_set=None):
    """Tokenize ``text`` and drop stopwords and one-character tokens.

    The text is lowercased and split at every run of whitespace, digits or
    characters from ``_FILTER_DELIMITER_CHARS``.

    Parameters
    ----------
    text : str or missing value
        Raw text of one review field. Missing values (None / NaN) are allowed.
    stopword_set : collection of str, optional
        Tokens to remove. When omitted, the module-level ``stopwords`` set is
        used, which matches the original single-argument behaviour.

    Returns
    -------
    list of str
        Remaining tokens in their order of appearance; an empty list for
        missing input.
    """
    if pd.isna(text):
        return []

    if stopword_set is None:
        # Resolved at call time so the set loaded in the notebook is used.
        stopword_set = stopwords

    # len(token) > 1 also discards the empty strings re.split produces when
    # the text starts or ends with a delimiter run.
    return [
        token
        for token in _FILTER_SPLIT_RE.split(str(text).lower())
        if len(token) > 1 and token not in stopword_set
    ]

# Tokenize and filter each free-text column, skipping any that is absent
text_columns = ['reviewText', 'summary']
for col in text_columns:
    # Guard first: report a missing column and move on to the next one
    if col not in df.columns:
        print(f"Warning: Column '{col}' not found in DataFrame.")
        continue
    df_tokenized[col+'_tokens'] = df[col].apply(tokenize_and_filter)

In [9]:
# Plain-text preview of the first two rows of the filtered token columns.
print(df_tokenized.head(2))
# Take an independent copy of the tokenized frame before exporting it.
dataframe_preprocessed = df_tokenized.copy()
# Export only the review tokens and the category label, without the index column.
# NOTE(review): the token lists are presumably written as their Python list repr — confirm that whatever reads chi_input.csv parses that form.
dataframe_preprocessed.to_csv('chi_input.csv', index=False, columns = ['reviewText_tokens', 'category'])

       reviewerID        asin                  reviewerName helpful  \
0  A2VNYWOPJ13AFP  0981850006  Amazon Customer "carringt0n"  [6, 7]   
1  A2E5XXXC07AGA7  B00002N66D                         James  [1, 1]   

                                          reviewText  overall        summary  \
0  This was a gift for my other husband.  He's ma...        5         Delish   
1  This is a very nice spreader.  It feels very s...        5  Nice spreader   

   unixReviewTime  reviewTime              category  \
0      1259798400  12 3, 2009  Patio_Lawn_and_Garde   
1      1354492800  12 3, 2012  Patio_Lawn_and_Garde   

                                   reviewText_tokens    summary_tokens  
0  [gift, husband, making, things, time, love, fo...          [delish]  
1  [nice, spreader, feels, solid, pneumatic, tire...  [nice, spreader]  
