# Data Preprocessing


In [12]:
import pandas as pd
# Location of the reviews CSV that the rest of the notebook processes.
url = "https://raw.githubusercontent.com/Pontakorn-Wich/Mini_project/master/data/books_1250_above_reviews.csv"
df = pd.read_csv(url)

# Preview the first rows. The bare `df.head()` that used to sit above this call
# was not the last expression of the cell, so its result was silently discarded.
print(df.head())

    book_id                           user_id  \
0  13526165  8842281e1d1347389f2ab93d60773d4d   
1   9938498  8842281e1d1347389f2ab93d60773d4d   
2   2767052  8842281e1d1347389f2ab93d60773d4d   
3    136251  8842281e1d1347389f2ab93d60773d4d   
4  15507958  7504b2aee1ecb5b2872d3da381c6c91e   

                          review_id  rating  \
0  51fe3e46c7f8eb39f5623d1bd8bbbbfc       5   
1  bff5654c639c7b008571c3d4398d930a       4   
2  248c011811e945eca861b5c31a549291       5   
3  132eab4c9a3724493204cc083e0e2ecc       5   
4  63ff74279e46b247cb1754313b160006       4   

                                         review_text  \
0  My wife suggested I read this book, and I resi...   
1  Great story of the US Ambassador to Germany an...   
2  I cracked and finally picked this up. Very enj...   
3  Loved every minute. So sad there isn't another...   
4  I finished reading this days ago and cant get ...   

                       date_added                    date_updated  n_votes  \
0  Thu 

## Remove non-English reviews

In [13]:
# Use the %pip magic (script form below) instead of a shell `pip` call so the
# package is installed into the environment of the running kernel.
get_ipython().run_line_magic('pip', 'install langdetect')

Defaulting to user installation because normal site-packages is not writeable
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     ---------------------------------------- 981.5/981.5 kB 13.1 MB/s  0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: langdetect
  Building wheel for langdetect (pyproject.toml): started
  Building wheel for langdetect (pyproject.toml): finished with status 'done'
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993363 sha256=6b65427445f2ccab90dd71fd5a81b1dba3b81eff5a8fa4ba083dbb98e2e89dad
  Stored in directory: c


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\kenpo\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


**Reasoning**:
The 'langdetect' library has been installed. Now, the next logical step is to use it to detect the language of the reviews in the `review_text` column. This involves importing the necessary function, defining a safe detection function to handle potential errors or empty strings, and applying it to create a new 'language' column.



In [14]:
from langdetect import detect, LangDetectException

def detect_language_safe(text):
    """Return the language code detected for ``text``, or a sentinel label.

    Returns ``'empty'`` when ``text`` is not a string or contains only
    whitespace, and ``'unknown'`` when ``detect`` raises
    ``LangDetectException``; otherwise returns whatever ``detect`` returns.
    """
    # Guard clause: there is nothing to detect on non-strings or blank strings.
    if not isinstance(text, str) or not text.strip():
        return 'empty'
    try:
        language = detect(text)
    except LangDetectException:
        language = 'unknown'
    return language

# langdetect's detector is randomised, so short or ambiguous reviews can receive
# a different label on each run. Fixing the seed before the first detection makes
# the 'language' column (and the English filter built on it) reproducible.
from langdetect import DetectorFactory
DetectorFactory.seed = 0

df['language'] = df['review_text'].apply(detect_language_safe)
print(df[['review_text', 'language']].head())

                                         review_text language
0  My wife suggested I read this book, and I resi...       en
1  Great story of the US Ambassador to Germany an...       en
2  I cracked and finally picked this up. Very enj...       en
3  Loved every minute. So sad there isn't another...       en
4  I finished reading this days ago and cant get ...       en


In [15]:
# Keep only the rows whose detected language is 'en'; .copy() makes the
# filtered frame independent of `df`.
is_english = df['language'] == 'en'
df_english = df[is_english].copy()
print(df_english[['review_text', 'language']].head())

                                         review_text language
0  My wife suggested I read this book, and I resi...       en
1  Great story of the US Ambassador to Germany an...       en
2  I cracked and finally picked this up. Very enj...       en
3  Loved every minute. So sad there isn't another...       en
4  I finished reading this days ago and cant get ...       en


In [16]:
# From here on `df` is the English-only frame (the same object as `df_english`);
# the unfiltered data is no longer reachable through `df`.
df = df_english

## Special Characters Removal


In [17]:
import re
def remove_noise(text):
    """Strip everything except ASCII letters and whitespace from ``text``.

    Digits, punctuation (including apostrophes) and any other non-letter
    characters are deleted outright. Runs of whitespace are then collapsed to
    a single space and leading/trailing whitespace is trimmed.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    return single_spaced.strip()

# Clean every review with remove_noise and keep the result next to the raw text.
noise_free_reviews = df['review_text'].apply(remove_noise)
df['cleaned_review_text'] = noise_free_reviews
print(df[['review_text', 'cleaned_review_text']].head())

                                         review_text  \
0  My wife suggested I read this book, and I resi...   
1  Great story of the US Ambassador to Germany an...   
2  I cracked and finally picked this up. Very enj...   
3  Loved every minute. So sad there isn't another...   
4  I finished reading this days ago and cant get ...   

                                 cleaned_review_text  
0  My wife suggested I read this book and I resis...  
1  Great story of the US Ambassador to Germany an...  
2  I cracked and finally picked this up Very enjo...  
3  Loved every minute So sad there isnt another I...  
4  I finished reading this days ago and cant get ...  


## Normalization


In [18]:
# Lower-case the cleaned text and normalise whitespace in one vectorised chain.
# Unlike the former per-row `re.sub` lambda, the pandas string methods pass
# missing values through instead of raising, and the cell no longer depends on
# `re` having been imported by an earlier cell.
df['normalized_text'] = (
    df['cleaned_review_text']
    .str.lower()
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

print("Text normalized (lowercase and whitespace removal).")
print(df[['cleaned_review_text', 'normalized_text']].head())

Text normalized (lowercase and whitespace removal).
                                 cleaned_review_text  \
0  My wife suggested I read this book and I resis...   
1  Great story of the US Ambassador to Germany an...   
2  I cracked and finally picked this up Very enjo...   
3  Loved every minute So sad there isnt another I...   
4  I finished reading this days ago and cant get ...   

                                     normalized_text  
0  my wife suggested i read this book and i resis...  
1  great story of the us ambassador to germany an...  
2  i cracked and finally picked this up very enjo...  
3  loved every minute so sad there isnt another i...  
4  i finished reading this days ago and cant get ...  


## Stopword Removal

In [19]:
import nltk
# Fetch the NLTK 'stopwords' corpus that stopwords.words('english') reads later on.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kenpo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
# Fetch the 'punkt_tab' resource — presumably the tokenizer data that
# word_tokenize needs; TODO confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\kenpo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [21]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stop words held in a set; remove_stopwords tests tokens for membership in it.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Tokenize ``text`` and return it with stop words dropped.

    Tokens come from ``word_tokenize``. A token is dropped when its
    lower-cased form is in the module-level ``stop_words`` set, and the
    surviving tokens are joined back together with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Drop stop words from the lower-cased text and show the result beside its input.
without_stopwords = df['normalized_text'].apply(remove_stopwords)
df['normalized_text_no_stopwords'] = without_stopwords
print(df[['normalized_text', 'normalized_text_no_stopwords']].head())

                                     normalized_text  \
0  my wife suggested i read this book and i resis...   
1  great story of the us ambassador to germany an...   
2  i cracked and finally picked this up very enjo...   
3  loved every minute so sad there isnt another i...   
4  i finished reading this days ago and cant get ...   

                        normalized_text_no_stopwords  
0  wife suggested read book resisted impression s...  
1  great story us ambassador germany experience b...  
2  cracked finally picked enjoyable quick read co...  
3  loved every minute sad isnt another thought jk...  
4  finished reading days ago cant get head heart ...  


## Tokenization


In [22]:
from nltk.tokenize import word_tokenize

def tokenize_text(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize`` and return them."""
    tokens = word_tokenize(text)
    return tokens

# Tokenize the stop-word-free text; each cell of the new column holds the
# token sequence returned by tokenize_text.
token_lists = df['normalized_text_no_stopwords'].apply(tokenize_text)
df['tokenized_text'] = token_lists

print(df[['normalized_text_no_stopwords', 'tokenized_text']].head())

                        normalized_text_no_stopwords  \
0  wife suggested read book resisted impression s...   
1  great story us ambassador germany experience b...   
2  cracked finally picked enjoyable quick read co...   
3  loved every minute sad isnt another thought jk...   
4  finished reading days ago cant get head heart ...   

                                      tokenized_text  
0  [wife, suggested, read, book, resisted, impres...  
1  [great, story, us, ambassador, germany, experi...  
2  [cracked, finally, picked, enjoyable, quick, r...  
3  [loved, every, minute, sad, isnt, another, tho...  
4  [finished, reading, days, ago, cant, get, head...  


In [23]:
# Install through the %pip magic (script form) so spaCy lands in the running
# kernel's environment rather than whichever `pip` is first on PATH.
get_ipython().run_line_magic('pip', 'install spacy')
# pip's result is not inspected here, so the message no longer claims success.
print("SpaCy install command finished; check the pip output above for errors.")

Defaulting to user installation because normal site-packages is not writeable
SpaCy library installed successfully.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\kenpo\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


**Reasoning**:
The SpaCy library has been installed. The next step, as per the subtask instructions, is to download a suitable English language model (e.g., `en_core_web_sm`) using `spacy.cli.download`.



In [24]:
import spacy

# Make sure the 'en_core_web_sm' pipeline can be loaded before later cells use it.
try:
    spacy.load('en_core_web_sm')
except OSError:
    # Loading raised OSError: treat the model as missing and download it.
    print("Downloading 'en_core_web_sm' model...")
    spacy.cli.download('en_core_web_sm')
    print("'en_core_web_sm' model downloaded successfully.")
else:
    print("'en_core_web_sm' model already downloaded.")


'en_core_web_sm' model already downloaded.


## Lemmatization

In [25]:
import spacy

# Load the 'en_core_web_sm' spaCy pipeline; lemmatize_text calls it through `nlp`.
nlp = spacy.load('en_core_web_sm')

def lemmatize_text(text):
    """Run ``text`` through the ``nlp`` pipeline and return its lemmas as a string.

    The lemma of every token is taken in document order and the lemmas are
    joined with single spaces.
    """
    return ' '.join(token.lemma_ for token in nlp(text))

# Lemmatize the lower-cased text (stop words still included), one review per call.
lemmatized_reviews = df['normalized_text'].apply(lemmatize_text)
df['lemmatized_text'] = lemmatized_reviews
print(df[['normalized_text', 'lemmatized_text']].head())

                                     normalized_text  \
0  my wife suggested i read this book and i resis...   
1  great story of the us ambassador to germany an...   
2  i cracked and finally picked this up very enjo...   
3  loved every minute so sad there isnt another i...   
4  i finished reading this days ago and cant get ...   

                                     lemmatized_text  
0  my wife suggest I read this book and I resist ...  
1  great story of the us ambassador to germany an...  
2  I crack and finally pick this up very enjoyabl...  
3  love every minute so sad there be not another ...  
4  I finish read this day ago and can not get thi...  


## POS Tagging

In [26]:
import spacy

In [27]:
import spacy

# Reloads the same 'en_core_web_sm' pipeline that the lemmatization cell already
# bound to `nlp`.
nlp = spacy.load('en_core_web_sm')

def pos_tag_text(text):
    """Return ``(token text, pos_ tag)`` pairs for every token of ``nlp(text)``.

    The result is a list of 2-tuples in document order; nothing is joined
    into a string.
    """
    tagged_tokens = []
    for token in nlp(text):
        tagged_tokens.append((token.text, token.pos_))
    return tagged_tokens

# NOTE(review): tags are computed on the lemmatized text, not on the original
# review — confirm this is intended.
pos_tags_per_review = df['lemmatized_text'].apply(pos_tag_text)
df['pos_tagged_text'] = pos_tags_per_review

print(df[['lemmatized_text', 'pos_tagged_text']].head())

                                     lemmatized_text  \
0  my wife suggest I read this book and I resist ...   
1  great story of the us ambassador to germany an...   
2  I crack and finally pick this up very enjoyabl...   
3  love every minute so sad there be not another ...   
4  I finish read this day ago and can not get thi...   

                                     pos_tagged_text  
0  [(my, PRON), (wife, NOUN), (suggest, VERB), (I...  
1  [(great, ADJ), (story, NOUN), (of, ADP), (the,...  
2  [(I, PRON), (crack, VERB), (and, CCONJ), (fina...  
3  [(love, NOUN), (every, DET), (minute, NOUN), (...  
4  [(I, PRON), (finish, VERB), (read, VERB), (thi...  


## Named Entity Recognition

In [28]:
import spacy

# Reloads the same 'en_core_web_sm' pipeline that earlier cells already bound to `nlp`.
nlp = spacy.load('en_core_web_sm')

def extract_named_entities(text):
    """Return ``(entity text, entity label)`` pairs found in ``text``.

    Entities are read from ``nlp(text).ents`` and returned as a list of
    2-tuples; the list is empty when the document has no entities.
    """
    return [(span.text, span.label_) for span in nlp(text).ents]

# NOTE(review): entities are extracted from the lower-cased, lemmatized text
# rather than the original review — confirm this is intended.
entities_per_review = df['lemmatized_text'].apply(extract_named_entities)
df['named_entities'] = entities_per_review
print(df[['lemmatized_text', 'named_entities']].head())

                                     lemmatized_text  \
0  my wife suggest I read this book and I resist ...   
1  great story of the us ambassador to germany an...   
2  I crack and finally pick this up very enjoyabl...   
3  love every minute so sad there be not another ...   
4  I finish read this day ago and can not get thi...   

                                      named_entities  
0          [(la, GPE), (seattle, GPE), (india, GPE)]  
1  [(us, GPE), (germany, GPE), (martha, PERSON), ...  
2                                [(each year, DATE)]  
3  [(every minute, TIME), (a month later, DATE), ...  
4                             [(this day ago, DATE)]  


In [29]:
# Preview the raw review text next to several of the columns derived from it.
summary_columns = [
    'review_text',
    'cleaned_review_text',
    'normalized_text_no_stopwords',
    'normalized_text',
    'lemmatized_text',
    'pos_tagged_text',
    'named_entities',
]
print(df[summary_columns].head())

                                         review_text  \
0  My wife suggested I read this book, and I resi...   
1  Great story of the US Ambassador to Germany an...   
2  I cracked and finally picked this up. Very enj...   
3  Loved every minute. So sad there isn't another...   
4  I finished reading this days ago and cant get ...   

                                 cleaned_review_text  \
0  My wife suggested I read this book and I resis...   
1  Great story of the US Ambassador to Germany an...   
2  I cracked and finally picked this up Very enjo...   
3  Loved every minute So sad there isnt another I...   
4  I finished reading this days ago and cant get ...   

                        normalized_text_no_stopwords  \
0  wife suggested read book resisted impression s...   
1  great story us ambassador germany experience b...   
2  cracked finally picked enjoyable quick read co...   
3  loved every minute sad isnt another thought jk...   
4  finished reading days ago cant get head hea

In [30]:
# # Sort the DataFrame by the length of the 'review_text' column
# df_sorted_by_review_length = df.copy()
# df_sorted_by_review_length['review_text_length'] = df_sorted_by_review_length['review_text'].apply(len)
# df_sorted_by_review_length = df_sorted_by_review_length.sort_values(by='review_text_length', ascending=True)
# print(df_sorted_by_review_length[['review_text', 'cleaned_review_text', 'normalized_text_no_stopwords', 'normalized_text', 'lemmatized_text', 'pos_tagged_text', 'named_entities']].head())

In [31]:
import os

# to_csv fails when the target directory is missing, so create ./data first.
os.makedirs("./data", exist_ok=True)
# NOTE(review): columns holding Python lists (e.g. 'tokenized_text') are presumably
# written as their string representation — confirm downstream readers expect that.
df.to_csv("./data/book_processed_output.csv", index=False)