# Data Preprocessing


In [None]:
import pandas as pd
# Raw reviews CSV, fetched over HTTPS from the project's GitHub repository.
url = "https://raw.githubusercontent.com/Pontakorn-Wich/Mini_project/master/data/books_1250_above_reviews.csv"

# Read the whole CSV into the working DataFrame used by every later cell.
df = pd.read_csv(url)

# Preview the first rows. (The earlier bare `df.head()` was dropped: it was not
# the last expression of the cell, so its value was computed and discarded.)
print(df.head())

    book_id                           user_id  \
0  25781157  d37b46b2190ed7c518259f29b47a9b36   
1  18774964  d37b46b2190ed7c518259f29b47a9b36   
2  12609433  d4b1dcb35db677f20ee45225a5e43be2   
3  18774964  b7dbd4518192923079be19c74e049608   
4  18774964  ced7b8e0a3340e8af27f2663f442c3bb   

                          review_id  rating  \
0  c159507f6f0c4010bcfcda6cee74a817       2   
1  3ab2b07073b3bd4134f1a1e8b0053b7f       2   
2  e95e9327ce1de178d99aebd20d3f80fb       3   
3  e5b3bbef0c8990dda81ff6f4885f62bd       5   
4  1da5c13281b915f7aa154d7ea768b654       5   

                                         review_text  \
0  This book has gotten many accolades but I foun...   
1  This is a story about a cantankerous and sad w...   
2  3.5 stars. Interesting, highly readable. I jus...   
3                      Loved, loved loved this book.   
4  This is possibly the best book I have ever rea...   

                       date_added                    date_updated  n_votes  \
0  Fri 

## Remove non-English reviews

In [None]:
# Install langdetect into the *kernel's* environment (%pip rather than !pip) and
# pin the version so a fresh run resolves the same release that produced the
# results in this notebook.
get_ipython().run_line_magic('pip', 'install langdetect==1.0.9')

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m47.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=c10c038518ac042824dc5f3eaf4a57660df9855e228d7fff546548df60ac59d8
  Stored in directory: /root/.cache/pip/wheels/c1/67/88/e844b5b022812e15a52e4eaa38a1e709e99f06f6639d7e3ba7
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9
langdetect li

**Reasoning**:
The 'langdetect' library has been installed. Now, the next logical step is to use it to detect the language of the reviews in the `review_text` column. This involves importing the necessary function, defining a safe detection function to handle potential errors or empty strings, and applying it to create a new 'language' column.



In [None]:
from langdetect import DetectorFactory, LangDetectException, detect

# langdetect's algorithm is randomized: short or ambiguous texts can be
# labelled differently from run to run. Fixing the seed before the first
# detection makes the 'language' column (and the English filter built on it)
# reproducible.
DetectorFactory.seed = 0


def detect_language_safe(text):
    """Detect the language of a single review without raising.

    Args:
        text: Value from the `review_text` column; may be a non-string
            (e.g. a missing value) or a blank string.

    Returns:
        The language code returned by `langdetect.detect` for usable text,
        'unknown' when langdetect raises `LangDetectException`, and 'empty'
        when `text` is not a string or contains only whitespace.
    """
    # Guard first: langdetect is only called on non-blank strings.
    if not isinstance(text, str) or not text.strip():
        return 'empty'
    try:
        return detect(text)
    except LangDetectException:
        return 'unknown'


df['language'] = df['review_text'].apply(detect_language_safe)
print(df[['review_text', 'language']].head())

Language detection applied to 'review_text' column.
                                         review_text language
0  This book has gotten many accolades but I foun...       en
1  This is a story about a cantankerous and sad w...       en
2  3.5 stars. Interesting, highly readable. I jus...       en
3                      Loved, loved loved this book.       en
4  This is possibly the best book I have ever rea...       en


In [None]:
# Boolean mask marking the rows whose detected language code is exactly 'en'.
is_english = df['language'] == 'en'

# .copy() gives an independent frame, so adding columns later does not write
# into a slice of the original DataFrame.
df_english = df.loc[is_english].copy()
print(df_english[['review_text', 'language']].head())

Filtered DataFrame to include only English reviews.
                                         review_text language
0  This book has gotten many accolades but I foun...       en
1  This is a story about a cantankerous and sad w...       en
2  3.5 stars. Interesting, highly readable. I jus...       en
3                      Loved, loved loved this book.       en
4  This is possibly the best book I have ever rea...       en


In [None]:
# Rebind `df` to the English-only frame. This is a plain name binding, not a
# copy: `df` and `df_english` now refer to the same DataFrame object, so the
# columns added to `df` in later cells are visible through both names.
df = df_english

Original DataFrame updated to contain only English reviews.
                                         review_text language
0  This book has gotten many accolades but I foun...       en
1  This is a story about a cantankerous and sad w...       en
2  3.5 stars. Interesting, highly readable. I jus...       en
3                      Loved, loved loved this book.       en
4  This is possibly the best book I have ever rea...       en


## Special Characters Removal


In [None]:
import re
def remove_noise(text):
    """Strip everything except ASCII letters and whitespace from `text`.

    Digits, punctuation and any non-ASCII characters are deleted outright (not
    replaced by a space), so "don't" becomes "dont". Runs of whitespace are
    then collapsed to a single space and the ends are trimmed.

    Args:
        text: The raw review string.

    Returns:
        The cleaned string; empty if `text` contained no ASCII letters.
    """
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', text)
    single_spaced = re.sub(r'\s+', ' ', letters_and_spaces)
    return single_spaced.strip()

# Clean every review with remove_noise and keep the result alongside the raw
# text so the two can be compared side by side.
cleaned_reviews = df['review_text'].apply(remove_noise)
df['cleaned_review_text'] = cleaned_reviews
print(df[['review_text', 'cleaned_review_text']].head())

Noise removal applied to 'review_text' column.
                                         review_text  \
0  This book has gotten many accolades but I foun...   
1  This is a story about a cantankerous and sad w...   
2  3.5 stars. Interesting, highly readable. I jus...   
3                      Loved, loved loved this book.   
4  This is possibly the best book I have ever rea...   

                                 cleaned_review_text  
0  This book has gotten many accolades but I foun...  
1  This is a story about a cantankerous and sad w...  
2  stars Interesting highly readable I just felt ...  
3                        Loved loved loved this book  
4  This is possibly the best book I have ever rea...  


## Normalization


In [None]:
def _squeeze_whitespace(text):
    """Collapse each run of whitespace in `text` to one space and trim the ends."""
    return re.sub(r'\s+', ' ', text).strip()


# Lowercase first, then tidy whitespace; the result is stored in one assignment.
lowercased_reviews = df['cleaned_review_text'].str.lower()
df['normalized_text'] = lowercased_reviews.apply(_squeeze_whitespace)

print("Text normalized (lowercase and whitespace removal).")
print(df[['cleaned_review_text', 'normalized_text']].head())

Text normalized (lowercase and whitespace removal).
                                 cleaned_review_text  \
0  This book has gotten many accolades but I foun...   
1  This is a story about a cantankerous and sad w...   
2  stars Interesting highly readable I just felt ...   
3                        Loved loved loved this book   
4  This is possibly the best book I have ever rea...   

                                     normalized_text  
0  this book has gotten many accolades but i foun...  
1  this is a story about a cantankerous and sad w...  
2  stars interesting highly readable i just felt ...  
3                        loved loved loved this book  
4  this is possibly the best book i have ever rea...  


## Stopword Removal

In [None]:
import nltk
# Fetch the NLTK 'stopwords' corpus; it backs the stopwords.words('english')
# lookup used in the stopword-removal cell below.
nltk.download('stopwords')

'stopwords' corpus downloaded successfully.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Fetch the NLTK 'punkt_tab' tokenizer resource.
# NOTE(review): presumably this is the data word_tokenize needs at runtime
# (the cell output mentions an error message requesting it) - confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


'punkt_tab' resource downloaded successfully as per error message.


In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Build the English stopword lookup once; a set gives constant-time membership
# checks inside remove_stopwords.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)


def remove_stopwords(text):
    """Drop English stopwords from `text`.

    The text is split with `word_tokenize`; a token is kept when its lowercase
    form is not in `stop_words`.

    Args:
        text: The string to filter.

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept_tokens = [
        token
        for token in word_tokenize(text)
        if token.lower() not in stop_words
    ]
    return ' '.join(kept_tokens)


normalized_reviews = df['normalized_text']
df['normalized_text_no_stopwords'] = normalized_reviews.apply(remove_stopwords)
print(df[['normalized_text', 'normalized_text_no_stopwords']].head())

Stopwords removed from normalized_text' column.
                                     normalized_text  \
0  this book has gotten many accolades but i foun...   
1  this is a story about a cantankerous and sad w...   
2  stars interesting highly readable i just felt ...   
3                        loved loved loved this book   
4  this is possibly the best book i have ever rea...   

                        normalized_text_no_stopwords  
0  book gotten many accolades found snooze fest f...  
1  story cantankerous sad widower lost live boist...  
2  stars interesting highly readable felt example...  
3                             loved loved loved book  
4  possibly best book ever read giggling first pa...  


## Tokenization


In [None]:
from nltk.tokenize import word_tokenize

def tokenize_text(text):
    """Split `text` into a list of tokens using NLTK's `word_tokenize`."""
    tokens = word_tokenize(text)
    return tokens


# Tokenize the stopword-free text, so each row holds a list of content words.
stopword_free_reviews = df['normalized_text_no_stopwords']
df['tokenized_text'] = stopword_free_reviews.apply(tokenize_text)

print(df[['normalized_text_no_stopwords', 'tokenized_text']].head())

Text tokenization applied to 'normalized_text' column.
                        normalized_text_no_stopwords  \
0  book gotten many accolades found snooze fest f...   
1  story cantankerous sad widower lost live boist...   
2  stars interesting highly readable felt example...   
3                             loved loved loved book   
4  possibly best book ever read giggling first pa...   

                                      tokenized_text  
0  [book, gotten, many, accolades, found, snooze,...  
1  [story, cantankerous, sad, widower, lost, live...  
2  [stars, interesting, highly, readable, felt, e...  
3                        [loved, loved, loved, book]  
4  [possibly, best, book, ever, read, giggling, f...  


In [None]:
import importlib.util

get_ipython().system('pip install spacy')

# The shell call does not raise when pip fails, so confirm the package is
# importable from this kernel before reporting success.
importlib.invalidate_caches()  # make a freshly installed package discoverable
if importlib.util.find_spec('spacy') is None:
    raise RuntimeError(
        "spaCy is not importable after 'pip install spacy'; check the pip output above."
    )
print("SpaCy library installed successfully.")

SpaCy library installed successfully.


**Reasoning**:
The SpaCy library has been installed. The next step, as per the subtask instructions, is to download a suitable English language model (e.g., `en_core_web_sm`) using `spacy.cli.download`.



In [None]:
import spacy

# Probe for the small English model by attempting to load it; spacy.load raises
# OSError when the model package is not installed, in which case it is fetched.
try:
    spacy.load('en_core_web_sm')
except OSError:
    print("Downloading 'en_core_web_sm' model...")
    spacy.cli.download('en_core_web_sm')
    print("'en_core_web_sm' model downloaded successfully.")
else:
    print("'en_core_web_sm' model already downloaded.")


'en_core_web_sm' model already downloaded.


## Lemmatization

In [None]:
import spacy

# Load the spaCy pipeline that the functions below call as `nlp`.
nlp = spacy.load('en_core_web_sm')


def lemmatize_text(text):
    """Return the lemmas of every token in `text`, joined by single spaces.

    Args:
        text: The string to run through the spaCy pipeline.

    Returns:
        One string containing each token's `lemma_`, in document order.
    """
    return ' '.join(token.lemma_ for token in nlp(text))


# Lemmatize the normalized text (stopwords still present), one review at a time.
normalized_reviews = df['normalized_text']
df['lemmatized_text'] = normalized_reviews.apply(lemmatize_text)
print(df[['normalized_text', 'lemmatized_text']].head())

Lemmatization applied to 'normalized_text' column.
                                     normalized_text  \
0  this book has gotten many accolades but i foun...   
1  this is a story about a cantankerous and sad w...   
2  stars interesting highly readable i just felt ...   
3                        loved loved loved this book   
4  this is possibly the best book i have ever rea...   

                                     lemmatized_text  
0  this book have get many accolade but I find it...  
1  this be a story about a cantankerous and sad w...  
2  star interesting highly readable I just feel s...  
3                           love love love this book  
4  this be possibly the good book I have ever rea...  


## POS Tagging

In [None]:
import spacy

SpaCy library imported successfully.


In [None]:
import spacy

# NOTE(review): this loads the same model a second time; the lemmatization cell
# already bound it to `nlp`.
nlp = spacy.load('en_core_web_sm')


def pos_tag_text(text):
    """Tag every token in `text` with its coarse part of speech.

    Args:
        text: The string to run through the spaCy pipeline.

    Returns:
        A list of `(token_text, pos_tag)` tuples in document order (a list,
        not a joined string).
    """
    return [(token.text, token.pos_) for token in nlp(text)]


# NOTE(review): tags are computed on the already-lemmatized text rather than on
# the original sentences - confirm this is the intended input.
lemmatized_reviews = df['lemmatized_text']
df['pos_tagged_text'] = lemmatized_reviews.apply(pos_tag_text)

print(df[['lemmatized_text', 'pos_tagged_text']].head())

POS tagging applied to 'lemmatized_text' column.
                                     lemmatized_text  \
0  this book have get many accolade but I find it...   
1  this be a story about a cantankerous and sad w...   
2  star interesting highly readable I just feel s...   
3                           love love love this book   
4  this be possibly the good book I have ever rea...   

                                     pos_tagged_text  
0  [(this, DET), (book, NOUN), (have, AUX), (get,...  
1  [(this, PRON), (be, AUX), (a, DET), (story, NO...  
2  [(star, PROPN), (interesting, ADJ), (highly, A...  
3  [(love, NOUN), (love, NOUN), (love, VERB), (th...  
4  [(this, PRON), (be, AUX), (possibly, ADV), (th...  


## Named Entity Recognition

In [None]:
import spacy

# NOTE(review): the model is loaded again here although earlier cells already
# bound it to `nlp`.
nlp = spacy.load('en_core_web_sm')


def extract_named_entities(text):
    """Collect the named entities spaCy finds in `text`.

    Args:
        text: The string to run through the spaCy pipeline.

    Returns:
        A list of `(entity_text, entity_label)` tuples, one per span in
        `doc.ents`; empty when no entity is found.
    """
    return [(span.text, span.label_) for span in nlp(text).ents]


# NOTE(review): entities are extracted from the lowercased, lemmatized text -
# confirm this is the intended input rather than the original review text.
lemmatized_reviews = df['lemmatized_text']
df['named_entities'] = lemmatized_reviews.apply(extract_named_entities)
print(df[['lemmatized_text', 'named_entities']].head())

Named Entity Recognition applied to 'lemmatized_text' column.
                                     lemmatized_text      named_entities
0  this book have get many accolade but I find it...  [(four, CARDINAL)]
1  this be a story about a cantankerous and sad w...                  []
2  star interesting highly readable I just feel s...                  []
3                           love love love this book                  []
4  this be possibly the good book I have ever rea...  [(first, ORDINAL)]


In [None]:
# Columns listed in the order they are shown, to view the preprocessing stages
# side by side for the first few reviews.
stage_columns = [
    'review_text',
    'cleaned_review_text',
    'normalized_text_no_stopwords',
    'normalized_text',
    'lemmatized_text',
    'pos_tagged_text',
    'named_entities',
]
print(df[stage_columns].head())


Sample of processed data showing evolution through stages:
                                         review_text  \
0  This book has gotten many accolades but I foun...   
1  This is a story about a cantankerous and sad w...   
2  3.5 stars. Interesting, highly readable. I jus...   
3                      Loved, loved loved this book.   
4  This is possibly the best book I have ever rea...   

                                 cleaned_review_text  \
0  This book has gotten many accolades but I foun...   
1  This is a story about a cantankerous and sad w...   
2  stars Interesting highly readable I just felt ...   
3                        Loved loved loved this book   
4  This is possibly the best book I have ever rea...   

                        normalized_text_no_stopwords  \
0  book gotten many accolades found snooze fest f...   
1  story cantankerous sad widower lost live boist...   
2  stars interesting highly readable felt example...   
3                             loved loved 

In [None]:
# Disabled diagnostic (left commented out): when uncommented, it copies `df`,
# adds each review's character length, sorts ascending, and prints the shortest
# reviews across all preprocessing stages.
# # Sort the DataFrame by the length of the 'review_text' column
# df_sorted_by_review_length = df.copy()
# df_sorted_by_review_length['review_text_length'] = df_sorted_by_review_length['review_text'].apply(len)
# df_sorted_by_review_length = df_sorted_by_review_length.sort_values(by='review_text_length', ascending=True)
# print(df_sorted_by_review_length[['review_text', 'cleaned_review_text', 'normalized_text_no_stopwords', 'normalized_text', 'lemmatized_text', 'pos_tagged_text', 'named_entities']].head())


Sample of processed data sorted by 'review_text' length (shortest to longest):
      review_text cleaned_review_text normalized_text_no_stopwords  \
10557           T                   T                                
8160            I                   I                                
1512            i                   i                                
8178            .                                                    
8650            2                                                    

      normalized_text lemmatized_text pos_tagged_text named_entities  
10557               t               t    [(t, PROPN)]             []  
8160                i               I     [(I, PRON)]             []  
1512                i               I     [(I, PRON)]             []  
8178                                               []             []  
8650                                               []             []  
