# Data Cleaning and Preprocessing Notebook

In [None]:
#importing libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [2]:
#loading file
file_path = 'Train.csv'
df = pd.read_csv(file_path)
print(df.head())

                            user_id   book_id  \
0  327858f6d967ef1567459b84252ac71a   8909152   
1  b1bb017c681370fddd19bab11f4eb22a     13152   
2  6fb896f0ccaeb445e2c9f580bff8f65d   9533378   
3  2edbb0dcf49ad138ef79bd6b5f4ba390     12067   
4  cc3ce566b0313a2f02d6ab246c990bce  20443207   

                          review_id  \
0  e8cb23191d6c27e930243a08ff826395   
1  953dfd48b372f081b5f82ce1def753f7   
2  48509a6f6128d4f2ca243e04a0cdc896   
3  a09f7ff4eca0c8c2fbaacf4baf6b114f   
4  93b0128f768ee9c1af8864f566e3a7b6   

                                         review_text  \
0  Really, I meant to get Landline when I checked...   
1  Update - 01/08/2016 They are making a Maximum ...   
2  I feel like I've read so many Urban Fantasy bo...   
3  Reread in December 2009. \n Simply a fantastic...   
4  BIG ASS DNF \n Ughhh. I'm so mad at myself for...   

                       date_added                    date_updated  \
0  Mon Aug 24 10:09:11 -0700 2015  Mon Aug 24 18:50:11 -0700 201

In [3]:
# Select specific columns
selected_columns = ['review_id', 'review_text', 'rating']
cleaned_df = df[selected_columns]
print(cleaned_df.head())

                          review_id  \
0  e8cb23191d6c27e930243a08ff826395   
1  953dfd48b372f081b5f82ce1def753f7   
2  48509a6f6128d4f2ca243e04a0cdc896   
3  a09f7ff4eca0c8c2fbaacf4baf6b114f   
4  93b0128f768ee9c1af8864f566e3a7b6   

                                         review_text  rating  
0  Really, I meant to get Landline when I checked...       4  
1  Update - 01/08/2016 They are making a Maximum ...       4  
2  I feel like I've read so many Urban Fantasy bo...       3  
3  Reread in December 2009. \n Simply a fantastic...       5  
4  BIG ASS DNF \n Ughhh. I'm so mad at myself for...       1  


## Preprocessing

This section focuses on preprocessing the 'review_text' column of the dataset to make it suitable for subsequent natural language processing (NLP) tasks. The preprocessing steps aim to clean and transform the text data into a format that is conducive for analysis, modeling, or other text-based applications.

In [8]:
# Preprocess the 'review_text' column
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Stemming (lemmatization was slow)
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    # Join the tokens back into a string
    processed_text = ' '.join(tokens)
    return processed_text

# Apply the preprocessing function to the 'review_text' column
cleaned_df['review_text'] = cleaned_df['review_text'].apply(preprocess_text)

# Display the first few rows of the DataFrame after preprocessing
print(cleaned_df.head())

                          review_id  \
0  e8cb23191d6c27e930243a08ff826395   
1  953dfd48b372f081b5f82ce1def753f7   
2  48509a6f6128d4f2ca243e04a0cdc896   
3  a09f7ff4eca0c8c2fbaacf4baf6b114f   
4  93b0128f768ee9c1af8864f566e3a7b6   

                                         review_text  rating  
0  realli meant get landlin check one reason coul...       4  
1  updat make maximum ride movi look terribl http...       4  
2  feel like ive read mani urban fantasi book get...       3  
3  reread decemb simpli fantast read full humor m...       5  
4  big ass dnf ughhh im mad pick even care book o...       1  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['review_text'] = cleaned_df['review_text'].apply(preprocess_text)


The cleaned dataset is saved on a csv file with the name cleaned_train.csv

In [12]:
cleaned_file_path = 'cleaned_train.csv'
cleaned_df.to_csv(cleaned_file_path, index=False)

print(cleaned_df.head())

                          review_id  \
0  e8cb23191d6c27e930243a08ff826395   
1  953dfd48b372f081b5f82ce1def753f7   
2  48509a6f6128d4f2ca243e04a0cdc896   
3  a09f7ff4eca0c8c2fbaacf4baf6b114f   
4  93b0128f768ee9c1af8864f566e3a7b6   

                                         review_text  rating  
0  realli meant get landlin check one reason coul...       4  
1  updat make maximum ride movi look terribl http...       4  
2  feel like ive read mani urban fantasi book get...       3  
3  reread decemb simpli fantast read full humor m...       5  
4  big ass dnf ughhh im mad pick even care book o...       1  
