<a href="https://colab.research.google.com/github/Ramit-tharu/Intelligent-system/blob/Master/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
import pandas as pd


# Specify the path to your dataset (adjust the path based on your file's location in Google Drive)
file_path = '/content/drive/My Drive/Datasets/Ass-3/all_reviews.csv'

# Load the dataset into a Pandas DataFrame
df = pd.read_csv(file_path)

# Display the first few records
print(df.head(10))


               Category ReviewType  \
0  kitchen_&_housewares   negative   
1  kitchen_&_housewares   negative   
2  kitchen_&_housewares   negative   
3  kitchen_&_housewares   negative   
4  kitchen_&_housewares   negative   
5  kitchen_&_housewares   negative   
6  kitchen_&_housewares   negative   
7  kitchen_&_housewares   negative   
8  kitchen_&_housewares   negative   
9  kitchen_&_housewares   negative   

                                              Review  
0                                           <review>  
1                                        <unique_id>  
2  B0002D31QU:doesn't_keep_my_yappy_cairn_from_ba...  
3                                       </unique_id>  
4                                             <asin>  
5                                         B0002D31QU  
6                                            </asin>  
7                                     <product_name>  
8  Premier Gentle Spray Anti-Bark Dog Collar: Kit...  
9                              

In [2]:
print(f"The dataset has {df.shape[0]} records.")

The dataset has 2730434 records.


**Cleaning the Dataset for spelling and punctuation**


In [3]:
import re
import pandas as pd
def clean_text(text):
    # Converting input to string
    text = str(text)
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Convert text to lowercase
    text = text.lower()
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Applying cleaning function to the 'Review' column
df['Cleaned_Review'] = df['Review'].apply(clean_text)

# Displaying the cleanded rows after cleaning
print("\nFirst 10 records after cleaning:")
print(df[['Review', 'Cleaned_Review']].head(10))


First 10 records after cleaning:
                                              Review  \
0                                           <review>   
1                                        <unique_id>   
2  B0002D31QU:doesn't_keep_my_yappy_cairn_from_ba...   
3                                       </unique_id>   
4                                             <asin>   
5                                         B0002D31QU   
6                                            </asin>   
7                                     <product_name>   
8  Premier Gentle Spray Anti-Bark Dog Collar: Kit...   
9                                    </product_name>   

                                      Cleaned_Review  
0                                             review  
1                                          unique_id  
2  bdqudoesnt_keep_my_yappy_cairn_from_barkingd_o...  
3                                          unique_id  
4                                               asin  
5                  

**Encoding the Words**


In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Choosing the parameters
max_vocab_size = 10000 # this is the maximum number of unique words
max_sequence_length = 100 # maximum review length after padding

# Initializing the tokenizer
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>") # OOV for out-of-vocabulary words

# Fitting the tokenizer on the cleaned reviews
tokenizer.fit_on_texts(df['Cleaned_Review'])

# Converting reviews to sequences of integers
sequences = tokenizer.texts_to_sequences(df['Cleaned_Review'])

# Padding sequences to the same length
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', truncating='post')

# Adding the encoded reviews to the DataFrame
df['Encoded_Review'] = list(padded_sequences)

# Displaying the first few rows with encoded reviews
print("\nFirst 10 records with encoded reviews:")
print(df[['Cleaned_Review', 'Encoded_Review']].head(10))


First 10 records with encoded reviews:
                                      Cleaned_Review  \
0                                             review   
1                                          unique_id   
2  bdqudoesnt_keep_my_yappy_cairn_from_barkingd_o...   
3                                          unique_id   
4                                               asin   
5                                               bdqu   
6                                               asin   
7                                       product_name   
8  premier gentle spray antibark dog collar kitch...   
9                                       product_name   

                                      Encoded_Review  
0  [6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
1  [19, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...  
2  [1, 303, 39, 1, 1, 52, 1, 1, 0, 0, 0, 0, 0, 0,...  
3  [19, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...  
4  [25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...  
5  [1, 0, 0, 

Encoding labels for negative and positive review

In [14]:
# Encoding the labels for 'positive' and 'negative'
label_mapping = {'positive': 1, 'negative': 0}
df['Encoded_label'] = df['ReviewType'].map(label_mapping)

# Displaying the first few rows with encoded labels
print("\nFirst 10 records with encoded labels:")
print(df[['ReviewType', 'Encoded_label']].head(10))


First 10 records with encoded labels:
  ReviewType  Encoded_label
0   negative            0.0
1   negative            0.0
2   negative            0.0
3   negative            0.0
4   negative            0.0
5   negative            0.0
6   negative            0.0
7   negative            0.0
8   negative            0.0
9   negative            0.0


**Outlier Removal**

In [16]:
# Setting minimum length for reviews
min_length = 10

# Filtering out reviews that are too short in the dataset
df = df[df['Cleaned_Review'].str.len() >= min_length]

# Resetting the index for cleanliness after filtering
df = df.reset_index(drop=True)

# Displaying the number of records remaining and first few rows
print(f"Number of records after outlier removal: {df.shape[0]}")
print("\nFirst 10 records after outlier removal:")
print(df[['Cleaned_Review', 'Encoded_Review']].head(10))

Number of records after outlier removal: 1101256

First 10 records after outlier removal:
                                      Cleaned_Review  \
0  bdqudoesnt_keep_my_yappy_cairn_from_barkingd_o...   
1                                       product_name   
2  premier gentle spray antibark dog collar kitch...   
3                                       product_name   
4                                       product_type   
5                                 kitchen housewares   
6                                       product_type   
7            doesnt keep my yappy cairn from barking   
8                                  reviewer_location   
9                                  reviewer_location   

                                      Encoded_Review  
0  [1, 303, 39, 1, 1, 52, 1, 1, 0, 0, 0, 0, 0, 0,...  
1  [5, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...  
2  [5791, 3485, 3465, 1, 805, 8033, 46, 50, 0, 0,...  
3  [5, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...  
4  [5, 16, 0, 0, 0

**Cleaning out empty reviews**


In [18]:
df = df[df['Cleaned_Review'].str.strip() != '']
df = df.dropna(subset=['Cleaned_Review'])
df = df.reset_index(drop=True)


**Padding and Truncating the remaining Data**

In [20]:
# Defining the maximum length for the reviews
max_sequence_length = 100

# Assuming 'Encoded_Review' column contains the sequences to be padded
sequences = df['Encoded_Review'].tolist()  # Extract sequences from the current DataFrame

# Padding sequences to the same length
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', truncating='post')

# Replacing the encoded reviews column with the padded versions
df['Padded_Encoded_Review'] = list(padded_sequences)

# Displaying the first few rows with padded reviews
print("\nFirst 10 records with padded reviews:")
print(df[['Cleaned_Review', 'Padded_Encoded_Review']].head(10))


First 10 records with padded reviews:
                                      Cleaned_Review  \
0  bdqudoesnt_keep_my_yappy_cairn_from_barkingd_o...   
1                                       product_name   
2  premier gentle spray antibark dog collar kitch...   
3                                       product_name   
4                                       product_type   
5                                 kitchen housewares   
6                                       product_type   
7            doesnt keep my yappy cairn from barking   
8                                  reviewer_location   
9                                  reviewer_location   

                               Padded_Encoded_Review  
0  [1, 303, 39, 1, 1, 52, 1, 1, 0, 0, 0, 0, 0, 0,...  
1  [5, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...  
2  [5791, 3485, 3465, 1, 805, 8033, 46, 50, 0, 0,...  
3  [5, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...  
4  [5, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...  
5  [46, 50, 0,