In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report



Dataset columns: Index(['name', 'review', 'rating'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183531 entries, 0 to 183530
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   name    183213 non-null  object
 1   review  182702 non-null  object
 2   rating  183531 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 4.2+ MB
                                                name  \
0                           Planetwise Flannel Wipes   
1                              Planetwise Wipe Pouch   
2                Annas Dream Full Quilt with 2 Shams   
3  Stop Pacifier Sucking without tears with Thumb...   
4  Stop Pacifier Sucking without tears with Thumb...   

                                              review  rating  
0  These flannel wipes are OK, but in my opinion ...       3  
1  it came early and was not disappointed. i love...       5  
2  Very soft and comfortable and warmer than it l...       5

In [None]:

# Read the Amazon baby-product reviews CSV into a DataFrame
data = pd.read_csv('amazon_baby.csv')

# Summarise what was loaded: column labels first, then dtypes / non-null counts,
# then the leading rows (head() returns the first five by default)
print("Dataset columns:", data.columns)
data.info()
first_rows = data.head()
print(first_rows)



In [None]:
# Keep only the rows whose 'name' matches the product under study
# (example product: 'Vulli Sophie the Giraffe Teether')
product_reviews = data.loc[data['name'].eq('Vulli Sophie the Giraffe Teether')]
print(f"Number of reviews for the product: {product_reviews.shape[0]}")


In [None]:

# Preprocess the data
#
# Leaves `product_reviews` with:
#   * rows with a missing 'review' removed and all review text cast to str
#   * neutral (3-star) reviews removed
#   * 'rating' replaced by a boolean label (True: rating >= 4, False: rating 1 or 2)
#   * a 'word_counts' column holding one {word: count} dict per review

# Re-run guard: once 'rating' holds boolean labels, applying `>= 4` again would turn
# every label into False (True >= 4 is False), so the labelling step must run only once.
already_labeled = pd.api.types.is_bool_dtype(product_reviews['rating'])

# Remove rows with missing values in the 'review' column.
# .copy() gives an independent frame, so the column assignments below do not write
# into a filtered selection of another frame (the cause of SettingWithCopyWarning).
product_reviews = product_reviews.dropna(subset=['review']).copy()

# Ensure all reviews are of string type
product_reviews['review'] = product_reviews['review'].astype(str)

if not already_labeled:
    # Remove rows with neutral ratings (3) and create binary labels:
    # positive (4, 5) -> True, negative (1, 2) -> False
    product_reviews = product_reviews[product_reviews['rating'] != 3].copy()
    product_reviews['rating'] = product_reviews['rating'] >= 4

# Count word occurrences per review (sparse document-term matrix)
vectorizer = CountVectorizer()
word_counts = vectorizer.fit_transform(product_reviews['review'])

# Add the word counts as a new column in the dataset with visible words.
# The dicts are built straight from the sparse rows instead of densifying the whole
# reviews x vocabulary matrix; only the stored (non-zero) entries are visited.
feature_names = vectorizer.get_feature_names_out()
counts_csr = word_counts.tocsr()
row_bounds = zip(counts_csr.indptr[:-1], counts_csr.indptr[1:])
word_dicts = [
    {
        feature_names[col]: int(count)  # plain int rather than a numpy scalar
        # sorted() orders entries by column index, i.e. in feature_names order,
        # without relying on the matrix's indices already being sorted
        for col, count in sorted(zip(counts_csr.indices[start:end], counts_csr.data[start:end]))
        if count > 0
    }
    for start, end in row_bounds
]
product_reviews['word_counts'] = word_dicts


In [None]:

# Features are the raw review texts; the target is the boolean sentiment label
X, y = product_reviews['review'], product_reviews['rating']

# Hold out 20% of the reviews for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [None]:
# Convert text data into numerical format using CountVectorizer for model training.
# The vocabulary is learned from the training reviews only, so the feature space is not
# influenced by the held-out test reviews; words that occur only in the test set are
# simply ignored by transform().
model_vectorizer = CountVectorizer()
X_train_vectorized = model_vectorizer.fit_transform(X_train)
X_test_vectorized = model_vectorizer.transform(X_test)

# Train a logistic regression model.
# max_iter is raised above the default of 100 so the solver has room to converge on
# unscaled count features instead of stopping early with a ConvergenceWarning.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vectorized, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_vectorized)



In [None]:
# Score the held-out predictions: overall accuracy first, then the per-class report
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:', classification_report(y_test, y_pred), sep='\n')

In [2]:
# Inspect the preprocessed review table (boolean 'rating' labels plus the per-review
# 'word_counts' dicts); the rendered output elides the middle rows with "..."
product_reviews

Unnamed: 0,name,review,rating,word_counts
34313,Vulli Sophie the Giraffe Teether,He likes chewing on all the parts especially t...,True,"{'all': 1, 'and': 1, 'because': 1, 'been': 1, ..."
34314,Vulli Sophie the Giraffe Teether,My son loves this toy and fits great in the di...,True,"{'also': 1, 'and': 1, 'bag': 1, 'clean': 1, 'd..."
34315,Vulli Sophie the Giraffe Teether,There really should be a large warning on the ...,False,"{'2011': 1, 'all': 1, 'allergies': 1, 'allergy..."
34316,Vulli Sophie the Giraffe Teether,All the moms in my moms\' group got Sophie for...,True,"{'all': 1, 'and': 2, 'another': 1, 'babies': 1..."
34317,Vulli Sophie the Giraffe Teether,I was a little skeptical on whether Sophie was...,True,"{'20': 1, 'about': 2, 'after': 1, 'all': 1, 'a..."
...,...,...,...,...
159649,Vulli Sophie the Giraffe Teether,My baby loves her Sophie Chew Toy. She can che...,True,"{'all': 1, 'and': 1, 'baby': 1, 'can': 1, 'che..."
159650,Vulli Sophie the Giraffe Teether,Sophie the Giraffe was a big hit at the baby s...,True,"{'about': 1, 'all': 1, 'and': 1, 'as': 1, 'at'..."
159651,Vulli Sophie the Giraffe Teether,quick shipping and perfect product. I would pu...,True,"{'again': 1, 'and': 1, 'baby': 2, 'be': 1, 'bo..."
159652,Vulli Sophie the Giraffe Teether,My baby who is currently teething love his Sop...,True,"{'baby': 1, 'bit': 1, 'but': 1, 'chew': 1, 'cu..."
