In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
import nltk
from nltk.corpus import stopwords

In [None]:
# Download the NLTK 'stopwords' corpus; clean_text() reads it through
# stopwords.words('english') when filtering words out of each review.
nltk.download('stopwords')

In [None]:
# Load the dataset
# sentiment = 1 (positive), sentiment = 0 (negative)
data = pd.read_csv('amazon_echo_reviews.csv')

# Preview the dataset. A bare expression is only rendered when it is the last
# statement of a cell, so display() is called explicitly to actually show it.
display(data.head())

# Check for missing values (per-column count of null entries)
print("Missing values:", data.isnull().sum())

# Drop every row that has a missing value in any column
data.dropna(inplace=True)

# Explore target distribution: number of reviews per sentiment label
sns.countplot(x='sentiment', data=data)
plt.title('Distribution of Sentiment in Reviews')
plt.show()

In [None]:
# Preprocessing: Clean and prepare the text data
def _english_stop_words():
    """Return NLTK's English stopword list as a frozenset, loading it only once."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def clean_text(text, stop_words=None):
    """Normalize a raw review string for vectorization.

    Steps, in order:
      1. Remove every character that is not an ASCII letter or whitespace.
      2. Lowercase the result.
      3. Split on whitespace and drop stopwords.

    Parameters
    ----------
    text : str
        The raw review text.
    stop_words : collection of str, optional
        Lowercase words to drop. Defaults to NLTK's English stopword list,
        which is loaded once and reused across calls.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    # Flags must be passed by keyword: the 4th positional argument of re.sub
    # is `count`, so passing re.I|re.A positionally capped the number of
    # replacements at 258 instead of setting any flags.
    text = re.sub(r'[^a-zA-Z\s]', '', text, flags=re.I | re.A)
    # Convert to lowercase so the comparison against stopwords is case-insensitive
    text = text.lower()
    # Remove stopwords
    return ' '.join(word for word in text.split() if word not in stop_words)

# Apply the cleaning function to the reviews
data['cleaned_review'] = data['review'].apply(clean_text)

# Display a few cleaned reviews. A bare expression is only rendered when it is
# the last statement of a cell, so display() is called explicitly here.
display(data[['review', 'cleaned_review']].head())

# Convert text into numerical features using TF-IDF vectorizer
# (vocabulary capped at 5000 terms; result densified with .toarray()).
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split performed later, so test rows contribute to the vocabulary
# and IDF weights — confirm whether that is acceptable for this evaluation.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(data['cleaned_review']).toarray()

# Target variable
y = data['sentiment']


In [None]:
# Hold out 20% of the samples for evaluation (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic-regression classifier with default hyperparameters
model = LogisticRegression().fit(X_train, y_train)

# Predict labels for the held-out samples
y_pred = model.predict(X_test)

# Overall accuracy, printed as a percentage with two decimals
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Per-class metrics as formatted by classification_report
print("Classification Report:\n", classification_report(y_test, y_pred))


In [None]:
# Confusion matrix of the held-out labels against the model's predictions
conf_matrix = confusion_matrix(y_test, y_pred)

# Draw the integer counts as an annotated heatmap on an explicit Axes
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual Label')
ax.set_xlabel('Predicted Label')
plt.show()



In [None]:
# Test with a new review (custom input)
def predict_sentiment(review):
    """Classify a single raw review string as "Positive" or "Negative".

    The review is cleaned with ``clean_text``, transformed by the fitted
    module-level ``tfidf_vectorizer``, and scored by the module-level
    ``model``.

    Returns "Positive" when the predicted label equals 1, otherwise
    "Negative".
    """
    features = tfidf_vectorizer.transform([clean_text(review)]).toarray()
    # Exactly one review is scored, so the prediction array has one element.
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Positive"
    return "Negative"



In [None]:
# Run the classifier on a hand-written sample review
new_review = "I love the Amazon Echo, it's really helpful!"
predicted_sentiment = predict_sentiment(new_review)

# Report the input and the predicted label, one per line
print(
    f"Review: {new_review}",
    f"Predicted Sentiment: {predicted_sentiment}",
    sep="\n",
)