# Project NLP | Business Case: Automated Customer Reviews

In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Read the three raw review exports from the local data/ folder.
data1 = pd.read_csv('data/1429_1.csv')
data2 = pd.read_csv('data/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv')
data3 = pd.read_csv('data/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv')

# Outer-merge the frames pairwise (with no `on=` given, pandas joins on the
# columns the two frames share), then discard rows that are exact duplicates.
data = pd.merge(
    pd.merge(data1, data2, how='outer'),
    data3,
    how='outer',
).drop_duplicates()

# Write the combined frame to disk without the index column.
data.to_csv('data/combined_data.csv', index=False)
print("Data successfully combined and saved to 'combined_data.csv'.")


# Preview the first rows of the combined frame.
print("Data Head:", data.head(), sep="\n")

# Number of missing values in each column.
print("Missing Values in Data:", data.isnull().sum(), sep="\n")

# Summary statistics as produced by DataFrame.describe().
print("Data Statistics:", data.describe(), sep="\n")

# Column labels, to see which fields are available.
print("Data Columns:", data.columns, sep="\n")



## Drop unnecessary columns

In [None]:
# Fields that the rest of the notebook does not use.
columns_to_drop = [
    'id',
    'name',
    'asins',
    'brand',
    'categories',
    'keys',
    'manufacturer',
    'reviews.date',
    'reviews.dateAdded',
    'reviews.dateSeen',
    'reviews.didPurchase',
    'reviews.id',
    'reviews.sourceURLs',
    'reviews.userCity',
    'reviews.userProvince',
    'reviews.username',
    'primaryCategories',
    'imageURLs',
    'manufacturerNumber',
    'sourceURLs',
    'dateUpdated',
    'dateAdded',
]

# Remove those fields; a label missing from `data` raises KeyError here.
data_cleaned = data.drop(columns_to_drop, axis=1)

# Strip the 'reviews.' marker so e.g. 'reviews.text' becomes 'text'.
data_cleaned.columns = [label.replace('reviews.', '') for label in data_cleaned.columns]

# Show the remaining columns to confirm the result.
print("Data Columns After Dropping:", data_cleaned.columns, sep="\n")


## Check for missing values

In [None]:
# Count the missing values left in each column of the trimmed frame.
print("Missing Values in Data:", data_cleaned.isnull().sum(), sep="\n")

## Drop rows missing text and rating

In [None]:
# Keep only the reviews that have both a text body and a rating.
data_cleaned = data_cleaned.dropna(axis=0, how='any', subset=['text', 'rating'])

# Re-count missing values to confirm those two columns are now complete.
print("Missing Values in Data After Dropping Rows:", data_cleaned.isnull().sum(), sep="\n")

## Plot the Distribution of Rating and doRecommend

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns



# How many reviews carry each rating value.
plt.figure(figsize=(10, 5))
ax = sns.countplot(x='rating', data=data_cleaned)
ax.set(title='Distribution of Ratings', xlabel='Rating', ylabel='Count')
plt.show()

# How many reviews carry each doRecommend value.
plt.figure(figsize=(10, 5))
ax = sns.countplot(x='doRecommend', data=data_cleaned)
ax.set(title='Distribution of Recommendations', xlabel='Do Recommend', ylabel='Count')
plt.show()

# Box plot relating rating to doRecommend.
plt.figure(figsize=(10, 5))
ax = sns.boxplot(data=data_cleaned, x='rating', y='doRecommend')
ax.set(title='Box Plot of Ratings vs. Recommendations', xlabel='Rating', ylabel='Do Recommend')
plt.show()


## Calculate Review Length

In [None]:
# Store len() of every review text in a new column.
data_cleaned['review_length'] = data_cleaned['text'].map(len)

# Show a few rows next to their computed length as a sanity check.
print(data_cleaned[['text', 'review_length']].head())


## Correlation between review length and rating

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Review length on the x axis against the rating.
plt.figure(figsize=(10, 5))
ax = sns.scatterplot(x='review_length', y='rating', data=data_cleaned)
ax.set(title='Review Length vs. Rating', xlabel='Review Length', ylabel='Rating')
plt.show()

# Review length on the x axis against doRecommend.
plt.figure(figsize=(10, 5))
ax = sns.scatterplot(x='review_length', y='doRecommend', data=data_cleaned)
ax.set(title='Review Length vs. Do Recommend', xlabel='Review Length', ylabel='Do Recommend')
plt.show()


In [None]:
# Both correlations use the same review-length series.
review_lengths = data_cleaned['review_length']

# Correlation (Series.corr default method) between length and rating.
correlation_rating = review_lengths.corr(data_cleaned['rating'])
print(f'Correlation between Review Length and Rating: {correlation_rating:.2f}')

# Same measure between length and doRecommend.
correlation_doRecommend = review_lengths.corr(data_cleaned['doRecommend'])
print(f'Correlation between Review Length and Do Recommend: {correlation_doRecommend:.2f}')


## Encode ratings into positive (2), neutral (1), negative (0)

In [None]:
# Function to categorize ratings
def categorize_rating(rating):
    """Map a star rating to a sentiment class.

    Thresholds are used instead of exact matches so that fractional ratings
    (e.g. 4.5 or 3.5) land in the class of their whole-star band rather than
    falling through to "negative".

    Args:
        rating: Numeric star rating (int or float).

    Returns:
        2 (positive) when rating >= 4, 1 (neutral) when 3 <= rating < 4,
        and 0 (negative) otherwise. NaN fails both comparisons and a
        non-numeric value (e.g. None) cannot be compared, so both yield 0;
        drop missing ratings before calling if that is not wanted.
    """
    try:
        if rating >= 4:
            return 2  # Positive
        if rating >= 3:
            return 1  # Neutral
    except TypeError:
        # Values that cannot be compared to a number are treated as
        # "negative", matching the previous fall-through behavior.
        return 0
    return 0  # Negative

# Derive the sentiment class of every review from its rating.
data_cleaned['sentiment'] = data_cleaned['rating'].map(categorize_rating)

# Show a few rating/sentiment pairs to confirm the mapping.
print(data_cleaned[['rating', 'sentiment']].head())

# Number of reviews in each sentiment class.
print("Distribution of Rating Categories:", data_cleaned['sentiment'].value_counts(), sep="\n")

# Bar chart of the same class counts.
plt.figure(figsize=(10, 5))
ax = sns.countplot(x='sentiment', data=data_cleaned)
ax.set(title='Distribution of Sentiment', xlabel='Sentiment', ylabel='Count')
plt.show()


## Word clouds

In [None]:
from wordcloud import WordCloud

def _show_word_cloud(text, title):
    """Render `text` as a word cloud, display it under `title`, and return it.

    Args:
        text: One string holding all the review text to visualize.
        title: Title shown above the plot.

    Returns:
        The generated WordCloud object.
    """
    cloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.show()
    return cloud


# Concatenate the raw review text of each sentiment class
# (0 = negative, 1 = neutral, 2 = positive).
negative = ' '.join(data_cleaned[data_cleaned['sentiment'] == 0]['text'])
neutral = ' '.join(data_cleaned[data_cleaned['sentiment'] == 1]['text'])
positive = ' '.join(data_cleaned[data_cleaned['sentiment'] == 2]['text'])

# Word cloud for negative reviews
wordcloud_negative = _show_word_cloud(negative, 'Word Cloud for Negative Reviews')

# Word cloud for neutral reviews
wordcloud_neutral = _show_word_cloud(neutral, 'Word Cloud for Neutral Reviews')

# Word cloud for positive reviews (the title previously read "Real News",
# which did not describe this plot).
wordcloud_positive = _show_word_cloud(positive, 'Word Cloud for Positive Reviews')

## Data Cleaning (Lowercase, Remove Stopwords, Lemmatization)

In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from imblearn.over_sampling import SMOTE


# Make sure the NLTK stop-word list and WordNet data are available.
import nltk
for resource_name in ('stopwords', 'wordnet'):
    nltk.download(resource_name)

# Extra words to filter out on top of NLTK's English stop words.
additional_stop_words = {
    'tablet',
    'amazon',
    'kindle',
    'bought',
    'one',
    'use',
}


# Resources shared by every preprocess_text call. They are built on first use
# (so nothing is loaded when this cell is merely defined) and reset whenever
# this cell is re-run.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the (stop_words, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(
            stopwords.words('english')
        ).union(additional_stop_words)
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


# Define the preprocessing function
def preprocess_text(text):
    """Normalize one review for vectorization.

    Steps: lowercase, remove every character that is not an ASCII letter or
    whitespace, drop stop words (NLTK English list plus
    `additional_stop_words`), and lemmatize the remaining words.

    The stop-word set and the lemmatizer are built once and reused instead of
    being recreated for every review. Re-run this cell after changing
    `additional_stop_words` so the cached set is rebuilt.

    Args:
        text: The raw review text (a str).

    Returns:
        The cleaned words joined by single spaces.
    """
    stop_words, lemmatizer = _get_preprocess_resources()
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    # str.split() with no argument splits on runs of whitespace and ignores
    # leading/trailing whitespace, so no separate whitespace cleanup is needed.
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

# Clean every review text and keep the result in a new column.
data_cleaned['cleaned_text'] = data_cleaned['text'].map(preprocess_text)



## Split the data into training and testing sets

In [12]:
from sklearn.model_selection import train_test_split


# Model input is the cleaned review text; the target is the sentiment class.
X, Y = data_cleaned['cleaned_text'], data_cleaned['sentiment']

# Hold out 20% of the rows for testing, using a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)


## Vectorization

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the text data with TF-IDF.
# The vectorizer is fitted on the training texts only; the test texts are
# then transformed with that same fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

## Resampling

In [None]:
# Oversample the vectorized training split with SMOTE (fixed seed); the test
# split is not passed through the resampler.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_vectorized, y_train)

# Print class counts before and after resampling.
print("Original training set class distribution:", y_train.value_counts(), sep="\n")
print(
    "\nResampled training set class distribution:",
    pd.Series(y_train_resampled).value_counts(),
    sep="\n",
)

## Train and Evaluate models

In [15]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import pandas as pd

# Print evaluation metrics for one fitted model
def evaluate_model(model, X_test, y_test):
    """Predict on the test features and print a set of classification metrics.

    Prints accuracy, weighted precision, weighted recall and weighted F1
    (two decimals each), followed by the confusion matrix and the
    classification report.

    Args:
        model: A fitted estimator exposing ``predict``.
        X_test: Test features passed to ``model.predict``.
        y_test: True labels for ``X_test``.

    Returns:
        None; the results are only printed.
    """
    predictions = model.predict(X_test)

    # Compute all four headline scores before printing any of them.
    scores = [
        ('Accuracy', accuracy_score(y_test, predictions)),
        ('Precision', precision_score(y_test, predictions, average='weighted')),
        ('Recall', recall_score(y_test, predictions, average='weighted')),
        ('F1 Score', f1_score(y_test, predictions, average='weighted')),
    ]
    for metric_name, metric_value in scores:
        print(f'{metric_name}: {metric_value:.2f}')

    print('Confusion Matrix:')
    print(confusion_matrix(y_test, predictions))
    print('Classification Report:')
    print(classification_report(y_test, predictions))

# Fit each candidate classifier and print its test metrics
def train_and_evaluate_models(X_train_resampled, y_train_resampled, X_test_vectorized, y_test):
    """Train four classifiers in turn and print each one's evaluation.

    The classifiers are Random Forest, Gradient Boosting, XGBoost and
    Logistic Regression, all created with ``random_state=42``. Each is fitted
    on the training data and then passed to ``evaluate_model`` with the test
    data.

    Args:
        X_train_resampled: Training features.
        y_train_resampled: Training labels.
        X_test_vectorized: Test features.
        y_test: Test labels.

    Returns:
        None; progress and metrics are printed.
    """
    candidates = [
        ('Random Forest', RandomForestClassifier(random_state=42)),
        ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
        ('XGBoost', XGBClassifier(random_state=42)),
        ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42)),
    ]

    for label, classifier in candidates:
        print(f'\nTraining {label}...')
        classifier.fit(X_train_resampled, y_train_resampled)
        print(f'Evaluating {label}...')
        evaluate_model(classifier, X_test_vectorized, y_test)


In [None]:
# Train every candidate classifier on the SMOTE-resampled training data and
# print its metrics on the vectorized test split.
train_and_evaluate_models(X_train_resampled, y_train_resampled, X_test_vectorized, y_test)


## Class Weight Adjustment in Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Weight classes 0 and 1 twice as heavily as class 2.
class_weights = {0: 2, 1: 2, 2: 1}

# Build the weighted random forest and fit it on the resampled training data
# (fit returns the estimator itself).
rf_model = RandomForestClassifier(
    class_weight=class_weights,
    random_state=42,
).fit(X_train_resampled, y_train_resampled)

# Print its metrics on the test split.
evaluate_model(rf_model, X_test_vectorized, y_test)


## Bidirectional LSTM model

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.utils import to_categorical



# Imported here because the np.argmax calls below need it.
import numpy as np

# Encode target labels as integers, then one-hot encode them for the
# softmax / categorical cross-entropy setup used below.
label_encoder = LabelEncoder()
Y_encoded = label_encoder.fit_transform(Y)
Y_categorical = to_categorical(Y_encoded)

# Split the data into training and testing sets.
# NOTE: this rebinds X_train, X_test, y_train and y_test; from here on
# y_train / y_test hold one-hot rows rather than class ids.
X_train, X_test, y_train, y_test = train_test_split(X, Y_categorical, test_size=0.2, random_state=42)

# Tokenize (tokenizer fitted on the training texts only) and pad sequences
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad every sequence to the length of the longest training sequence.
max_sequence_length = max([len(x) for x in X_train_sequences])
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length)

# Build the Bidirectional LSTM model: embedding -> two stacked BiLSTM layers
# with dropout -> 3-way softmax.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_sequence_length))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
# NOTE(review): the test split is also used as validation data here, so the
# per-epoch validation numbers are not independent of the final test score.
model.fit(X_train_padded, y_train, epochs=10, batch_size=64, validation_data=(X_test_padded, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f'Test Accuracy: {accuracy:.2f}')

# Make predictions and turn probability / one-hot rows back into class ids
y_pred = model.predict(X_test_padded)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

# Print classification report.
# label_encoder.classes_ holds the integer sentiment ids, while target_names
# must be strings, so each label is converted with str().
from sklearn.metrics import classification_report
target_names = [str(label) for label in label_encoder.classes_]
print(classification_report(y_test_classes, y_pred_classes, target_names=target_names))


## Hyperparameter Tuning with Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Hyperparameter values to try (every combination is evaluated).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Base estimator: random forest with balanced class weights and a fixed seed.
rf = RandomForestClassifier(random_state=42, class_weight='balanced')

# 5-fold grid search scored by weighted F1, using all available cores.
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the resampled training data.
# NOTE(review): the folds are cut from data that was already oversampled, so
# validation folds may contain synthetic samples — confirm this is intended.
grid_search.fit(X_train_resampled, y_train_resampled)

# Report the winning combination and its cross-validated score.
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f'Best Parameters: {best_params}')
print(f'Best F1 Score: {best_score}')
