# 🛍️ e-Commerce: NLP and Sentiment Analysis Part 2

In [None]:
!pip install scipy==1.7.0

In [None]:
# Display the SciPy version that the kernel actually imported.
import scipy
scipy.__version__

In [None]:
pip install -U scikit-learn

In [None]:
# Display the scikit-learn version that the kernel actually imported.
import sklearn
sklearn.__version__

In [None]:
!apt install -y build-essential swig curl
!curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip install
!pip install auto-sklearn

Part 1 of this analysis is available here: https://www.kaggle.com/rendyk/e-commerce-nlp-and-sentiment-analysis-part-1

It covers the exploratory data analysis (EDA) of the dataset and basic NLP.

In [None]:
# Import the packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from autosklearn.regression import AutoSklearnRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import confusion_matrix

In [None]:
# Read the reviews CSV; the first column of the file is discarded.
reviews_path = '../input/womens-ecommerce-clothing-reviews/Womens Clothing E-Commerce Reviews.csv'
dataset = pd.read_csv(reviews_path).iloc[:, 1:]

# Keep only the rows that have a review text. reset_index() renumbers the rows
# from 0 and keeps the previous row labels in a new 'index' column.
has_review = dataset['Review Text'].notnull()
data = dataset.loc[has_review, :].reset_index()
data.head()

In [None]:
# Applying text blob sentiment
def polarity(t):
    """Return the first component (polarity) of TextBlob's sentiment for text `t`."""
    a = TextBlob(t).sentiment
    return a[0]

def subjectivity(t):
    """Return the second component (subjectivity) of TextBlob's sentiment for text `t`."""
    a = TextBlob(t).sentiment
    return a[1]

# Analyse every review once and read both scores from the same result, instead
# of building two TextBlob objects per row through a row-wise DataFrame.apply.
sentiments = [TextBlob(text).sentiment for text in data['Review Text']]
data['polarity'] = [s[0] for s in sentiments]
data['subjectivity'] = [s[1] for s in sentiments]
data.head()

In [None]:
# Distribution of the polarity score within each rating
fig, ax = plt.subplots()
sns.boxplot(x='Rating', y='polarity', data=data, ax=ax)
plt.show()

In [None]:
# Median sentiment scores per rating. The two score columns are selected before
# aggregating so that non-numeric columns (e.g. 'Review Text') are never passed
# to median(), which raises on recent pandas versions.
data.groupby('Rating')[['polarity', 'subjectivity']].median()

# 9. Sentiment Analysis: BoW (Bag-of-Words)

In [None]:
# Apply uni- and bigram vectorizer
class lemmatizer(object):
    """Callable tokenizer for CountVectorizer: NLTK word tokens, each lemmatized with WordNet."""
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, df):
        # `df` is one document string (despite the name), handed over by the vectorizer.
        return [self.wnl.lemmatize(word) for word in word_tokenize(df)]

# token_pattern is set to None because it is never used once a custom tokenizer
# is supplied; leaving a pattern in place only makes scikit-learn emit a warning.
vectorizer = CountVectorizer(max_features=60, max_df=0.95, min_df=0.05, ngram_range=(1,2),
                             tokenizer=lemmatizer(), lowercase=True, stop_words='english',
                             token_pattern=None)

# Learn the vocabulary and build the document-term count matrix in one pass
count_vector = vectorizer.fit_transform(data['Review Text'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old method on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    bow_features = vectorizer.get_feature_names_out()
else:
    bow_features = vectorizer.get_feature_names()

bow_data = pd.DataFrame(count_vector.toarray(), columns=bow_features)
bow_data.head()

In [None]:
# Removing meaningless columns: drop the first 10 vocabulary columns.
# NOTE(review): presumably these are punctuation/number tokens — confirm with
# bow_data.columns[:10] before relying on the hard-coded 10.
bow_data2 = bow_data.iloc[:,10:]

# Split data. The reduced frame is what gets split (the original code built
# bow_data2 but then split the unreduced bow_data). Row labels are unchanged,
# so X_train.index / X_test.index still address the same reviews in `data`.
X_train, X_test, y_train, y_test = train_test_split(bow_data2,
                                                  data['Rating'],
                                                  stratify=data['Rating'],
                                                  test_size=0.2, random_state=123)

In [None]:
# Create the model: 3 minutes of search in total, at most 60 seconds per run.
# NOTE: this assignment rebinds the name `sklearn`, hiding the scikit-learn module
# imported earlier in the notebook; the name is kept because later cells use it.
sklearn = AutoSklearnRegressor(time_left_for_this_task=3*60, per_run_time_limit=60, n_jobs=-1)

# Fit the training data
sklearn.fit(X_train, y_train)

# Sprint Statistics
print(sklearn.sprint_statistics())

# Predict the test data. The regressor's output is unbounded, so each value is
# clamped to the 1-5 rating scale before rounding; an out-of-range label would
# otherwise distort the RMSE and break the 5x5 confusion matrix further down.
pred_sklearn = sklearn.predict(X_test)
pred_sklearn2 = [round(min(max(i, 1), 5)) for i in pred_sklearn]

# Compute the RMSE
rmse_sklearn = mean_squared_error(y_test, pred_sklearn2)**0.5
print('RMSE: ' + str(rmse_sklearn))

In [None]:
# Print auto-sklearn's description of the models it ended up with
bow_models_description = sklearn.show_models()
print(bow_models_description)

In [None]:
# Prediction results
print('Confusion Matrix')
rating_labels = [1, 2, 3, 4, 5]
# Clamp to the rating scale and pin the label set so the matrix is always 5x5
# and matches the hard-coded row/column names (rows: actual, columns: predicted).
pred_sklearn3 = [min(max(i, 1), 5) for i in pred_sklearn2]
print(pd.DataFrame(confusion_matrix(y_test, pred_sklearn3, labels=rating_labels),
                   index=rating_labels, columns=rating_labels))

# 10. Sentiment Analysis: Tf-Idf (Term Frequency — Inverse Document Frequency)

In [None]:
# Creating Tf-Idf: vocabulary capped at 70 features by max_features
tfidf = TfidfVectorizer(max_features=70)
tfidf_matrix = tfidf.fit_transform(data['Review Text'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old method on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    tfidf_features = tfidf.get_feature_names_out()
else:
    tfidf_features = tfidf.get_feature_names()

tfidf_data = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_features)
tfidf_data.head()

In [None]:
# Reuse the row labels of the earlier bag-of-words split so the Tf-Idf features
# cover exactly the same training and test reviews.
train_rows, test_rows = X_train.index, X_test.index
X_train_idf = tfidf_data.loc[train_rows, :]
X_test_idf = tfidf_data.loc[test_rows, :]

In [None]:
# Create the model: 3 minutes of search in total, at most 60 seconds per run
sklearn_idf = AutoSklearnRegressor(time_left_for_this_task=3*60, per_run_time_limit=60, n_jobs=-1)

# Fit the training data
sklearn_idf.fit(X_train_idf, y_train)

# Sprint Statistics
print(sklearn_idf.sprint_statistics())

# Predict the test data. The regressor's output is unbounded, so each value is
# clamped to the 1-5 rating scale before rounding; an out-of-range label would
# otherwise distort the RMSE and break the 5x5 confusion matrix further down.
pred_sklearn_idf = sklearn_idf.predict(X_test_idf)
pred_sklearn_idf2 = [round(min(max(i, 1), 5)) for i in pred_sklearn_idf]

# Compute the RMSE
rmse_sklearn_idf = mean_squared_error(y_test, pred_sklearn_idf2)**0.5
print('RMSE: ' + str(rmse_sklearn_idf))

In [None]:
# Print auto-sklearn's description of the models it ended up with
idf_models_description = sklearn_idf.show_models()
print(idf_models_description)

In [None]:
# Prediction results
print('Confusion Matrix')
rating_labels = [1, 2, 3, 4, 5]
# Clamp on both sides (the original only capped at 5) and pin the label set so
# the matrix is always 5x5 and matches the hard-coded row/column names
# (rows: actual, columns: predicted).
pred_sklearn_idf3 = [min(max(i, 1), 5) for i in pred_sklearn_idf2]
print(pd.DataFrame(confusion_matrix(y_test, pred_sklearn_idf3, labels=rating_labels),
                   index=rating_labels, columns=rating_labels))

# 11. AutoKeras: Text Regression

In [None]:
!pip install autokeras

In [None]:
import autokeras as ak

In [None]:
# AutoKeras is fed the raw review strings and ratings as NumPy arrays, using the
# same train/test rows as the earlier split.
train_reviews = data.loc[X_train.index, ['Review Text', 'Rating']]
test_reviews = data.loc[X_test.index, ['Review Text', 'Rating']]

X_train_ak = np.array(train_reviews['Review Text'])
y_train_ak = np.array(train_reviews['Rating'])
X_test_ak = np.array(test_reviews['Review Text'])
y_test_ak = np.array(test_reviews['Rating'])

In [None]:
# Text regressor that tries up to 3 candidate models
keras = ak.TextRegressor(max_trials=3, overwrite=True)

# Train for 30 epochs, holding out 20% of the training rows for validation
keras.fit(x=X_train_ak, y=y_train_ak, validation_split=0.2, epochs=30)

In [None]:
# Show the built models: export the model found by the AutoKeras search
# and print its layer-by-layer summary.
keras_export = keras.export_model()
keras_export.summary()

In [None]:
from itertools import chain

In [None]:
# Predict the test data and flatten the per-row prediction arrays into one list
pred_keras = keras.predict(X_test_ak)
pred_keras = list(chain.from_iterable(pred_keras))

# Clamp each prediction to the 1-5 rating scale, then round it to a whole rating
pred_keras2 = []
for raw_value in pred_keras:
    bounded = raw_value if raw_value <= 5 else 5
    bounded = bounded if bounded >= 1 else 1
    pred_keras2.append(round(bounded))

# Compute the RMSE
rmse_keras = mean_squared_error(y_test_ak, pred_keras2) ** 0.5
print('RMSE: ' + str(rmse_keras))

In [None]:
# Prediction results
print('Confusion Matrix')
rating_labels = [1, 2, 3, 4, 5]
# Pin the label set so the matrix is always 5x5 and matches the hard-coded
# row/column names, even if a rating never occurs (rows: actual, columns: predicted).
pd.DataFrame(confusion_matrix(y_test, pred_keras2, labels=rating_labels),
             index=rating_labels, columns=rating_labels)