# Sentiment Analysis for Customer Reviews



### Import necessary libraries

In [1]:
import spacy
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
import numpy as np
import string
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import joblib
import warnings
from collections import OrderedDict
warnings.filterwarnings('ignore')

### Download necessary NLTK resources

In [2]:
import nltk
# Fetch the NLTK data packages this notebook relies on. Each call downloads
# the package into the local nltk_data directory and returns True on success.
nltk.download('stopwords')  # stop-word lists read by clean_text via stopwords.words('english')
nltk.download('averaged_perceptron_tagger')  # presumably the model behind nltk.pos_tag -- confirm
nltk.download('wordnet')  # presumably the lexical database behind WordNetLemmatizer -- confirm
nltk.download('omw-1.4')  # NOTE(review): looks like a WordNet companion corpus -- confirm it is required
nltk.download('vader_lexicon')  # presumably the lexicon read by SentimentIntensityAnalyzer -- confirm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

### Initialize NLP and sentiment analysis tools

In [3]:
# Load the small English spaCy pipeline.
# NOTE(review): `nlp` is not referenced anywhere else in the visible code --
# confirm it is still needed before paying the load cost.
nlp = spacy.load("en_core_web_sm")
# Shared VADER analyzer; its polarity_scores() is applied to every review and
# summary further down.
sid = SentimentIntensityAnalyzer()

### Function to map POS tags to WordNet format for lemmatization

In [4]:
def get_wordnet_pos(pos_tag):
    """Return the WordNet part-of-speech constant for a POS tag string.

    Only the first character of the tag is inspected:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Any other tag (including an empty string) falls back to noun.
    """
    wordnet_pos_by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Slicing (rather than indexing) keeps the empty-string case on the
    # default branch instead of raising.
    return wordnet_pos_by_initial.get(pos_tag[:1], wordnet.NOUN)

### Function to clean the text (lowercase, remove punctuation, stopwords, lemmatization, etc.)

In [5]:
def clean_text(text):
    """Normalize a raw text string into a space-separated string of lemmas.

    Steps, in order:
      1. lowercase the text and split it on single spaces;
      2. strip leading/trailing punctuation from every token;
      3. drop empty tokens, tokens containing a digit, and English stopwords;
      4. POS-tag the remaining tokens and lemmatize each one with the WordNet
         part of speech derived from its tag;
      5. drop lemmas of length one or less.

    Parameters:
    - text: the raw string to clean.

    Returns:
    - The cleaned tokens joined by single spaces (possibly an empty string).
    """
    # A set gives O(1) membership tests; the list returned by
    # stopwords.words() would be scanned linearly for every token.
    stop_words = set(stopwords.words('english'))
    # One lemmatizer reused for all tokens instead of one instance per token.
    lemmatizer = WordNetLemmatizer()

    tokens = [word.strip(string.punctuation) for word in text.lower().split(" ")]
    tokens = [
        word
        for word in tokens
        # Empty tokens come from repeated spaces or punctuation-only words.
        # The final length filter would discard them anyway; removing them
        # here keeps blank entries out of the POS tagger.
        if word
        and not any(char.isdigit() for char in word)
        and word not in stop_words
    ]

    lemmas = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]
    return " ".join(lemma for lemma in lemmas if len(lemma) > 1)

### Mount Google Drive to access the sentiment data

In [8]:
# Colab-only: mount Google Drive at /content/drive so that the dataset path
# used when loading the reviews resolves.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Load and preprocess sentiment data

In [None]:
# Load the raw reviews, discard incomplete rows (real NaNs first, then rows
# holding the literal string 'None'), rename the columns to the names used in
# the rest of the notebook, and keep a reproducible 10% sample.
sentiment_df = (
    pd.read_json('/content/drive/MyDrive/churn_prediction/data/reviews/Automotive_for_train.json')
    .dropna()
    .replace(to_replace='None', value=np.nan)
    .dropna()
    .rename(columns={"score": "is_neg", "content": "review"})
    .sample(frac=0.1, random_state=14)
)

# Binarise the target: scores below 3 become 1 (negative), everything else 0.
sentiment_df['is_neg'] = sentiment_df['is_neg'].apply(lambda score: int(int(score) < 3))

### Clean review and summary text

In [10]:
# Run clean_text over both free-text columns, storing each result in a
# sibling "<column>_clean" column.
for raw_col in ("review", "summary"):
    sentiment_df[raw_col + "_clean"] = sentiment_df[raw_col].apply(clean_text)

### Apply VADER sentiment analysis to reviews and summaries

In [11]:

# Score the raw (uncleaned) text with VADER and append the four polarity
# values (neg / neu / pos / compound) as columns, suffixed "_rw" for the
# review and "_sm" for the summary.
for text_col, suffix in (("review", "_rw"), ("summary", "_sm")):
    polarity = sentiment_df[text_col].apply(sid.polarity_scores)
    polarity_df = pd.DataFrame(polarity.tolist(), index=polarity.index).add_suffix(suffix)
    sentiment_df = pd.concat([sentiment_df, polarity_df], axis=1)

### Add additional features (number of characters and words)

In [12]:
# Character counts of the raw review and summary.
sentiment_df["nb_chars"] = sentiment_df["review"].str.len()
sentiment_df["nb_chars_sm"] = sentiment_df["summary"].str.len()
# Word counts, where a "word" is any piece obtained by splitting on a single space.
sentiment_df["nb_words"] = sentiment_df["review"].str.split(" ").str.len()
sentiment_df["nb_words_sm"] = sentiment_df["summary"].str.split(" ").str.len()

### Train a Doc2Vec model on the cleaned reviews and summaries

In [13]:
# Token lists for every cleaned review and summary.
reviews = sentiment_df["review_clean"].str.split(" ")
summaries = sentiment_df["summary_clean"].str.split(" ")

# Reviews and summaries form a single training corpus; each document is
# tagged with its position in the combined list.
all_texts = [*reviews, *summaries]
documents = [
    TaggedDocument(words=tokens, tags=[doc_id])
    for doc_id, tokens in enumerate(all_texts)
]
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

### Generate Doc2Vec vectors for reviews and summaries

In [14]:
# Infer one vector per review, then one per summary, and expand each vector
# into its own set of prefixed columns.
doc2vec_df = (
    reviews.apply(model.infer_vector)
    .apply(pd.Series)
    .add_prefix("doc2vec_rw_vector_")
)
doc2vec_df_sm = (
    summaries.apply(model.infer_vector)
    .apply(pd.Series)
    .add_prefix("doc2vec_sm_vector_")
)

# Append both vector blocks (reviews first, summaries second) to the features.
sentiment_df = pd.concat([sentiment_df, doc2vec_df, doc2vec_df_sm], axis=1)

### Combine cleaned review and summary for TF-IDF vectorization

In [15]:
# Join the cleaned review and summary with a space, then keep only the first
# occurrence of every word (original order preserved).
merge_clean = sentiment_df["review_clean"] + " " + sentiment_df["summary_clean"]
merge_clean = merge_clean.apply(
    lambda joined: " ".join(dict.fromkeys(joined.split(" ")))
)

### Train and apply TF-IDF vectorizer

In [16]:
# Learn the vocabulary (terms appearing in at least 10 documents) and compute
# the TF-IDF weights for the merged text in a single step.
tfidf_model = TfidfVectorizer(min_df=10)
tfidf_result = tfidf_model.fit_transform(merge_clean).toarray()

# One "word_<term>" column per vocabulary term, aligned on the feature table's index.
tfidf_df = pd.DataFrame(
    tfidf_result,
    index=sentiment_df.index,
    columns=[f"word_{term}" for term in tfidf_model.get_feature_names_out()],
)
sentiment_df = pd.concat([sentiment_df, tfidf_df], axis=1)

### Remove duplicate columns

In [17]:
# Keep only the first occurrence of every column name.
first_occurrence = ~sentiment_df.columns.duplicated()
sentiment_df = sentiment_df.loc[:, first_occurrence].copy()

### Train-test split for the sentiment prediction model

In [18]:
# Target column, plus the columns that must not be used as model inputs
# (the target itself and every free-text column).
label = "is_neg"
ignore_cols = [label, "review", "review_clean", "summary", "summary_clean"]

# Every remaining column is a feature; the original column order is kept.
features = sentiment_df.columns[~sentiment_df.columns.isin(ignore_cols)].tolist()

# Reproducible 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(
    sentiment_df[features],
    sentiment_df[label],
    test_size=0.20,
    random_state=42,
)


### Train XGBoost model for sentiment classification

In [19]:
# Hyper-parameters of the binary sentiment classifier.
xgb_params = {
    "objective": "binary:logistic",
    "n_jobs": -1,
    "n_estimators": 1000,
    "max_depth": 10,
}
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

### Save trained models

In [20]:
# Persist the three fitted artefacts that sentiment_feedback() takes as
# arguments. The paths are relative, so the files are written to the current
# working directory.
joblib.dump(tfidf_model, 'sentimenta_tfidf_model')  # fitted TF-IDF vectorizer
joblib.dump(xgb, 'sentimenta_xgb_model.pkl')  # trained XGBoost classifier
model.save('sentimenta_doc2vec_model')  # trained Doc2Vec model

### Function for sentiment analysis on reviews and summary

In [21]:

def sentiment_feedback(sentiment_df, doc2vec_model, tfidf_vectorizer, xgb_model):
    """
    Builds the sentiment features for customer reviews and summaries and
    predicts their sentiment with a trained XGBoost model.

    The feature pipeline mirrors the one used for training: cleaned text,
    VADER polarity scores, character/word counts, Doc2Vec vectors and TF-IDF
    weights of the merged, de-duplicated cleaned text.

    The input DataFrame is never modified; all work happens on a copy.

    Parameters:
    - sentiment_df: DataFrame containing 'review' and 'summary' text columns.
    - doc2vec_model: Trained Doc2Vec model for vector representation of text.
    - tfidf_vectorizer: Trained TF-IDF vectorizer.
    - xgb_model: Trained XGBoost model for sentiment prediction.

    Returns:
    - output_df: DataFrame with one row per input row, in input order, on a
      fresh 0..n-1 index (the input index is not carried over). Columns:
        * 'sentiment'  - class predicted by xgb_model.predict
        * 'sent_score' - highest class probability from predict_proba, i.e.
                         the model's confidence in the predicted class
      An input with no rows yields an empty DataFrame with these two columns.

    Raises:
    - ValueError: if the model exposes the feature names it was trained on
      and some of them cannot be produced from the input.
    """
    output_columns = ['sentiment', 'sent_score']

    # Nothing to score: return early instead of failing inside the
    # vectorizers or the model on a zero-row frame.
    if len(sentiment_df.index) == 0:
        return pd.DataFrame(columns=output_columns)

    # Copy so the helper columns added below never leak into the caller's frame.
    features_df = sentiment_df.copy()

    # Clean the text data in the 'review' and 'summary' columns
    features_df["review_clean"] = features_df["review"].apply(clean_text)
    features_df["summary_clean"] = features_df["summary"].apply(clean_text)

    # VADER polarity scores of the raw text: neg/neu/pos/compound, suffixed
    # "_rw" for the review and "_sm" for the summary
    for text_col, suffix in (("review", "_rw"), ("summary", "_sm")):
        polarity = features_df[text_col].apply(sid.polarity_scores)
        polarity_df = pd.DataFrame(polarity.tolist(), index=features_df.index).add_suffix(suffix)
        features_df = pd.concat([features_df, polarity_df], axis=1)

    # Number of characters and (single-space separated) words
    features_df["nb_chars"] = features_df["review"].apply(len)
    features_df["nb_chars_sm"] = features_df["summary"].apply(len)
    features_df["nb_words"] = features_df["review"].apply(lambda x: len(x.split(" ")))
    features_df["nb_words_sm"] = features_df["summary"].apply(lambda x: len(x.split(" ")))

    # Doc2Vec vectors inferred from the cleaned text, one column per dimension
    for clean_col, prefix in (
        ("review_clean", "doc2vec_rw_vector_"),
        ("summary_clean", "doc2vec_sm_vector_"),
    ):
        vectors = [
            doc2vec_model.infer_vector(text.split(" "))
            for text in features_df[clean_col]
        ]
        vectors_df = pd.DataFrame(np.vstack(vectors), index=features_df.index).add_prefix(prefix)
        features_df = pd.concat([features_df, vectors_df], axis=1)

    # Merge cleaned review and summary, keeping the first occurrence of each word
    merge_clean = features_df["review_clean"] + " " + features_df["summary_clean"]
    merge_clean = merge_clean.apply(lambda text: " ".join(dict.fromkeys(text.split(" "))))

    # TF-IDF weights as "word_<term>" columns
    tfidf_df = pd.DataFrame(
        tfidf_vectorizer.transform(merge_clean).toarray(),
        index=features_df.index,
        columns=["word_" + str(term) for term in tfidf_vectorizer.get_feature_names_out()],
    )
    features_df = pd.concat([features_df, tfidf_df], axis=1)

    # Remove duplicate columns (first occurrence wins)
    features_df = features_df.loc[:, ~features_df.columns.duplicated()]

    # Prefer the exact feature list (and order) the model was trained on, so
    # extra columns in the input -- a label or an id, for instance -- are not
    # passed to the model. Fall back to "everything except the text columns"
    # when the model does not expose its feature names.
    trained_features = getattr(xgb_model, "feature_names_in_", None)
    if trained_features is not None:
        trained_features = list(trained_features)
        missing = [name for name in trained_features if name not in features_df.columns]
        if missing:
            raise ValueError(
                f"Cannot build {len(missing)} feature(s) expected by the model, e.g. {missing[:5]}"
            )
        X_test = features_df[trained_features]
    else:
        ignore_cols = ["review", "review_clean", "summary", "summary_clean"]
        X_test = features_df[[c for c in features_df.columns if c not in ignore_cols]]

    # Predicted class and the probability of the winning class for every row
    class_probabilities = xgb_model.predict_proba(X_test)
    confidence = [np.max(row) for row in class_probabilities]
    predicted_class = list(xgb_model.predict(X_test))

    # Prepare output DataFrame
    output_df = pd.DataFrame(list(zip(predicted_class, confidence)), columns=output_columns)

    return output_df