# Importing libraries

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Text preprocessing
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Machine learning and evaluation
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix

# For handling imbalanced data (SMOTE)
from imblearn.over_sampling import SMOTE

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Download necessary NLTK resources (only once)
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Text Preprocessing

In [None]:
!pip install spacy




In [None]:
# Import necessary libraries
import pandas as pd
import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

# Download necessary NLTK data files
nltk.download('punkt')
nltk.download('stopwords')

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Load the dataset
df = pd.read_csv('/content/reviews_segment.csv', delimiter=',', encoding='ISO-8859-1', quotechar="'", engine='python', on_bad_lines='skip')

# Display the first few rows
print(df.head())

# Define text preprocessing functions

def preprocess_text(text):
    # 1. Tokenization (Sentence and Word Level)
    sentences = sent_tokenize(text)  # Sentence tokenization
    word_tokens = [word_tokenize(sentence) for sentence in sentences]  # Word tokenization for each sentence

    # 2. Stopword Removal
    stop_words = set(stopwords.words('english'))
    filtered_words = [[word for word in words if word.lower() not in stop_words and word.isalpha()] for words in word_tokens]

    # 3. Lemmatization, POS Tagging, and Named Entity Recognition
    lemmatized_text = []
    pos_tags = []
    named_entities = []

    for sentence in sentences:
        doc = nlp(sentence)  # Process sentence with spaCy
        for token in doc:
            # Lemmatization
            lemmatized_text.append(token.lemma_)
            # POS Tagging
            pos_tags.append((token.text, token.pos_))
            # Named Entity Recognition
            if token.ent_type_:
                named_entities.append((token.text, token.ent_type_))

    return {
        'sentences': sentences,
        'filtered_words': filtered_words,
        'lemmatized_text': lemmatized_text,
        'pos_tags': pos_tags,
        'named_entities': named_entities
    }

# Apply the function to the 'review_text' column
df['preprocessed_reviews'] = df['review_text'].apply(lambda x: preprocess_text(str(x)))

# Display sample processed data
print(df[['review_text', 'preprocessed_reviews']].head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                                                                                                                                                                                                                                                                               review_id  \
R10019MUX6F9A  B00006881R AWNC1GQ75W8K8  Works as advertised    2002-12-17 Neil            TeleZapper TZ 900 (Office Product)                 "'I\'ve had this product for about a month and ...  which to me is more desirable than having to t... 7  7  4 0 0 0                  "('R10019MUX6F9A'        'B00006881R'   
R100523NBIQIEV B000070MRB A2DKAPBHZ5DERR Neutral                2004-06-07 S. Barnes       Game Programming Starter Kit 6.0 (CD-ROM)          "'If you plan on getting this program go to htt...  download the free compiler and learn how to us... -1 -1 3 0 0 0                  "('R100523NBIQIEV'       'B000070MRB'   
R1007LULU4W7YH B000E5E6KG A1031ZUDNBOON0 More Junk f

# 2. Dependency Parsing & Syntactic Analysis

In [5]:
import spacy
from nltk import Tree

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")

# Function to convert dependency parse to a Tree structure
def tok_format(tok):
    return f'{tok.orth_} ({tok.dep_})'

def to_nltk_tree(node):
    # Recursively builds a Treebank structure for each token
    if node.n_lefts + node.n_rights > 0:
        return Tree(tok_format(node), [to_nltk_tree(child) for child in node.children])
    else:
        return tok_format(node)

# Dependency Parsing & Syntactic Analysis Function
def dependency_syntactic_analysis(text):
    # Parse the text with spaCy
    doc = nlp(text)

    # 1. Dependency Parsing: Extract relationships between words
    dependency_parsing = [(token.text, token.dep_, token.head.text) for token in doc]

    # 2. Syntactic Parsing (Treebank structure): Convert to tree format
    syntactic_tree = [to_nltk_tree(sent.root) for sent in doc.sents]

    return {
        'dependency_parsing': dependency_parsing,
        'syntactic_tree': syntactic_tree
    }

# Apply the function to the 'review_text' column
df['syntax_analysis'] = df['review_text'].apply(lambda x: dependency_syntactic_analysis(str(x)))

# Display results for a single row
print(df[['review_text', 'syntax_analysis']].head(1))


                                                                                                                                                                                                                                                  review_text  \
R10019MUX6F9A B00006881R AWNC1GQ75W8K8 Works as advertised 2002-12-17 Neil TeleZapper TZ 900 (Office Product) "'I\'ve had this product for about a month and ... which to me is more desirable than having to ta... 7 7 4 0 0 0 "('R10019MUX6F9A'      'Neil'   

                                                                                                                                                                                                                                                                                     syntax_analysis  
R10019MUX6F9A B00006881R AWNC1GQ75W8K8 Works as advertised 2002-12-17 Neil TeleZapper TZ 900 (Office Product) "'I\'ve had this product for about a month and ... which to me is more desirable

# 3. Sentiment Classification

In [6]:
# Convert 'customer_review_rating' to numeric, setting errors='coerce' to handle non-numeric values
df['customer_review_rating'] = pd.to_numeric(df['customer_review_rating'], errors='coerce')

# Drop rows where 'customer_review_rating' is NaN after conversion (non-numeric entries)
df = df.dropna(subset=['customer_review_rating'])

# Convert the ratings to integer
df.loc[:, 'customer_review_rating'] = df['customer_review_rating'].astype(int)

# Define sentiment labels based on customer_review_rating
# Assuming ratings 3 and above as positive (1), below 3 as negative (0)
df.loc[:, 'sentiment'] = df['customer_review_rating'].apply(lambda x: 1 if x >= 3 else 0)

# Check label distribution (optional, to verify balance)
print("Sentiment Label Distribution:\n", df['sentiment'].value_counts())

# Check unique values in 'customer_review_rating' to understand the range of ratings
print("Unique values in customer_review_rating:\n", df['customer_review_rating'].unique())

# Assign customer_review_rating directly as sentiment labels
df['sentiment'] = df['customer_review_rating']

# Check the updated sentiment label distribution
sentiment_counts = df['sentiment'].value_counts()
print("Sentiment Label Distribution:\n", sentiment_counts)


Sentiment Label Distribution:
 sentiment
0    4781
Name: count, dtype: int64
Unique values in customer_review_rating:
 [0. 1.]
Sentiment Label Distribution:
 sentiment
1.0    3394
0.0    1387
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'sentiment'] = df['customer_review_rating'].apply(lambda x: 1 if x >= 3 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'] = df['customer_review_rating']


In [7]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['review_text'], df['sentiment'], test_size=0.2, random_state=42)

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Transform the text data into TF-IDF features
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize and train the Logistic Regression model
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = log_reg.predict(X_test_tfidf)

# Evaluate the model
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

         0.0       0.48      0.17      0.25       277
         1.0       0.73      0.93      0.82       680

    accuracy                           0.71       957
   macro avg       0.61      0.55      0.53       957
weighted avg       0.66      0.71      0.65       957

Confusion Matrix:
 [[ 47 230]
 [ 50 630]]


In [8]:
# Data manipulation
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['review_text'], df['sentiment'], test_size=0.2, random_state=42)

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Transform the text data into TF-IDF features
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Apply SMOTE to balance the classes in the training set
smote = SMOTE(random_state=42)
X_train_tfidf_sm, y_train_sm = smote.fit_resample(X_train_tfidf, y_train)

# Define and train Logistic Regression model
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_tfidf_sm, y_train_sm)
y_pred_log_reg = log_reg.predict(X_test_tfidf)

# Define and train Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf_sm, y_train_sm)
y_pred_nb = nb_model.predict(X_test_tfidf)

# Evaluation for Logistic Regression
print("Logistic Regression Model:")
print("Classification Report:\n", classification_report(y_test, y_pred_log_reg))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log_reg))

# Evaluation for Naive Bayes
print("\nNaive Bayes Model:")
print("Classification Report:\n", classification_report(y_test, y_pred_nb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))


Logistic Regression Model:
Classification Report:
               precision    recall  f1-score   support

         0.0       0.40      0.52      0.45       277
         1.0       0.78      0.68      0.73       680

    accuracy                           0.64       957
   macro avg       0.59      0.60      0.59       957
weighted avg       0.67      0.64      0.65       957

Confusion Matrix:
 [[145 132]
 [216 464]]

Naive Bayes Model:
Classification Report:
               precision    recall  f1-score   support

         0.0       0.39      0.61      0.48       277
         1.0       0.80      0.61      0.69       680

    accuracy                           0.61       957
   macro avg       0.59      0.61      0.58       957
weighted avg       0.68      0.61      0.63       957

Confusion Matrix:
 [[170 107]
 [264 416]]


In [92]:
from sklearn.model_selection import GridSearchCV

# Logistic Regression hyperparameter tuning
log_reg_params = {
    'C': [0.1, 1, 10],  # Regularization strength
    'solver': ['liblinear', 'saga'],  # Solvers
    'max_iter': [1000, 2000]
}

log_reg_grid_search = GridSearchCV(LogisticRegression(random_state=42), log_reg_params, cv=5, scoring='accuracy')
log_reg_grid_search.fit(X_train_tfidf_sm, y_train_sm)

# Best parameters and evaluation
print("Best parameters for Logistic Regression:", log_reg_grid_search.best_params_)
y_pred_log_reg = log_reg_grid_search.predict(X_test_tfidf)

print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_log_reg))
print(confusion_matrix(y_test, y_pred_log_reg))


  and should_run_async(code)


Best parameters for Logistic Regression: {'C': 10, 'max_iter': 1000, 'solver': 'saga'}
Logistic Regression Classification Report:
              precision    recall  f1-score   support

         0.0       0.39      0.49      0.44       277
         1.0       0.77      0.69      0.73       680

    accuracy                           0.63       957
   macro avg       0.58      0.59      0.58       957
weighted avg       0.66      0.63      0.64       957

[[137 140]
 [212 468]]


In [93]:
# Naive Bayes hyperparameter tuning
nb_params = {
    'alpha': [0.5, 1.0, 1.5],  # Smoothing parameter
}

nb_grid_search = GridSearchCV(MultinomialNB(), nb_params, cv=5, scoring='accuracy')
nb_grid_search.fit(X_train_tfidf_sm, y_train_sm)

# Best parameters and evaluation
print("Best parameters for Naive Bayes:", nb_grid_search.best_params_)
y_pred_nb = nb_grid_search.predict(X_test_tfidf)

print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_nb))
print(confusion_matrix(y_test, y_pred_nb))


Best parameters for Naive Bayes: {'alpha': 0.5}
Naive Bayes Classification Report:
              precision    recall  f1-score   support

         0.0       0.39      0.59      0.47       277
         1.0       0.79      0.62      0.70       680

    accuracy                           0.61       957
   macro avg       0.59      0.61      0.58       957
weighted avg       0.67      0.61      0.63       957

[[163 114]
 [255 425]]


  and should_run_async(code)


In [94]:
from sklearn.svm import SVC

# SVM model with linear kernel
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_tfidf_sm, y_train_sm)
y_pred_svm = svm_model.predict(X_test_tfidf)

print("SVM Classification Report:")
print(classification_report(y_test, y_pred_svm))
print(confusion_matrix(y_test, y_pred_svm))


  and should_run_async(code)


SVM Classification Report:
              precision    recall  f1-score   support

         0.0       0.38      0.52      0.44       277
         1.0       0.77      0.66      0.71       680

    accuracy                           0.62       957
   macro avg       0.58      0.59      0.58       957
weighted avg       0.66      0.62      0.63       957

[[144 133]
 [232 448]]


In [97]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Confusion Matrix for Logistic Regression (example)
# Format: [[TN, FP], [FN, TP]]
conf_matrix = np.array([[137, 140], [212, 468]])

# Extract True Positives (TP), False Positives (FP), False Negatives (FN), True Negatives (TN)
TN, FP, FN, TP = conf_matrix.ravel()

# Calculate Precision, Recall, and F1-Score for class 1 (positive class)
precision = TP / (TP + FP)  # Precision for positive class
recall = TP / (TP + FN)     # Recall for positive class
f1_score = 2 * (precision * recall) / (precision + recall)  # F1-Score for positive class

# Output the metrics
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1_score:.2f}")

# Optionally, calculate for class 0 (negative class) as well
precision_class_0 = TN / (TN + FN)  # Precision for negative class
recall_class_0 = TN / (TN + FP)     # Recall for negative class
f1_score_class_0 = 2 * (precision_class_0 * recall_class_0) / (precision_class_0 + recall_class_0)  # F1-Score for negative class

# Output the metrics for class 0
print(f"Precision for class 0: {precision_class_0:.2f}")
print(f"Recall for class 0: {recall_class_0:.2f}")
print(f"F1-Score for class 0: {f1_score_class_0:.2f}")


Precision: 0.77
Recall: 0.69
F1-Score: 0.73
Precision for class 0: 0.39
Recall for class 0: 0.49
F1-Score for class 0: 0.44


  and should_run_async(code)


# Boolean Search & Rating Filtering

In [13]:
# Check for reviews containing the word "good"
contains_good = df[df['review_text'].str.contains('good', case=False, na=False)]
print("Reviews containing 'good':\n", contains_good[['review_text', 'customer_review_rating', 'sentiment']])

# Check for reviews containing the word "quality"
contains_quality = df[df['review_text'].str.contains('quality', case=False, na=False)]
print("Reviews containing 'quality':\n", contains_quality[['review_text', 'customer_review_rating', 'sentiment']])


Reviews containing 'good':
                                                                                                                                                                                                                                                                                                                            review_text  \
R10H8M9OOI40NB B000B0WO4O A1LLNZM7WSFP2M Good Info On Wal-Mart              2008-04-06 JHW                                      Frontline: Is Wal-Mart Good for America? (DVD)     Many of us know what Wal-Mart has done to small... 1  2  4 0 1 0 "('R10H8M9OOI40NB' 'B000B0WO4O'   'Frontline: Is Wal-Mart Good for America? (DVD)'   
R13XO0KVFKY6KD 0060875496 APIFRHB3KCB89  Fair Tax                           2007-09-11 Paul B. Jones                            The Fair Tax Book: Saying Goodbye to the Income... This is a very clear and concise book on how th... 2  5  5 0 1 0 "('R13XO0KVFKY6KD' '0060875496'   'The Fair Tax Book: Saying Goodbye

In [16]:
keyword_only_results = df[df['review_text'].str.contains("good", case=False, na=False)]
print("Results with only the keyword 'good':\n", keyword_only_results[['review_text', 'customer_review_rating', 'sentiment']])


Results with only the keyword 'good':
                                                                                                                                                                                                                                                                                                                            review_text  \
R10H8M9OOI40NB B000B0WO4O A1LLNZM7WSFP2M Good Info On Wal-Mart              2008-04-06 JHW                                      Frontline: Is Wal-Mart Good for America? (DVD)     Many of us know what Wal-Mart has done to small... 1  2  4 0 1 0 "('R10H8M9OOI40NB' 'B000B0WO4O'   'Frontline: Is Wal-Mart Good for America? (DVD)'   
R13XO0KVFKY6KD 0060875496 APIFRHB3KCB89  Fair Tax                           2007-09-11 Paul B. Jones                            The Fair Tax Book: Saying Goodbye to the Income... This is a very clear and concise book on how th... 2  5  5 0 1 0 "('R13XO0KVFKY6KD' '0060875496'   'The Fair Tax Book: Say

In [17]:
print("Unique values in customer_review_rating:\n", df['customer_review_rating'].unique())
print("Unique values in sentiment:\n", df['sentiment'].unique())


Unique values in customer_review_rating:
 [0. 1.]
Unique values in sentiment:
 [0. 1.]


In [19]:
# Adjusted Boolean Search Function
def boolean_search_adjusted(df, query, rating_filter=None, sentiment_filter=None):
    # Apply keyword search
    keyword_filtered = df[df['review_text'].str.contains(query, case=False, na=False)]

    # Apply rating filter if specified and available
    if rating_filter in keyword_filtered['customer_review_rating'].values:
        keyword_filtered = keyword_filtered[keyword_filtered['customer_review_rating'] == rating_filter]

    # Apply sentiment filter if specified
    if sentiment_filter is not None:
        keyword_filtered = keyword_filtered[keyword_filtered['sentiment'] == sentiment_filter]

    return keyword_filtered

# Test the adjusted function
query = "good"
filtered_reviews = boolean_search_adjusted(df, query, sentiment_filter=1)

# Display the results
print("Filtered Reviews with adjusted criteria:\n", filtered_reviews[['review_text', 'customer_review_rating', 'sentiment']])
print("\nTotal Matches:", len(filtered_reviews))


Filtered Reviews with adjusted criteria:
                                                                                                                                                                                                                                                                                                          review_text  \
R10H8M9OOI40NB B000B0WO4O A1LLNZM7WSFP2M Good Info On Wal-Mart        2008-04-06 JHW                          Frontline: Is Wal-Mart Good for America? (DVD)     Many of us know what Wal-Mart has done to small... 1  2  4 0 1 0 "('R10H8M9OOI40NB' 'B000B0WO4O'   'Frontline: Is Wal-Mart Good for America? (DVD)'   
R13XO0KVFKY6KD 0060875496 APIFRHB3KCB89  Fair Tax                     2007-09-11 Paul B. Jones                The Fair Tax Book: Saying Goodbye to the Income... This is a very clear and concise book on how th... 2  5  5 0 1 0 "('R13XO0KVFKY6KD' '0060875496'   'The Fair Tax Book: Saying Goodbye to the Inc...   
R1BHW5ZWR5ZDO6 B0001YH

# N-gram Generation

In [41]:
# Filter reviews with ratings greater than or equal to 3
filtered_df = df[df['rating'] >= 3]

# Display the result
print(filtered_df.shape)
print(filtered_df.head())


(1, 3)
     customer_id                                        review_text  rating
0  R10019MUX6F9A  I've had this product for about a month and it...     7.0


In [42]:
# Check for missing values in the rating column
missing_ratings = df['rating'].isnull().sum()
print(f"Missing ratings: {missing_ratings}")


Missing ratings: 0


In [48]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import nltk

# Sample reviews from dataset
data = {
    'customer_id': ['R10019MUX6F9A', 'R100523NBIQIEV', 'R1007LULU4W7YH', 'R100CNB1MEHAG3', 'R100D2CV4WK16J'],
    'review_text': [
        "I've had this product for about a month and it works great.",
        "If you plan on getting this program, go to http://example.com",
        "Look at the review by C-Net and you start to realize it's not worth the price.",
        "Multi stink in all new CoD ... well ever since the update.",
        "If you remember the original Mr. Salty pretzels, these are not the same."
    ],
    'rating': [7.0, 5.0, -1.0, 0.0, 2.0]
}


# Create DataFrame from the sample data
df = pd.DataFrame(data)

# Step 1: Filter reviews with ratings >= 0 (positive or neutral reviews)
filtered_df = df[df['rating'] >= 0]

# Step 2: Check if we have any missing or invalid ratings (if necessary)
missing_ratings = filtered_df['rating'].isnull().sum()

# Step 3: Define a function to generate N-grams (bigrams, trigrams, etc.)
def generate_ngrams(texts, ngram_range=(2, 3)):
    """
    Function to generate N-grams from a list of texts.
    :param texts: List of review texts.
    :param ngram_range: Tuple (min_n, max_n) defining the range of N-grams to generate.
    :return: DataFrame with the generated N-grams.
    """
    vectorizer = CountVectorizer(ngram_range=(1, 4))  # This will generate unigrams, bigrams, trigrams, and 4-grams
    ngrams = vectorizer.fit_transform(texts)
    ngrams_df = pd.DataFrame(ngrams.toarray(), columns=vectorizer.get_feature_names_out())
    return ngrams_df

# Step 4: Generate bigrams and trigrams from the 'review_text' column of the filtered DataFrame
ngram_range = (2, 3)  # Bigrams and trigrams
ngram_df = generate_ngrams(filtered_df['review_text'], ngram_range=ngram_range)

# Step 5: Display the filtered reviews and the generated N-grams
print("Filtered Reviews:")
print(filtered_df[['customer_id', 'review_text', 'rating']])

print("\nGenerated N-grams (Bigrams and Trigrams):")
print(ngram_df)


Filtered Reviews:
      customer_id                                        review_text  rating
0   R10019MUX6F9A  I've had this product for about a month and it...     7.0
1  R100523NBIQIEV  If you plan on getting this program, go to htt...     5.0
3  R100CNB1MEHAG3  Multi stink in all new CoD ... well ever since...     0.0
4  R100D2CV4WK16J  If you remember the original Mr. Salty pretzel...     2.0

Generated N-grams (Bigrams and Trigrams):
   about  about month  about month and  about month and it  all  all new  \
0      1            1                1                   1    0        0   
1      0            0                0                   0    0        0   
2      0            0                0                   0    1        1   
3      0            0                0                   0    0        0   

   all new cod  all new cod well  and  and it  ...  well ever since the  \
0            0                 0    1       1  ...                    0   
1            0         

# Topic Modeling

In [49]:
pip install gensim pyLDAvis


Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


In [51]:
import pandas as pd
import gensim
from gensim import corpora
from sklearn.feature_extraction.text import CountVectorizer
import pyLDAvis.gensim_models

# Step 1: Prepare the reviews (Assuming you already have the filtered reviews)
reviews = [
    "I've had this product for about a month and it works great.",
    "If you plan on getting this program, go to http://example.com",
    "Multi stink in all new CoD ... well ever since the update.",
    "If you remember the original Mr. Salty pretzels, these are not the same."
]

# Step 2: Tokenize and preprocess the text (remove stopwords, lowercase, etc.)
# We can also perform lemmatization here if needed (using SpaCy or NLTK)
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 3))  # Unigrams, bigrams, and trigrams
X = vectorizer.fit_transform(reviews)

# Step 3: Convert the text to a format suitable for LDA (bag-of-words)
# Use the vectorizer to get the vocabulary
vocabulary = vectorizer.get_feature_names_out()
corpus = [text.split() for text in reviews]

# Step 4: Create the Gensim Dictionary
dictionary = corpora.Dictionary(corpus)

# Step 5: Create the bag-of-words format
corpus_bow = [dictionary.doc2bow(text) for text in corpus]

# Step 6: Train the LDA model
lda_model = gensim.models.LdaMulticore(corpus_bow, num_topics=3, id2word=dictionary, passes=15)

# Step 7: Display topics
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

# Step 8: Visualize topics with pyLDAvis
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus_bow, dictionary)
pyLDAvis.display(vis)


  and should_run_async(code)


(0, '0.051*"Multi" + 0.051*"update." + 0.051*"..." + 0.051*"the" + 0.051*"CoD"')
(1, '0.064*"this" + 0.038*"go" + 0.038*"plan" + 0.038*"getting" + 0.038*"to"')
(2, '0.084*"the" + 0.048*"you" + 0.048*"If" + 0.048*"not" + 0.048*"these"')


# Result Evaluation

In [72]:
import re

def clean_review_text(text):
    # Extract review text by removing non-textual elements, such as product info or identifiers
    # This is a simple regex example to keep words after a product title and possible metadata
    clean_text = re.sub(r'\d{5,}', '', text)  # Remove long sequences of numbers (e.g., product IDs)
    clean_text = re.sub(r'[^\w\s]', '', clean_text)  # Remove punctuation
    return clean_text.strip()

def baseline_search(query, reviews_df):
    # Split the query into terms, ensuring no tuple wrapping
    query_terms = query.split(":")[1].split()  # Extract terms after the colon (e.g., "poor" from "audio quality:poor")
    print(f"Query Terms: {query_terms}")  # Debugging step to see the extracted terms

    # Clean and process review text
    reviews_df['clean_review_text'] = reviews_df['review_text'].apply(clean_review_text)

    # Checking the cleaned review text to see what we are matching against
    print(f"Sample of Cleaned Review Text:\n{reviews_df['clean_review_text'].head()}")  # Check the first few cleaned review texts

    # Checking review matches using substring match (lowercase the text to avoid case issues)
    relevant_reviews = reviews_df[reviews_df['clean_review_text'].apply(lambda x: all(term in x.lower() for term in query_terms))]

    # Debugging to see some reviews
    print(f"Number of Reviews Matched: {len(relevant_reviews)}")
    if len(relevant_reviews) > 0:
        print(f"Sample Matched Reviews:\n{relevant_reviews['clean_review_text'].head()}")  # Debugging step to see a few matched reviews

    return relevant_reviews

# Example for multiple queries
queries = [
    "audio quality:poor",
    "wifi signal:strong",
    "mouse button:click problem",
    "gps map:useful",
    "image quality:sharp"
]

# Apply search for each query
for query in queries:
    relevant_reviews_baseline = baseline_search(query, filtered_reviews)
    print(f"Retrieved {len(relevant_reviews_baseline)} reviews for query '{query}'\n")


Query Terms: ['poor']
Sample of Cleaned Review Text:
R10H8M9OOI40NB  B000B0WO4O  A1LLNZM7WSFP2M  Good Info On Wal-Mart  2008-04-06  JHW                          Frontline: Is Wal-Mart Good for America? (DVD)                                                                                         Many of us know what Wal-Mart has done to small retail across America. What I did not know until I saw this documentary was what it has done to manufacturing in America with its incredible determination to have as much of everything as possible made in China. The world will look back on this time as one of great change and on Wal-Mart as the key reagent in that change.  1   2   4  0  1  0  "('R10H8M9OOI40NB'  'B000B0WO4O'            Frontline Is WalMart Good for America DVD
R13XO0KVFKY6KD  0060875496  APIFRHB3KCB89   Fair Tax               2007-09-11  Paul B. Jones                The Fair Tax Book: Saying Goodbye to the Income Tax and the IRS (Paperback)                                          

  and should_run_async(code)


In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def m1_tfidf_cosine(query, reviews_df):
    # Vectorize the reviews using TF-IDF
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(reviews_df['review_text'])

    # Vectorize the query
    query_tfidf = tfidf_vectorizer.transform([query])

    # Compute cosine similarities between the query and reviews
    cosine_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()

    # Rank the reviews by similarity
    ranked_reviews = reviews_df.iloc[cosine_similarities.argsort()[::-1]]

    return ranked_reviews, cosine_similarities

# List of queries
queries = [
    "audio quality:poor",
    "wifi signal:strong",
    "mouse button:click problem",
    "gps map:useful",
    "image quality:sharp"
]

# Loop through each query and get the ranked reviews
for query in queries:
    ranked_reviews_m1, cosine_similarities = m1_tfidf_cosine(query, filtered_reviews)
    print(f"Retrieved {len(ranked_reviews_m1)} reviews for query '{query}' using M1 (TF-IDF + Cosine Similarity)\n")
    print(f"Sample Matched Reviews for query '{query}':\n")
    print(ranked_reviews_m1[['review_text']].head())  # Display the top 5 matched reviews for each query
    print("\n" + "-"*80 + "\n")  # Separator between queries


  and should_run_async(code)


Retrieved 8 reviews for query 'audio quality:poor' using M1 (TF-IDF + Cosine Similarity)

Sample Matched Reviews for query 'audio quality:poor':

                                                                                                                                                                                                                                                                                                         review_text
R1V11IHN5HSVP9 0060897376 A34J1DRI3F9AF0 wonderful                    2008-04-29 Gayle S. Graziano            Why Good People Do Bad Things: How to Stop Bein... Everyone should read this book!!!Espcially BEFO... 2  3  5 0 1 0 "('R1V11IHN5HSVP9' '0060897376'   'Why Good People Do Bad Things: How to Stop B...
R1SJ1ZZTERYYTG 0967674727 AJE825U0FF388  breakthrough on the new skis 2009-12-01 "'B. Aldrich ""Billy Bob""'" Breakthrough on the New Skis 3 Ed: Say Goodbye ... This is great stuff.  Now skiing makes sense.  ... -1 -1 4 0 1 0 "('R1SJ1ZZ

In [85]:
import re
import gensim
from gensim import corpora
from sklearn.metrics.pairwise import cosine_similarity

# Clean review text function (same as before)
def clean_review_text(text):
    clean_text = re.sub(r'\d{5,}', '', text)  # Remove long sequences of numbers (e.g., product IDs)
    clean_text = re.sub(r'[^\w\s]', '', clean_text)  # Remove punctuation
    return clean_text.strip()

# Method 2: LDA + Cosine Similarity
def m2_lda_cosine(query, reviews_df, num_topics=5):
    # Clean and preprocess the reviews
    reviews_df['clean_review_text'] = reviews_df['review_text'].apply(clean_review_text)

    # Tokenize the reviews
    reviews_df['tokens'] = reviews_df['clean_review_text'].apply(lambda x: x.split())

    # Create a dictionary and corpus for the LDA model
    dictionary = corpora.Dictionary(reviews_df['tokens'])
    corpus = [dictionary.doc2bow(text) for text in reviews_df['tokens']]

    # Train the LDA model on the reviews
    lda_model = gensim.models.LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, passes=10)

    # Clean and preprocess the query
    query_clean = clean_review_text(query)
    query_tokens = query_clean.split()

    # Create a bag-of-words representation for the query
    query_bow = dictionary.doc2bow(query_tokens)

    # Get the topic distribution for the query
    query_topics = lda_model[query_bow]
    query_topic_dist = [0] * num_topics
    for topic_num, prob in query_topics:
        query_topic_dist[topic_num] = prob

    # Compute cosine similarities between the query topic distribution and each review topic distribution
    review_topic_dists = []
    for review_bow in corpus:
        review_topics = lda_model[review_bow]
        review_topic_dist = [0] * num_topics
        for topic_num, prob in review_topics:
            review_topic_dist[topic_num] = prob
        review_topic_dists.append(review_topic_dist)

    # Calculate cosine similarity between the query and each review
    cosine_similarities = cosine_similarity([query_topic_dist], review_topic_dists).flatten()

    # Rank the reviews by cosine similarity
    ranked_reviews = reviews_df.iloc[cosine_similarities.argsort()[::-1]]

    return ranked_reviews, cosine_similarities

# List of queries
queries = [
    "audio quality:poor",
    "wifi signal:strong",
    "mouse button:click problem",
    "gps map:useful",
    "image quality:sharp"
]

# Loop through each query and get the ranked reviews for Method 2 (LDA + Cosine Similarity)
for query in queries:
    ranked_reviews_m2, similarities_m2 = m2_lda_cosine(query, filtered_reviews)
    print(f"Retrieved {len(ranked_reviews_m2)} reviews for query '{query}' using M2 (LDA + Cosine Similarity)\n")
    print(f"Sample Matched Reviews for query '{query}':\n")
    print(ranked_reviews_m2[['review_text']].head())  # Display the top 5 matched reviews for each query
    print("\n" + "-"*80 + "\n")  # Separator between queries


  and should_run_async(code)


Retrieved 8 reviews for query 'audio quality:poor' using M2 (LDA + Cosine Similarity)

Sample Matched Reviews for query 'audio quality:poor':

                                                                                                                                                                                                                                                                                                        review_text
R1L2XNYQFCTD0D B000JPXRGK A18Y3IXQ0NZ3Q  Love it                      2007-10-20 "'L. Page ""Biblioholic""'" OXO Good Grips Silicone Flexible Omelet Turner ... This is wonderful but you have to get used to i... 3  3  5 0 1 0 "('R1L2XNYQFCTD0D' 'B000JPXRGK'   'OXO Good Grips Silicone Flexible Omelet Turn...
R10H8M9OOI40NB B000B0WO4O A1LLNZM7WSFP2M Good Info On Wal-Mart        2008-04-06 JHW                         Frontline: Is Wal-Mart Good for America? (DVD)     Many of us know what Wal-Mart has done to small... 1  2  4 0 1 0 "('R10H8M9OOI40N

In [87]:
import re
import gensim
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Clean review text function (same as before)
def clean_review_text(text):
    clean_text = re.sub(r'\d{5,}', '', text)  # Remove long sequences of numbers (e.g., product IDs)
    clean_text = re.sub(r'[^\w\s]', '', clean_text)  # Remove punctuation
    return clean_text.strip()

# Method 3: Word2Vec + Cosine Similarity with Error Handling
def m3_word2vec_cosine(query, reviews_df, vector_size=100, window=5, min_count=1, workers=4):
    # Clean and preprocess the reviews
    reviews_df['clean_review_text'] = reviews_df['review_text'].apply(clean_review_text)

    # Tokenize the reviews
    reviews_df['tokens'] = reviews_df['clean_review_text'].apply(lambda x: x.split())

    # Train the Word2Vec model on the reviews
    model = Word2Vec(sentences=reviews_df['tokens'], vector_size=vector_size, window=window, min_count=min_count, workers=workers)

    # Clean and preprocess the query
    query_clean = clean_review_text(query)
    query_tokens = query_clean.split()

    # Convert the query to a vector by averaging the word vectors (skip missing words)
    query_vectors = [model.wv[word] for word in query_tokens if word in model.wv]
    if query_vectors:
        query_vector = np.mean(query_vectors, axis=0)
    else:
        query_vector = np.zeros(model.vector_size)  # Default to zero vector if no words are in vocabulary

    # Compute cosine similarity between the query vector and each review's vector
    review_vectors = []
    for review_tokens in reviews_df['tokens']:
        review_vectors_list = [model.wv[word] for word in review_tokens if word in model.wv]
        if review_vectors_list:
            review_vector = np.mean(review_vectors_list, axis=0)
        else:
            review_vector = np.zeros(model.vector_size)  # Default to zero vector if no words are in vocabulary
        review_vectors.append(review_vector)

    # Calculate cosine similarities between the query vector and each review's vector
    cosine_similarities = cosine_similarity([query_vector], review_vectors).flatten()

    # Rank the reviews by cosine similarity
    ranked_reviews = reviews_df.iloc[cosine_similarities.argsort()[::-1]]

    return ranked_reviews, cosine_similarities

# List of queries
queries = [
    "audio quality:poor",
    "wifi signal:strong",
    "mouse button:click problem",
    "gps map:useful",
    "image quality:sharp"
]

# Loop through each query and get the ranked reviews for Method 3 (Word2Vec + Cosine Similarity)
for query in queries:
    ranked_reviews_m3, similarities_m3 = m3_word2vec_cosine(query, filtered_reviews)
    print(f"Retrieved {len(ranked_reviews_m3)} reviews for query '{query}' using M3 (Word2Vec + Cosine Similarity)\n")
    print(f"Sample Matched Reviews for query '{query}':\n")
    print(ranked_reviews_m3[['review_text']].head())  # Display the top 5 matched reviews for each query
    print("\n" + "-"*80 + "\n")  # Separator between queries


Retrieved 8 reviews for query 'audio quality:poor' using M3 (Word2Vec + Cosine Similarity)

Sample Matched Reviews for query 'audio quality:poor':

                                                                                                                                                                                                                                                                                                         review_text
R1V11IHN5HSVP9 0060897376 A34J1DRI3F9AF0 wonderful                    2008-04-29 Gayle S. Graziano            Why Good People Do Bad Things: How to Stop Bein... Everyone should read this book!!!Espcially BEFO... 2  3  5 0 1 0 "('R1V11IHN5HSVP9' '0060897376'   'Why Good People Do Bad Things: How to Stop B...
R1SJ1ZZTERYYTG 0967674727 AJE825U0FF388  breakthrough on the new skis 2009-12-01 "'B. Aldrich ""Billy Bob""'" Breakthrough on the New Skis 3 Ed: Say Goodbye ... This is great stuff.  Now skiing makes sense.  ... -1 -1 4 0 1 0 "('R1SJ1

  and should_run_async(code)


In [90]:
from sklearn.metrics.pairwise import cosine_similarity

# Function to calculate precision for each method
def calculate_precision(retrieved_reviews, query, query_terms, relevant_reviews_df):
    # Create a binary list where 1 means the review is relevant, 0 means it's not
    relevant_array = [1 if any(term in review.lower() for term in query_terms) else 0 for review in relevant_reviews_df['review_text']]

    # Create a binary list for retrieved reviews (1 if the review is relevant, 0 if not)
    retrieved_relevant = []
    for review in retrieved_reviews['review_text']:
        if any(term in review.lower() for term in query_terms):
            retrieved_relevant.append(1)
        else:
            retrieved_relevant.append(0)

    # Calculate Precision: (Relevant Documents Retrieved) / (Total Documents Retrieved)
    if len(retrieved_relevant) == 0:
        return 0.0  # Prevent division by zero
    precision = sum(retrieved_relevant) / len(retrieved_relevant)

    return precision


# Example: Loop through each query for baseline, m1, and m3 methods
queries = [
    "audio quality:poor",
    "wifi signal:strong",
    "mouse button:click problem",
    "gps map:useful",
    "image quality:sharp"
]

# Calculate Precision for Baseline Method
for query in queries:
    query_terms = query.split(":")[1].split()
    # Run Baseline Search
    relevant_reviews_baseline = baseline_search(query, filtered_reviews)
    precision_baseline = calculate_precision(relevant_reviews_baseline, query, query_terms, filtered_reviews)
    print(f"Precision for query '{query}' using Baseline Method: {precision_baseline:.4f}")


# Calculate Precision for M1 (TF-IDF + Cosine Similarity)
for query in queries:
    query_terms = query.split(":")[1].split()
    # Run M1 (TF-IDF + Cosine Similarity)
    ranked_reviews_m1, _ = m1_tfidf_cosine(query, filtered_reviews)
    precision_m1 = calculate_precision(ranked_reviews_m1, query, query_terms, filtered_reviews)
    print(f"Precision for query '{query}' using M1 (TF-IDF + Cosine Similarity): {precision_m1:.4f}")


# Calculate Precision for M3 (Word2Vec + Cosine Similarity)
for query in queries:
    query_terms = query.split(":")[1].split()
    # Run M3 (Word2Vec + Cosine Similarity)
    ranked_reviews_m3, _ = m3_word2vec_cosine(query, filtered_reviews)
    precision_m3 = calculate_precision(ranked_reviews_m3, query, query_terms, filtered_reviews)
    print(f"Precision for query '{query}' using M3 (Word2Vec + Cosine Similarity): {precision_m3:.4f}")


Query Terms: ['poor']
Sample of Cleaned Review Text:
R10H8M9OOI40NB  B000B0WO4O  A1LLNZM7WSFP2M  Good Info On Wal-Mart  2008-04-06  JHW                          Frontline: Is Wal-Mart Good for America? (DVD)                                                                                         Many of us know what Wal-Mart has done to small retail across America. What I did not know until I saw this documentary was what it has done to manufacturing in America with its incredible determination to have as much of everything as possible made in China. The world will look back on this time as one of great change and on Wal-Mart as the key reagent in that change.  1   2   4  0  1  0  "('R10H8M9OOI40NB'  'B000B0WO4O'            Frontline Is WalMart Good for America DVD
R13XO0KVFKY6KD  0060875496  APIFRHB3KCB89   Fair Tax               2007-09-11  Paul B. Jones                The Fair Tax Book: Saying Goodbye to the Income Tax and the IRS (Paperback)                                          

  and should_run_async(code)
