In [1]:
import pandas as pd
import plotly.express as px

In [2]:
# Load the dataset
data = pd.read_csv('dataset/dataset.csv')

In [3]:
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,165257,B000EVG8J2,A1L01D2BD3RKVO,"B. Miller ""pet person""",0,0,5,1268179200,Crunchy & Good Gluten-Free Sandwich Cookies!,Having tried a couple of other brands of glute...
1,231466,B0000BXJIS,A3U62RE5XZDP0G,Marty,0,0,5,1298937600,great kitty treats,My cat loves these treats. If ever I can't fin...
2,427828,B008FHUFAU,AOXC0JQQZGGB6,Kenneth Shevlin,0,2,3,1224028800,COFFEE TASTE,A little less than I expected. It tends to ha...
3,433955,B006BXV14E,A3PWPNZVMNX3PA,rareoopdvds,0,1,2,1335312000,So the Mini-Wheats were too big?,"First there was Frosted Mini-Wheats, in origin..."
4,70261,B007I7Z3Z0,A1XNZ7PCE45KK7,Og8ys1,0,2,5,1334707200,Great Taste . . .,and I want to congratulate the graphic artist ...


In [4]:
# Data Cleaning
# Drop duplicate rows based on 'Id'
data = data.drop_duplicates(subset='Id')

In [5]:
data.shape

(30000, 10)

In [6]:
# Remove rows with missing values in critical columns
data = data.dropna(subset=['ProductId', 'UserId', 'ProfileName', 'Score', 'Text'])

In [7]:
# Convert 'Time' column to datetime format
data['Time'] = pd.to_datetime(data['Time'], unit='s')

In [8]:
# Remove rows where 'HelpfulnessDenominator' is zero to avoid division errors
data = data[data['HelpfulnessDenominator'] != 0]

In [9]:
# Create a new column for helpfulness ratio
data['HelpfulnessRatio'] = data['HelpfulnessNumerator'] / data['HelpfulnessDenominator']

In [10]:
# Save the cleaned dataset
data.to_csv('dataset/cleaned_dataset.csv', index=False)
print("Cleaned dataset saved as 'cleaned_dataset.csv'")

Cleaned dataset saved as 'cleaned_dataset.csv'


In [11]:
# EDA using Plotly
# Distribution of Scores
fig1 = px.histogram(data, x='Score', title='Distribution of Review Scores', labels={'Score': 'Review Score'}, nbins=5)
fig1.show()

In [12]:
# Helpfulness Ratio vs. Scores
fig2 = px.box(data, x='Score', y='HelpfulnessRatio', title='Helpfulness Ratio by Review Score',
              labels={'Score': 'Review Score', 'HelpfulnessRatio': 'Helpfulness Ratio'})
fig2.show()

In [13]:
# Number of Reviews over Time
fig3 = px.histogram(data, x='Time', title='Number of Reviews Over Time', labels={'Time': 'Review Time'}, nbins=50)
fig3.show()

In [14]:
# Top 10 Most Reviewed Products
top_products = data['ProductId'].value_counts().head(10).reset_index()
top_products.columns = ['ProductId', 'ReviewCount']
fig4 = px.bar(top_products, x='ProductId', y='ReviewCount', title='Top 10 Most Reviewed Products',
              labels={'ProductId': 'Product ID', 'ReviewCount': 'Number of Reviews'})
fig4.show()

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob

In [16]:
# Load the cleaned dataset
data = pd.read_csv('dataset/cleaned_dataset.csv')

In [17]:
# Baseline Recommendation System
def create_baseline_recommendation_system(data):
    """
    Creates a content-based recommendation system using product reviews (Text) and cosine similarity.

    Args:
    data (DataFrame): The dataset containing reviews and metadata.

    Returns:
    callable: A function to recommend products based on the baseline system.
    """
    # TF-IDF Vectorization
    tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
    tfidf_matrix = tfidf.fit_transform(data['Text'])

    # Cosine Similarity Matrix
    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Recommendation Function
    def recommend(product_id, top_n=5):
        idx = data.index[data['ProductId'] == product_id].tolist()
        if not idx:
            return "Product ID not found."
        idx = idx[0]

        # Get similarity scores
        sim_scores = list(enumerate(similarity_matrix[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Get top N recommendations
        top_similar = sim_scores[1:top_n + 1]
        recommended_idx = [i[0] for i in top_similar]
        recommendations = data.iloc[recommended_idx][['ProductId', 'Summary', 'Score']]
        recommendations['SimilarityScore'] = [i[1] for i in top_similar]
        return recommendations

    return recommend

In [18]:
# Improving the Recommendation System
def improve_recommendation_system(data):
    """
    Improves the recommendation system by incorporating sentiment analysis and helpfulness ratio.

    Args:
    data (DataFrame): The dataset containing reviews and metadata.

    Returns:
    callable: A function to recommend products based on the improved system.
    """
    # Sentiment Analysis
    def analyze_sentiment(text):
        analysis = TextBlob(text)
        return analysis.sentiment.polarity  # Sentiment polarity: -1 (negative) to 1 (positive)

    data['SentimentScore'] = data['Text'].apply(analyze_sentiment)

    # Weighted Combined Score
    data['CombinedScore'] = (
        data['Score'] * data['HelpfulnessRatio'] * (data['SentimentScore'] + 1)  
    )

    # TF-IDF Vectorization with Combined Score Weighting
    tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
    tfidf_matrix = tfidf.fit_transform(data['Text'])

    # Adjust TF-IDF values by multiplying with CombinedScore
    weighted_matrix = tfidf_matrix.multiply(data['CombinedScore'].values[:, None])

    # Cosine Similarity Matrix
    similarity_matrix = cosine_similarity(weighted_matrix, weighted_matrix)

    # Recommendation Function
    def recommend(product_id, top_n=5):
        idx = data.index[data['ProductId'] == product_id].tolist()
        if not idx:
            return "Product ID not found."
        idx = idx[0]

        # Get similarity scores
        sim_scores = list(enumerate(similarity_matrix[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Get top N recommendations
        top_similar = sim_scores[1:top_n + 1]
        recommended_idx = [i[0] for i in top_similar]
        recommendations = data.iloc[recommended_idx][['ProductId', 'Summary', 'Score', 'SentimentScore']]
        recommendations['SimilarityScore'] = [i[1] for i in top_similar]
        return recommendations

    return recommend

In [19]:
# Create Baseline and Improved Systems
baseline_recommend = create_baseline_recommendation_system(data)
improved_recommend = improve_recommendation_system(data)

In [20]:
# Test Recommendations
product_id_to_test = 'B008FHUFAU'
print("Baseline Recommendations:")
print(baseline_recommend(product_id_to_test))

Baseline Recommendations:
        ProductId                              Summary  Score  SimilarityScore
11848  B000FDKQCY                                Bread      3         0.342343
10323  B001E50X84   Glad I ordered the sunmaid raisins      5         0.305033
1849   B0007OVVZ2                       Favorite candy      4         0.279637
3830   B000EM8UGA                        Ok for lipton      3         0.268415
1618   B001ONPMN2  Chocolate and dates didn't mix well      3         0.267751


In [21]:
print("\nImproved Recommendations:")
print(improved_recommend(product_id_to_test))


Improved Recommendations:
    ProductId                           Summary  Score  SentimentScore  \
1  B006BXV14E  So the Mini-Wheats were too big?      2        0.159401   
2  B007I7Z3Z0                 Great Taste . . .      5        0.235565   
3  B000FKMNSM     Lifesavers - Pineapple flavor      4        0.500000   
4  B000XEV9YE                     The Best Tea!      5        0.309722   
5  B0041CIR62          Pretty good rice noodles      4        0.232902   

   SimilarityScore  
1              0.0  
2              0.0  
3              0.0  
4              0.0  
5              0.0  


In [22]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from textblob import TextBlob

In [23]:
# Load the cleaned dataset
data = pd.read_csv('dataset/cleaned_dataset.csv')

In [24]:
# Sentiment Analysis
def analyze_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity  # Sentiment polarity: -1 (negative) to 1 (positive)

data['SentimentScore'] = data['Text'].apply(analyze_sentiment)

In [25]:
# Normalize Sentiment and Helpfulness Ratios
scaler = MinMaxScaler()
data[['HelpfulnessRatio', 'SentimentScore']] = scaler.fit_transform(data[['HelpfulnessRatio', 'SentimentScore']])

In [26]:
# Weighted Combined Score
data['CombinedScore'] = (
    0.5 * data['Score'] +  # Weight for Score
    0.3 * data['HelpfulnessRatio'] +  # Weight for Helpfulness
    0.2 * data['SentimentScore']  # Weight for Sentiment
)

In [27]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(data['Text'])

In [28]:
# Adjust TF-IDF values using Combined Score
weighted_matrix = tfidf_matrix.multiply(data['CombinedScore'].values[:, None])

In [29]:
# Cosine Similarity Matrix
similarity_matrix = cosine_similarity(weighted_matrix, weighted_matrix)

In [30]:
# Improved Recommendation Function
def recommend(product_id, top_n=5):
    idx = data.index[data['ProductId'] == product_id].tolist()
    if not idx:
        return "Product ID not found."
    idx = idx[0]

    # Get similarity scores
    sim_scores = list(enumerate(similarity_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get top N recommendations
    top_similar = sim_scores[1:top_n + 1]
    recommended_idx = [i[0] for i in top_similar]
    recommendations = data.iloc[recommended_idx][['ProductId', 'Summary', 'Score', 'SentimentScore', 'HelpfulnessRatio']]
    recommendations['SimilarityScore'] = [i[1] for i in top_similar]
    return recommendations

In [31]:
# Example: Improved Recommendations
product_id_to_test = 'B008FHUFAU'
recommendations = recommend(product_id_to_test, top_n=5)
print("Improved Recommendations:")
print(recommendations)

Improved Recommendations:
        ProductId                              Summary  Score  SentimentScore  \
11848  B000FDKQCY                                Bread      3        0.287500   
10323  B001E50X84   Glad I ordered the sunmaid raisins      5        0.537500   
1849   B0007OVVZ2                       Favorite candy      4        0.556845   
3830   B000EM8UGA                        Ok for lipton      3        0.638889   
1618   B001ONPMN2  Chocolate and dates didn't mix well      3        0.650000   

       HelpfulnessRatio  SimilarityScore  
11848          0.047619         0.342343  
10323          0.190476         0.305033  
1849           0.333333         0.279637  
3830           0.166667         0.268415  
1618           0.166667         0.267751  
