In [None]:
!pip install transformers==4.29.0 datasets sentence-transformers scikit-learn pandas numpy -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.9/111.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m53.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m62.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m35.0 MB/s[0m eta [36

In [None]:
from transformers import (DistilBertTokenizer, DistilBertForSequenceClassification,
                          Trainer, TrainingArguments)
from datasets import Dataset
import torch, pandas as pd, numpy as np, re, datetime as dt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [None]:
# Upload kaggle.json first (Account ▸ API Token) if not mounted
import os, shutil, subprocess, zipfile, glob
if not os.path.exists('/root/.kaggle/kaggle.json'):
    from google.colab import files
    files.upload()          # upload kaggle.json
    os.makedirs('/root/.kaggle', exist_ok=True)
    shutil.move('kaggle.json', '/root/.kaggle/kaggle.json')
    os.chmod('/root/.kaggle/kaggle.json', 0o600)

!pip install -q kaggle --upgrade
!kaggle datasets download -d snap/amazon-fine-food-reviews -p /content --force
!unzip -q /content/amazon-fine-food-reviews.zip Reviews.csv


Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/snap/amazon-fine-food-reviews
License(s): CC0-1.0
Downloading amazon-fine-food-reviews.zip to /content
 89% 215M/242M [00:00<00:00, 461MB/s]
100% 242M/242M [00:00<00:00, 485MB/s]


In [None]:
def load_amazon_reviews(path='Reviews.csv'):
    """Load the reviews CSV and attach a binary sentiment label.

    Args:
        path: Location of the CSV file. Its header row is replaced by the fixed
            column names listed below.

    Returns:
        A new DataFrame holding only the rows whose 'Text' consists of characters
        with code point < 256 and whose 'Score' is not 3, with an extra integer
        'label' column: 1 when Score > 3, otherwise 0. The original row index is kept.
    """
    cols = ["Id","ProductId","UserId","ProfileName",
            "HelpfulnessNumerator","HelpfulnessDenominator",
            "Score","Time","Summary","Text"]
    raw = pd.read_csv(path, names=cols, header=0, encoding='utf-8-sig')

    # True for rows whose text stays inside the first 256 code points.
    narrow_text = raw['Text'].apply(lambda x: all(ord(c) < 256 for c in str(x))).astype(bool)
    not_neutral = raw['Score'] != 3    # drop neutral 3-star

    # Take an explicit copy before adding a column: assigning into a filtered
    # frame is what triggers pandas' SettingWithCopyWarning.
    reviews = raw.loc[narrow_text & not_neutral].copy()
    reviews['label'] = (reviews['Score'] > 3).astype(int)
    return reviews

# Build the labelled review frame, then report its size and a preview of the key columns.
df = load_amazon_reviews(path='Reviews.csv')
print("Dataset shape: {}".format(df.shape))
print()
print("Sample data:")
print(df.loc[:, ['UserId','Score','Text','HelpfulnessNumerator','HelpfulnessDenominator']].head())


Dataset shape: (525814, 11)

Sample data:
           UserId  Score                                               Text  \
0  A3SGXH7AUHU8GW      5  I have bought several of the Vitality canned d...   
1  A1D87F6ZCVE5NK      1  Product arrived labeled as Jumbo Salted Peanut...   
2   ABXLMWJIXXAIN      4  This is a confection that has been around a fe...   
3  A395BORC6FGVXV      2  If you are looking for the secret ingredient i...   
4  A1UQRSCLF8GW1T      5  Great taffy at a great price.  There was a wid...   

   HelpfulnessNumerator  HelpfulnessDenominator  
0                     1                       1  
1                     0                       0  
2                     1                       1  
3                     3                       3  
4                     0                       0  


In [None]:
def load_amazon_reviews(path='Reviews.csv'):
    """Load the reviews CSV and attach a binary sentiment label.

    NOTE(review): this re-defines the loader that an earlier cell already defines;
    keep the two in sync or drop one of them.

    Args:
        path: Location of the CSV file. Its header row is replaced by the fixed
            column names listed below.

    Returns:
        A new DataFrame holding only the rows whose 'Text' consists of characters
        with code point < 256 and whose 'Score' is not 3, with an extra integer
        'label' column: 1 when Score > 3, otherwise 0. The original row index is kept.
    """
    cols = ["Id","ProductId","UserId","ProfileName",
            "HelpfulnessNumerator","HelpfulnessDenominator",
            "Score","Time","Summary","Text"]
    raw = pd.read_csv(path, names=cols, header=0, encoding='utf-8-sig')

    # True for rows whose text stays inside the first 256 code points.
    narrow_text = raw['Text'].apply(lambda x: all(ord(c) < 256 for c in str(x))).astype(bool)
    not_neutral = raw['Score'] != 3    # drop neutral 3-star

    # Take an explicit copy before adding a column: assigning into a filtered
    # frame is what triggers pandas' SettingWithCopyWarning.
    reviews = raw.loc[narrow_text & not_neutral].copy()
    reviews['label'] = (reviews['Score'] > 3).astype(int)
    return reviews

# Rebuild the labelled review frame so this cell can run on its own
# (an earlier cell creates `df` with the same call).
df = load_amazon_reviews('Reviews.csv')

def train_bert_sentiment(data, seed=42):
    """Fine-tune DistilBERT as a two-class sentiment classifier.

    Args:
        data: DataFrame with a 'Text' column (review text) and a 'label' column
            (class id, 0 or 1).
        seed: Seed used for the 80/20 train/eval split and passed to
            TrainingArguments, so repeated runs use the same split.

    Returns:
        Tuple ``(model, tokenizer, trainer)``. Because ``load_best_model_at_end``
        is set, the model holds the checkpoint with the best evaluation loss.
        ``trainer.evaluate()`` reports ``eval_accuracy`` in addition to the loss.
    """
    print("Training BERT sentiment model...")
    tok = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    ds  = Dataset.from_pandas(data[['Text','label']])

    def tokenize_function(examples):
        # Fixed-length inputs: pad or truncate every review to 128 tokens.
        return tok(examples['Text'], padding='max_length', truncation=True, max_length=128)

    def compute_metrics(eval_pred):
        # Accuracy of the arg-max class against the reference labels.
        predicted = np.argmax(eval_pred.predictions, axis=-1)
        return {'accuracy': float((predicted == eval_pred.label_ids).mean())}

    ds_tok = ds.map(tokenize_function, batched=True)
    # Seeded so the held-out 20% is the same on every run.
    split = ds_tok.train_test_split(test_size=0.2, seed=seed)

    model = DistilBertForSequenceClassification.from_pretrained(
                'distilbert-base-uncased', num_labels=2)

    args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=16,
        evaluation_strategy='epoch',
        save_strategy='epoch',      # must match evaluation_strategy for load_best_model_at_end
        load_best_model_at_end=True,
        seed=seed,
        report_to='none'
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=split['train'],
        eval_dataset=split['test'],
        compute_metrics=compute_metrics
    )

    trainer.train()
    return model, tok, trainer

# Fine-tune on the first 5000 rows only to keep the run short (pass `df` itself for the full dataset).
sentiment_model, sentiment_tok, trainer = train_bert_sentiment(data=df.head(5000))

# Score the held-out split and show the metrics dict returned by the trainer.
eval_results = trainer.evaluate()
print("Evaluation Results: {}".format(eval_results))

Training BERT sentiment model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.we

Epoch,Training Loss,Validation Loss
1,No log,0.244044
2,0.196900,0.181437
3,0.196900,0.209675


Evaluation Results: {'eval_loss': 0.18143673241138458, 'eval_runtime': 204.3204, 'eval_samples_per_second': 4.894, 'eval_steps_per_second': 0.612, 'epoch': 3.0}


In [None]:
import pandas as pd

In [None]:
import pandas as pd

# Use the metrics dict produced by `trainer.evaluate()` in the training cell.
# This cell previously read it through `ace_tools`, a module none of the install
# cells in this notebook provide, so it could not run from a fresh kernel.
metrics = eval_results

# One row per metric: name in 'Metric', number in 'Value'.
metrics_df = pd.DataFrame(list(metrics.items()), columns=['Metric', 'Value'])

# Show the evaluation table.
print(metrics_df)


                    Metric       Value
0                eval_loss    0.181437
1             eval_runtime  204.320400
2  eval_samples_per_second    4.894000
3    eval_steps_per_second    0.612000
4                    epoch    3.000000


In [None]:
# Mount Google Drive at /content/drive (Colab only); a later cell saves the model
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Persist the fine-tuned model and its tokenizer together in one Google Drive folder.
for artifact in (sentiment_model, sentiment_tok):
    artifact.save_pretrained('/content/drive/MyDrive/amazon_sentiment/sentiment_model2')
print("Sentiment model saved!")


Sentiment model saved!


In [None]:
# Write the fine-tuned model and its tokenizer to a folder next to the notebook.
for artifact in (sentiment_model, sentiment_tok):
    artifact.save_pretrained('./sentiment_model')
print("Sentiment model saved!")

Sentiment model saved!


In [None]:
class UserTrustScorePredictor:
    """Random-forest model that scores how trustworthy a reviewer looks.

    Per-user features are computed from that user's reviews. Training labels are
    derived from a weighted heuristic over those same features (see
    ``create_trust_labels``), so the accuracy printed by ``train_trust_model``
    measures how well the forest reproduces the heuristic, not agreement with
    independently collected trust labels.
    """

    # Words counted by the first-person-usage feature (set for O(1) lookups).
    FIRST_PERSON_PRONOUNS = frozenset({'i', 'me', 'my', 'myself', 'we', 'us', 'our'})

    def __init__(self):
        self.model = RandomForestClassifier(n_estimators=100, random_state=42)
        self.scaler = StandardScaler()
        self.feature_names = []
        # Per-feature means of the training users; reused to impute at predict time.
        self.feature_means = None

    def extract_user_features(self, user_reviews_df):
        """Extract trust-related features for a user.

        Args:
            user_reviews_df: DataFrame with one row per review by a single user.
                Requires 'Score' and 'Text'; 'HelpfulnessNumerator' and
                'HelpfulnessDenominator' are used when present.

        Returns:
            Dict mapping feature name to a number. The insertion order of the keys
            defines the column order the model is trained on.
        """
        features = {}

        # Review volume and frequency
        features['total_reviews'] = len(user_reviews_df)

        # Rating patterns
        ratings = user_reviews_df['Score'].values
        features['rating_variance'] = np.var(ratings)
        features['extreme_rating_pct'] = np.mean((ratings == 1) | (ratings == 5)) * 100
        features['avg_rating'] = np.mean(ratings)
        features['rating_consistency'] = 1 - (np.std(ratings) / 5.0)

        # Text quality features
        review_lengths = user_reviews_df['Text'].str.len()
        features['avg_review_length'] = review_lengths.mean()
        # pandas' sample variance is NaN for fewer than two values (e.g. a user
        # with a single review); report 0.0 spread so no NaN reaches the model.
        length_variance = review_lengths.var()
        features['review_length_variance'] = 0.0 if pd.isna(length_variance) else length_variance

        # Vocabulary diversity: distinct words / total words over all of the user's text
        all_text = ' '.join(user_reviews_df['Text'].astype(str))
        words = re.findall(r'\b\w+\b', all_text.lower())
        features['vocabulary_diversity'] = len(set(words)) / max(len(words), 1)

        # Helpfulness metrics
        if 'HelpfulnessNumerator' in user_reviews_df.columns:
            helpful_votes = user_reviews_df['HelpfulnessNumerator'].sum()
            total_votes = user_reviews_df['HelpfulnessDenominator'].sum()
            features['helpfulness_ratio'] = helpful_votes / max(total_votes, 1)
            features['avg_helpful_votes'] = helpful_votes / features['total_reviews']
        else:
            features['helpfulness_ratio'] = 0
            features['avg_helpful_votes'] = 0

        # First-person pronoun usage (authenticity indicator), as a percentage of all words
        total_pronouns = 0
        total_words = 0

        for text in user_reviews_df['Text']:
            if pd.isna(text):
                continue
            review_words = re.findall(r'\b\w+\b', str(text).lower())
            total_words += len(review_words)
            total_pronouns += sum(1 for word in review_words if word in self.FIRST_PERSON_PRONOUNS)

        features['first_person_usage'] = total_pronouns / max(total_words, 1) * 100

        return features

    def create_trust_labels(self, user_features_df, threshold_percentile=70):
        """Create trust labels based on helpfulness and consistency.

        Args:
            user_features_df: One row per user with the columns 'helpfulness_ratio',
                'rating_consistency', 'vocabulary_diversity' and 'first_person_usage'.
            threshold_percentile: Users whose heuristic score is at or above this
                percentile of all scores are labelled 1.

        Returns:
            Integer Series (1 = trusted, 0 = not), indexed like ``user_features_df``.
        """
        trust_score = (
            user_features_df['helpfulness_ratio'] * 0.4 +
            user_features_df['rating_consistency'] * 0.3 +
            user_features_df['vocabulary_diversity'] * 0.2 +
            (user_features_df['first_person_usage'] / 100) * 0.1
        )

        threshold = np.percentile(trust_score, threshold_percentile)
        return (trust_score >= threshold).astype(int)

    def train_trust_model(self, reviews_df, min_reviews=3):
        """Train the user trust prediction model.

        Args:
            reviews_df: Review rows with at least 'UserId', 'Score' and 'Text'.
            min_reviews: Users with fewer reviews than this are left out of training.

        Returns:
            Accuracy on a held-out 20% of the users.

        Raises:
            ValueError: If no user has at least ``min_reviews`` reviews.
        """
        print("Extracting user features for trust prediction...")

        user_features_list = []
        user_ids = []

        for user_id, user_reviews in reviews_df.groupby('UserId'):
            if len(user_reviews) >= min_reviews:
                user_features_list.append(self.extract_user_features(user_reviews))
                user_ids.append(user_id)

        if not user_features_list:
            raise ValueError(
                f"No user has at least {min_reviews} reviews; cannot train the trust model."
            )

        user_features_df = pd.DataFrame(user_features_list, index=user_ids)
        trust_labels = self.create_trust_labels(user_features_df)

        # Prepare features: impute gaps with the column mean (0 if a whole column is
        # missing) and remember the means so prediction imputes the same way.
        self.feature_means = user_features_df.mean()
        X = user_features_df.fillna(self.feature_means).fillna(0)
        y = trust_labels

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Scale and train
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        self.model.fit(X_train_scaled, y_train)
        self.feature_names = list(X.columns)

        # Evaluate
        y_pred = self.model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)

        print(f"Trust prediction accuracy: {accuracy:.3f}")
        print(f"Trained on {len(user_features_list)} users")

        return accuracy

    def predict_user_trust(self, user_reviews_df):
        """Predict trust score for a user (0-1).

        Args:
            user_reviews_df: The user's reviews, in the shape expected by
                ``extract_user_features``.

        Returns:
            The model's probability for the "trusted" class (label 1), or 0.0 if
            the model never saw that class during training.
        """
        features = self.extract_user_features(user_reviews_df)
        feature_df = pd.DataFrame([features])
        # Align columns with training; features absent at predict time become 0.
        feature_df = feature_df.reindex(columns=self.feature_names, fill_value=0)

        # Impute any remaining NaN the same way training did.
        training_means = getattr(self, 'feature_means', None)
        if training_means is not None:
            feature_df = feature_df.fillna(training_means)
        feature_df = feature_df.fillna(0)

        features_scaled = self.scaler.transform(feature_df)
        class_probs = self.model.predict_proba(features_scaled)[0]

        # Look up the column for label 1 instead of assuming it is at index 1.
        known_classes = list(self.model.classes_)
        if 1 not in known_classes:
            return 0.0
        return class_probs[known_classes.index(1)]


In [None]:
# Fit the trust model on the first 5000 reviews only, to keep feature extraction quick.
trust_predictor = UserTrustScorePredictor()
trust_accuracy = trust_predictor.train_trust_model(reviews_df=df.head(5000))

Extracting user features for trust prediction...
Trust prediction accuracy: 1.000
Trained on 26 users


In [None]:
class TrustAwareSentimentAnalysis:
    """Pairs a sentiment classifier with a user trust predictor.

    The sentiment model's positive-class probability is scaled by a factor that
    depends on the trust score predicted for the review's author.
    """

    def __init__(self, sentiment_model, sentiment_tokenizer, trust_predictor):
        """Store the components and prepare the sentiment model for inference.

        Args:
            sentiment_model: Classifier whose output exposes ``logits`` with the
                positive class at index 1.
            sentiment_tokenizer: Tokenizer matching ``sentiment_model``.
            trust_predictor: Object providing ``predict_user_trust(user_reviews_df)``.
        """
        self.sentiment_model = sentiment_model
        self.sentiment_tokenizer = sentiment_tokenizer
        self.trust_predictor = trust_predictor
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.sentiment_model.to(self.device)
        # This class only runs inference: switch to eval mode so dropout is off and
        # predictions do not depend on whatever mode the model was left in.
        self.sentiment_model.eval()

    def predict_sentiment(self, text):
        """Predict sentiment using BERT model.

        Args:
            text: Review text to classify.

        Returns:
            Probability of the positive class as a Python float.
        """
        inputs = self.sentiment_tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=128
        )

        # Move inputs to the same device as the model
        inputs = {name: tensor.to(self.device) for name, tensor in inputs.items()}

        with torch.no_grad():
            outputs = self.sentiment_model(**inputs)
            probabilities = torch.softmax(outputs.logits, dim=1)
            positive_prob = probabilities[0][1].item()

        return positive_prob

    def predict_with_trust(self, review_text, user_reviews_df, trust_weight=0.3):
        """Predict sentiment weighted by user trust score.

        Args:
            review_text: Text of the review to score.
            user_reviews_df: Review history of the author, passed to the trust predictor.
            trust_weight: Share of the scaling factor controlled by the trust score.

        Returns:
            Dict with 'base_sentiment', 'user_trust_score',
            'trust_weighted_sentiment' and 'trust_level' ('High' above 0.7,
            'Medium' above 0.4, otherwise 'Low').
        """
        # Get base sentiment
        base_sentiment = self.predict_sentiment(review_text)

        # Get user trust score
        user_trust = self.trust_predictor.predict_user_trust(user_reviews_df)

        # Weight sentiment by trust: the factor runs from (1 - trust_weight) at
        # trust 0 up to 1 at trust 1.
        # NOTE(review): low trust therefore pulls the positive probability toward 0,
        # not toward 0.5 — confirm that is the intended meaning.
        trust_weighted_sentiment = base_sentiment * (trust_weight * user_trust + (1 - trust_weight))

        if user_trust > 0.7:
            trust_level = 'High'
        elif user_trust > 0.4:
            trust_level = 'Medium'
        else:
            trust_level = 'Low'

        return {
            'base_sentiment': base_sentiment,
            'user_trust_score': user_trust,
            'trust_weighted_sentiment': trust_weighted_sentiment,
            'trust_level': trust_level
        }

# Wire the fine-tuned sentiment model, its tokenizer and the trained trust predictor together.
trust_aware_analyzer = TrustAwareSentimentAnalysis(
    sentiment_model=sentiment_model,
    sentiment_tokenizer=sentiment_tok,
    trust_predictor=trust_predictor,
)

In [None]:
# Demo: run the trust-aware analyzer on one review from each of three active users.
print("=== Trust-Aware Sentiment Analysis Demo ===\n")

# Keep users with at least five reviews, then take each such user's first review.
frequent_reviewers = df.groupby('UserId').filter(lambda x: len(x) >= 5)
sample_users = frequent_reviewers.groupby('UserId').head(1)

for idx, (_, sample_review) in enumerate(sample_users.head(3).iterrows()):
    user_id = sample_review['UserId']
    # Full review history of this user, used by the trust predictor.
    user_reviews = df.loc[df['UserId'] == user_id]

    result = trust_aware_analyzer.predict_with_trust(
        review_text=sample_review['Text'],
        user_reviews_df=user_reviews,
    )

    report_lines = [
        f"User {idx+1} (ID: {user_id}):",
        f"Reviews: {len(user_reviews)}",
        f"Review text: {sample_review['Text'][:100]}...",
        f"Actual rating: {sample_review['Score']}/5",
        f"Base sentiment: {result['base_sentiment']:.3f}",
        f"User trust score: {result['user_trust_score']:.3f} ({result['trust_level']})",
        f"Trust-weighted sentiment: {result['trust_weighted_sentiment']:.3f}",
        "-" * 60,
    ]
    print("\n".join(report_lines))


=== Trust-Aware Sentiment Analysis Demo ===

User 1 (ID: A2725IB4YY9JEB):
Reviews: 12
Review text: One of my boys needed to lose some weight and the other didn't.  I put this food on the floor for th...
Actual rating: 5/5
Base sentiment: 0.992
User trust score: 0.360 (Low)
Trust-weighted sentiment: 0.802
------------------------------------------------------------
User 2 (ID: A18ECVX2RJ7HUE):
Reviews: 18
Review text: good flavor! these came securely packed... they were fresh and delicious! i love these Twizzlers!...
Actual rating: 4/5
Base sentiment: 0.997
User trust score: 0.090 (Low)
Trust-weighted sentiment: 0.725
------------------------------------------------------------
User 3 (ID: A2MUGFV2TDQ47K):
Reviews: 161
Review text: The Strawberry Twizzlers are my guilty pleasure - yummy. Six pounds will be around for a while with ...
Actual rating: 5/5
Base sentiment: 0.996
User trust score: 0.120 (Low)
Trust-weighted sentiment: 0.733
----------------------------------------------------

In [None]:
def process_reviews_with_trust(reviews_df, trust_aware_analyzer, sample_size=100,
                               random_state=None, min_reviews=3):
    """Process multiple reviews with trust weighting.

    Args:
        reviews_df: Review rows with 'Id', 'UserId', 'Score' and 'Text' columns.
        trust_aware_analyzer: Object providing ``predict_with_trust(text, user_reviews)``
            that returns a dict with 'base_sentiment', 'user_trust_score',
            'trust_weighted_sentiment' and 'trust_level'.
        sample_size: Maximum number of reviews to sample for scoring.
        random_state: Seed for the sampling; pass a value for a reproducible sample.
        min_reviews: Sampled reviews whose author has fewer reviews than this in
            ``reviews_df`` are skipped.

    Returns:
        DataFrame with one row per scored review. The columns are always present,
        even when no sampled review qualifies.
    """
    result_columns = ['user_id', 'review_id', 'actual_rating', 'base_sentiment',
                      'user_trust_score', 'trust_weighted_sentiment', 'trust_level']

    # Sample reviews for processing
    sample_reviews = reviews_df.sample(n=min(sample_size, len(reviews_df)),
                                       random_state=random_state)

    # Collect each sampled user's history in one pass instead of scanning the
    # whole frame once per sampled review.
    sampled_users = sample_reviews['UserId'].unique()
    relevant_rows = reviews_df[reviews_df['UserId'].isin(sampled_users)]
    histories = {uid: rows for uid, rows in relevant_rows.groupby('UserId', sort=False)}

    results = []
    for _, review in sample_reviews.iterrows():
        user_reviews = histories.get(review['UserId'])

        # Only process users with sufficient history
        if user_reviews is None or len(user_reviews) < min_reviews:
            continue

        result = trust_aware_analyzer.predict_with_trust(
            review['Text'],
            user_reviews
        )

        results.append({
            'user_id': review['UserId'],
            'review_id': review['Id'],
            'actual_rating': review['Score'],
            'base_sentiment': result['base_sentiment'],
            'user_trust_score': result['user_trust_score'],
            'trust_weighted_sentiment': result['trust_weighted_sentiment'],
            'trust_level': result['trust_level']
        })

    return pd.DataFrame(results, columns=result_columns)

# Score a 50-review sample with the trust-aware analyzer.
results_df = process_reviews_with_trust(df, trust_aware_analyzer, sample_size=50)

# Summary: how many reviews were scored and how the trust levels are distributed.
print("=== Trust-Weighted Sentiment Analysis Results ===")
print("Processed {} reviews".format(len(results_df)))
print()
print("Trust Score Distribution:")
print(results_df['trust_level'].value_counts())

# Mean of each score column, preceded by a blank line.
print()
for caption, column in (
    ("Average Trust Score", 'user_trust_score'),
    ("Average Base Sentiment", 'base_sentiment'),
    ("Average Trust-Weighted Sentiment", 'trust_weighted_sentiment'),
):
    print(f"{caption}: {results_df[column].mean():.3f}")

# First ten scored reviews.
print()
print("=== Sample Results ===")
print(results_df[['user_id', 'actual_rating', 'base_sentiment', 'user_trust_score', 'trust_weighted_sentiment', 'trust_level']].head(10))


=== Trust-Weighted Sentiment Analysis Results ===
Processed 29 reviews

Trust Score Distribution:
trust_level
Low    29
Name: count, dtype: int64

Average Trust Score: 0.162
Average Base Sentiment: 0.922
Average Trust-Weighted Sentiment: 0.689

=== Sample Results ===
          user_id  actual_rating  base_sentiment  user_trust_score  \
0  A1FBHGNW57A0HS              4        0.997216              0.05   
1   APJUOLME891QY              5        0.997248              0.30   
2  A1MZEMD0EDETS6              5        0.995913              0.07   
3  A1E5N6AYSBYARO              5        0.997318              0.08   
4   A7PPSGWM6385B              5        0.996980              0.14   
5  A1BAJ6JXU8PSZH              2        0.443064              0.17   
6  A1F14BB4PV053A              5        0.996980              0.30   
7   A3K3JYARN27M4              5        0.997206              0.30   
8  A3JHC8O59WDHFZ              2        0.975977              0.12   
9  A1RK93WQZKQ79X              4

In [None]:
# Persist the trust predictor and the combined analyzer for later sessions.
import pickle

# One pickle file per object: the trust predictor, then the combined analyzer.
for pickle_path, artifact in (
    ('trust_predictor.pkl', trust_predictor),
    ('trust_aware_analyzer.pkl', trust_aware_analyzer),
):
    with open(pickle_path, 'wb') as fh:
        pickle.dump(artifact, fh)

# Tell the user what now exists on disk.
for message in (
    "All models saved successfully!",
    "Files created:",
    "- sentiment_model/ (BERT sentiment model)",
    "- trust_predictor.pkl (Trust prediction model)",
    "- trust_aware_analyzer.pkl (Combined analyzer)",
):
    print(message)


All models saved successfully!
Files created:
- sentiment_model/ (BERT sentiment model)
- trust_predictor.pkl (Trust prediction model)
- trust_aware_analyzer.pkl (Combined analyzer)


In [None]:
# Interactive check: score one typed-in review with the already-trained `trust_predictor`
# (an instance of UserTrustScorePredictor).

# Step 1: read the review text from the user
user_review = input("Enter your review: ")

# Step 2: wrap it in a one-row DataFrame with a placeholder user id and a 5-star score
import pandas as pd
user_reviews_df = pd.DataFrame([{"UserId": 1, "Score": 5, "Text": user_review}])

# Step 3: ask the trust model for this "user's" trust probability
trust_score = trust_predictor.predict_user_trust(user_reviews_df)

# Step 4: report it with two decimals
print("Predicted Trust Score: {:.2f}".format(trust_score))


Enter your review: very bad food with guilty environment.
Predicted Trust Score: 0.30


In [None]:
# NOTE(review): the recorded output of the next cell shows `transformers.pipelines` failing to
# import a TensorFlow internal once this cell has run — confirm these pins are actually needed
# (and restart the runtime after installing) before keeping this cell.
!pip install tensorflow==2.12.0 keras==2.12.0



In [None]:
from transformers import pipeline
import pandas as pd

# Requires the trained `trust_predictor` (a UserTrustScorePredictor) from the cells above.
# Build the stock sentiment-analysis pipeline; no model name is given, so the library default is used.
sentiment_analyzer = pipeline("sentiment-analysis")

# Step 1: read the review text from the user
user_reviews = input("Enter your review(s): ")

# Step 2: wrap the text in a one-row DataFrame (placeholder user id, example score of 5)
user_reviews_df = pd.DataFrame([{"UserId": 1, "Score": 5, "Text": user_reviews}])

# Step 3: trust probability for this "user"
trust_score = trust_predictor.predict_user_trust(user_reviews_df)

# Step 4: sentiment of the text; the pipeline returns a list, keep its first entry
sentiment = sentiment_analyzer(user_reviews)[0]

# Step 5: print both results together
print("\n--- Combined Analysis ---")
print("Review: {}".format(user_reviews))
print("Sentiment: {} (Confidence: {:.2f})".format(sentiment['label'], sentiment['score']))
print("Predicted Trust Score: {:.2f}".format(trust_score))



Failed to import TF-Keras. Please note that TF-Keras is not installed by default when you install TensorFlow Probability. This is so that JAX-only users do not have to install TensorFlow or TF-Keras. To use TensorFlow Probability with TensorFlow, please install the tf-keras or tf-keras-nightly package.
This can be be done through installing the tensorflow-probability[tf] extra.




RuntimeError: Failed to import transformers.pipelines because of the following error (look up to see its traceback):
cannot import name 'Tensor' from 'tensorflow.python.framework.ops' (/usr/local/lib/python3.11/dist-packages/tensorflow/python/framework/ops.py)

In [None]:
jupyter nbconvert --clear-output sentiment_trust_complete_v2.ipynb --inplace
