In [4]:
!pip install flask
!pip install pyngrok
!pip install transformers
!pip install torch




In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import numpy as np
import re
import random
from sklearn.model_selection import train_test_split
from flask import Flask, request, jsonify

# Set the seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)


In [6]:
def extract_reviews(file_path):
    reviews = []
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

        # Remove XML tags and extract review text
        review_texts = re.findall(r'<review_text>(.*?)</review_text>', content, re.DOTALL)
        reviews.extend(review_texts)

    return reviews

# Load positive and negative reviews
positive_reviews = extract_reviews('/content/positive.review')
negative_reviews = extract_reviews('/content/negative.review')

# Example of how to print the first few reviews
print("Positive Reviews (first 2):", positive_reviews[:2])
print("Negative Reviews (first 2):", negative_reviews[:2])


Positive Reviews (first 2): ['\nSphere by Michael Crichton is an excellant novel. This was certainly the hardest to put down of all of the Crichton novels that I have read. \n\nThe story revolves around a man named Norman Johnson. Johnson is a phycologist. He travels with 4 other civilans to a remote location in the Pacific Ocean to help the Navy in a top secret misssion. They quickly learn that under the ocean is a half mile long spaceship. The civilans travel to a center 1000 feet under the ocean to live while researching the spacecraft. They are joined by 5 Navy personel to help them run operations. However on the surface a typhoon comes and the support ships on the surface must leave. The team of ten is stuck 1000 feet under the surface of the ocean. After a day under the sea they find out that the spacecraft is actually an American ship that has explored black holes and has brought back some strange things back to earth.\n\nThis novel does not have the research that some of the ot

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 128

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def prepare_data(positive_reviews, negative_reviews, tokenizer, max_len):
    texts = positive_reviews + negative_reviews
    labels = [1] * len(positive_reviews) + [0] * len(negative_reviews)

    train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)
    train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=42)

    train_dataset = SentimentDataset(train_texts, train_labels, tokenizer, max_len)
    val_dataset = SentimentDataset(val_texts, val_labels, tokenizer, max_len)
    test_dataset = SentimentDataset(test_texts, test_labels, tokenizer, max_len)

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

    return train_loader, val_loader, test_loader

# Prepare data loaders
train_loader, val_loader, test_loader = prepare_data(positive_reviews, negative_reviews, tokenizer, max_len)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_loader) * 10  # 10 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

def train_model(model, train_loader, val_loader, optimizer, scheduler, num_epochs=10):
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            running_loss += loss.item()

        val_loss = evaluate_model(model, val_loader)
        print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {running_loss/len(train_loader)}, Validation Loss: {val_loss}')

def evaluate_model(model, val_loader):
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()
    return val_loss / len(val_loader)

def test_model(model, test_loader):
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
    accuracy = correct / total
    print(f'Test Accuracy: {accuracy * 100:.2f}%')

# Train and test the model
train_model(model, train_loader, val_loader, optimizer, scheduler, num_epochs=5)
test_model(model, test_loader)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/5: 100%|██████████| 90/90 [00:35<00:00,  2.52it/s]


Epoch 1/5, Training Loss: 0.5112735390663147, Validation Loss: 0.27416943311691283


Epoch 2/5: 100%|██████████| 90/90 [00:35<00:00,  2.56it/s]


Epoch 2/5, Training Loss: 0.22309012276430926, Validation Loss: 0.24696734845638274


Epoch 3/5: 100%|██████████| 90/90 [00:36<00:00,  2.44it/s]


Epoch 3/5, Training Loss: 0.09390129529767566, Validation Loss: 0.30287876799702645


Epoch 4/5: 100%|██████████| 90/90 [00:37<00:00,  2.41it/s]


Epoch 4/5, Training Loss: 0.048843416447440786, Validation Loss: 0.3123351659625769


Epoch 5/5: 100%|██████████| 90/90 [00:37<00:00,  2.40it/s]


Epoch 5/5, Training Loss: 0.03877392812218103, Validation Loss: 0.3649290405213833
Test Accuracy: 86.25%


In [13]:

def predict_sentiment(review_text, model, tokenizer, max_len):
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    model.eval()

    encoding = tokenizer.encode_plus(
        review_text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    with torch.no_grad():
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        predicted_class = torch.argmax(outputs.logits, dim=1).item()

    return "Positive" if predicted_class == 1 else "Negative"

# Example usage:
review1 = "This movie was absolutely terrible. The acting was wooden, the plot was nonsensical, and the special effects were laughable."
review2 = "I loved this film! It was heartwarming, funny, and kept me on the edge of my seat the whole time."

prediction1 = predict_sentiment(review1, model, tokenizer, max_len)
prediction2 = predict_sentiment(review2, model, tokenizer, max_len)

print(f"Review 1 Sentiment: {prediction1}")
print(f"Review 2 Sentiment: {prediction2}")


Review 1 Sentiment: Negative
Review 2 Sentiment: Positive


In [14]:
# Save the fine-tuned BERT model
model_save_path = '/content/model'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)


('/content/model/tokenizer_config.json',
 '/content/model/special_tokens_map.json',
 '/content/model/vocab.txt',
 '/content/model/added_tokens.json')

In [15]:
from flask import Flask, request, render_template_string
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from pyngrok import ngrok

app = Flask(__name__)

# Load the saved model and tokenizer
model_path = '/content/model'  # Update with your actual path
loaded_model = BertForSequenceClassification.from_pretrained(model_path)
loaded_tokenizer = BertTokenizer.from_pretrained(model_path)

def predict_sentiment(text):
    inputs = loaded_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = loaded_model(**inputs)
    logits = outputs.logits
    probs = torch.softmax(logits, dim=1)
    predicted_class = torch.argmax(probs, dim=1).item()
    return "Positive" if predicted_class == 1 else "Negative"

@app.route('/', methods=['GET', 'POST'])
def index():
    if request.method == 'POST':
        review_text = request.form['review']
        sentiment = predict_sentiment(review_text)
        return render_template_string('''
            <h1>Sentiment Analysis</h1>
            <form method="POST" action="/">
                <textarea name="review" placeholder="Enter a review..."></textarea><br>
                <button type="submit">Analyze Sentiment</button>
            </form>
            <h2>Review:</h2>
            <p>{{ review }}</p>
            <h2>Sentiment:</h2>
            <p>{{ sentiment }}</p>
        ''', review=review_text, sentiment=sentiment)
    return render_template_string('''
        <h1>Sentiment Analysis</h1>
        <form method="POST" action="/">
            <textarea name="review" placeholder="Enter a review..."></textarea><br>
            <button type="submit">Analyze Sentiment</button>
        </form>
    ''')


ngrok.set_auth_token('2kqC7WyPG8YqesaSklIKBODOvh1_6PaYUrMzv6UsK4ietUqk')  # Optional: if you have an ngrok account
public_url = ngrok.connect(5000)
print('Public URL:', public_url)
app.run(port=5000)


Public URL: NgrokTunnel: "https://5d16-34-143-190-182.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [18/Aug/2024 17:43:38] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [18/Aug/2024 17:43:39] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [18/Aug/2024 17:43:45] "POST / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [18/Aug/2024 17:43:52] "POST / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [18/Aug/2024 17:44:11] "POST / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [18/Aug/2024 17:44:30] "POST / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [18/Aug/2024 17:44:50] "POST / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [18/Aug/2024 17:45:42] "POST / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [18/Aug/2024 17:45:49] "POST / HTTP/1.1" 200 -
