In [None]:
import pandas as pd

# Read the CSV file. No bad-line handling is configured, so a malformed row
# surfaces as ParserError instead of being skipped.
try:
    df = pd.read_csv('rotten_tomatoes_critic_reviews.csv', header=0)
except pd.errors.ParserError as e:
    print(f"Error parsing CSV file: {e}")
    # Re-raise: with no frame loaded, every statement below would otherwise
    # fail with an unrelated NameError on `df`.
    raise

# Display basic statistics and columns
print(df.describe())
print(df.columns)

# Extract relevant columns
data = df[['review_type', 'review_content']]
print(data.head())

# Check for missing values
missing_values = data.isnull().sum()
print("Missing Values:")
print(missing_values)

# Drop rows with missing values. The explicit copy detaches the result from
# `df`, so later cells can add columns without a SettingWithCopyWarning.
dataCleaned = data.dropna().copy()
print(dataCleaned.head())

# Check for missing values after cleaning
missing_values = dataCleaned.isnull().sum()
print("Missing Values after cleaning:")
print(missing_values)


                     rotten_tomatoes_link   critic_name top_critic  \
count                             1130017       1111488    1130017   
unique                              17712         11108          2   
top     m/star_wars_the_rise_of_skywalker  Emanuel Levy      False   
freq                                  992          8173     841481   

        publisher_name review_type review_score review_date  \
count          1130017     1130017       824081     1130017   
unique            2230           2          814        8015   
top     New York Times       Fresh          3/5  2000-01-01   
freq             13293      720210        90273       48019   

                 review_content  
count                   1064211  
unique                   949181  
top     Parental Content Review  
freq                        267  
Index(['rotten_tomatoes_link', 'critic_name', 'top_critic', 'publisher_name',
       'review_type', 'review_score', 'review_date', 'review_content'],
      dtype='

In [None]:
# Preview the first rows of the cleaned frame (rows with missing values removed)
dataCleaned.head()

Unnamed: 0,review_type,review_content
0,Fresh,A fantasy adventure that fuses Greek mythology...
1,Fresh,"Uma Thurman as Medusa, the gorgon with a coiff..."
2,Fresh,With a top-notch cast and dazzling special eff...
3,Fresh,Whether audiences will get behind The Lightnin...
4,Rotten,What's really lacking in The Lightning Thief i...


In [None]:
# Import LabelEncoder from scikit-learn
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit the encoder on the review types and store the integer codes in a new
# column. assign() returns a new frame rather than writing into the existing
# one, which avoids pandas' SettingWithCopyWarning and keeps the cell safe to
# re-run.
dataCleaned = dataCleaned.assign(
    label_encoded=label_encoder.fit_transform(dataCleaned['review_type'])
)

# Check the mapping of original labels to encoded values
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping:", label_mapping)

# Split data into features (x) and labels (y)
x = dataCleaned['review_content']
y = dataCleaned['label_encoded']

Label Mapping: {'Fresh': 0, 'Rotten': 1}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataCleaned['label_encoded'] = label_encoder.fit_transform(dataCleaned['review_type'])


In [None]:
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocess_data(texts, labels, max_length=242):
    """Tokenize texts with the module-level BERT tokenizer and tensorize labels.

    Args:
        texts: Iterable of strings; each one is tokenized on its own.
        labels: Class labels aligned with ``texts``; converted to an int32
            tensor.
        max_length: Token length every sequence is padded or truncated to.
            Defaults to 242, the value that used to be hard-coded here.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)``. The first two are the
        per-text encodings concatenated along axis 0, in the order of
        ``texts``.
    """
    input_ids = []
    attention_masks = []
    for text in texts:
        # Calling the tokenizer directly is the documented replacement for
        # the deprecated encode_plus(); the arguments are unchanged.
        encoded_text = tokenizer(text, add_special_tokens=True, max_length=max_length, padding='max_length', truncation=True, return_attention_mask=True, return_tensors='tf')
        input_ids.append(encoded_text['input_ids'])
        attention_masks.append(encoded_text['attention_mask'])
    # Join the per-text tensors into one tensor each.
    input_ids = tf.concat(input_ids, axis=0)
    attention_masks = tf.concat(attention_masks, axis=0)
    labels = tf.convert_to_tensor(labels, dtype=tf.int32)
    return input_ids, attention_masks, labels

# Tokenize every review once, up front
input_ids, attention_masks, labels = preprocess_data(x, y)

# Draw a seeded 80/20 train/validation partition over the row positions,
# keeping the positions both as NumPy arrays and as TensorFlow constants
train_indices, val_indices = map(
    np.array,
    train_test_split(range(len(input_ids)), test_size=0.2, random_state=42),
)
train_indices_tf, val_indices_tf = map(tf.constant, (train_indices, val_indices))

# Gather the training rows, then the validation rows, of each tensor
train_input_ids, train_attention_masks, train_labels = (
    tf.gather(tensor, train_indices_tf)
    for tensor in (input_ids, attention_masks, labels)
)
val_input_ids, val_attention_masks, val_labels = (
    tf.gather(tensor, val_indices_tf)
    for tensor in (input_ids, attention_masks, labels)
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
from transformers import TFBertForSequenceClassification
# Load Pretrained BERT Model
# Builds a sequence-classification model from the 'bert-base-uncased'
# checkpoint. The recorded output of this cell reports that
# 'classifier.weight' and 'classifier.bias' are newly initialized, so the
# model must be trained before its predictions are meaningful.
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported


In [None]:
# Fine-Tune BERT on your Dataset
# Adam with a 2e-5 learning rate; the loss takes integer labels and raw logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

# Three epochs in batches of 32, with the validation split passed alongside
# so Keras reports validation metrics during training.
history = model.fit(
    x=[train_input_ids, train_attention_masks],
    y=train_labels,
    batch_size=32,
    epochs=3,
    validation_data=([val_input_ids, val_attention_masks], val_labels),
)


In [None]:

# Evaluate the Model
# Scores the validation split with the compiled loss and accuracy metric.
# Note: this rebinds `loss`, which the fine-tuning cell assigned a
# SparseCategoricalCrossentropy object, to the value returned by evaluate().
loss, accuracy = model.evaluate([val_input_ids, val_attention_masks], val_labels)
print("Validation Accuracy:", accuracy)


In [None]:

# Make Predictions
# Score the validation split and keep the highest-scoring class per row
predictions = model.predict([val_input_ids, val_attention_masks])
predicted_labels = np.argmax(predictions.logits, axis=1)

# Confusion Matrix and Classification Report
class_report = classification_report(val_labels, predicted_labels)
conf_matrix = confusion_matrix(val_labels, predicted_labels)

# Each heading and its table are printed on consecutive lines
print("Confusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", class_report, sep="\n")

In [None]:
# NOTE(review): `data_num` is not assigned in any cell visible in this
# notebook; on a fresh kernel this raises NameError — confirm where it is
# supposed to come from.
data_num

In [None]:
!pip install transformers scikit-learn tensorflow

In [None]:
# 1. Preprocess the Data

from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize text and convert labels to numerical format
def preprocess_data(texts, labels, max_length=242):
    """Tokenize texts with the module-level BERT tokenizer and tensorize labels.

    Args:
        texts: Iterable of strings; each one is tokenized on its own.
        labels: Class labels aligned with ``texts``; converted to an int32
            tensor.
        max_length: Token length every sequence is padded or truncated to.
            Defaults to 242, the value that used to be hard-coded here.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)``. The first two are the
        per-text encodings concatenated along axis 0, in the order of
        ``texts``.
    """
    input_ids = []
    attention_masks = []
    for text in texts:
        # Calling the tokenizer directly is the documented replacement for
        # the deprecated encode_plus(); the arguments are unchanged.
        encoded_text = tokenizer(text, add_special_tokens=True, max_length=max_length, padding='max_length', truncation=True, return_attention_mask=True, return_tensors='tf')
        input_ids.append(encoded_text['input_ids'])
        attention_masks.append(encoded_text['attention_mask'])
    # Join the per-text tensors into one tensor each.
    input_ids = tf.concat(input_ids, axis=0)
    attention_masks = tf.concat(attention_masks, axis=0)
    # Convert the labels as the comment above promises, matching the earlier
    # definition of this function in the notebook.
    labels = tf.convert_to_tensor(labels, dtype=tf.int32)
    return input_ids, attention_masks, labels


In [None]:
# Tokenize the review text and pair it with the encoded labels.
# NOTE(review): `data_num` is not assigned in any cell visible in this
# notebook — confirm its source before relying on this cell.
input_ids, attention_masks, labels = preprocess_data(data_num['review_content'], data_num['label_encoded'])

In [None]:
# Split data into train and validation sets
# The split is over row positions (0 .. len(input_ids) - 1), not the tensors
# themselves: 20% of positions go to validation, with a fixed seed so the
# partition is repeatable.
train_indices, val_indices = train_test_split(range(len(input_ids)), test_size=0.2, random_state=42)

In [None]:
import numpy as np
# Rebind both index collections as NumPy arrays in a single statement
train_indices, val_indices = np.array(train_indices), np.array(val_indices)

In [None]:
import tensorflow as tf

# Convert NumPy arrays to TensorFlow tensors
# Both index arrays are rebound to int32 tensors under their existing names.
train_indices = tf.convert_to_tensor(train_indices, dtype=tf.int32)
val_indices = tf.convert_to_tensor(val_indices, dtype=tf.int32)

In [None]:
# Use the indices to split the input_ids, attention_masks, and labels
# Training rows first, then validation rows, for each of the three tensors
train_input_ids, train_attention_masks, train_labels = (
    tf.gather(tensor, train_indices)
    for tensor in (input_ids, attention_masks, labels)
)
val_input_ids, val_attention_masks, val_labels = (
    tf.gather(tensor, val_indices)
    for tensor in (input_ids, attention_masks, labels)
)

In [None]:
# Convert the indices to TensorFlow tensors
# NOTE(review): if the tf.convert_to_tensor cell earlier in the notebook has
# already run, train_indices / val_indices are tensors at this point and
# these calls only wrap them again — confirm both conversions are needed.
train_indices_tf = tf.constant(train_indices)
val_indices_tf = tf.constant(val_indices)

In [None]:
# Use the indices to split the input_ids, attention_masks, and labels
# Each tensor is gathered twice: once with the train positions, once with the
# validation positions.
train_input_ids, val_input_ids = (
    tf.gather(input_ids, positions) for positions in (train_indices_tf, val_indices_tf)
)
train_attention_masks, val_attention_masks = (
    tf.gather(attention_masks, positions) for positions in (train_indices_tf, val_indices_tf)
)
train_labels, val_labels = (
    tf.gather(labels, positions) for positions in (train_indices_tf, val_indices_tf)
)


In [None]:
# 2. Load Pretrained BERT Model
from transformers import TFBertForSequenceClassification

# Builds a sequence-classification model from the 'bert-base-uncased'
# checkpoint and binds it to `model`, replacing any model bound to that name
# by an earlier cell.
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')


In [None]:
# 3. Fine-Tune BERT on your Dataset
# Adam with a 2e-5 learning rate; the loss takes integer labels and raw logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

# Three epochs in batches of 32, with the validation split passed alongside
# so Keras reports validation metrics during training.
history = model.fit(
    x=[train_input_ids, train_attention_masks],
    y=train_labels,
    batch_size=32,
    epochs=3,
    validation_data=([val_input_ids, val_attention_masks], val_labels),
)


In [None]:
# Display the review_type column of the cleaned frame (inspection only;
# nothing is assigned)
dataCleaned['review_type']

In [None]:
# 4. Evaluate the Model
# Scores the validation split with the compiled loss and accuracy metric.
# Note: this rebinds `loss`, which the fine-tuning cell assigned a
# SparseCategoricalCrossentropy object, to the value returned by evaluate().
loss, accuracy = model.evaluate([val_input_ids, val_attention_masks], val_labels)
print("Validation Accuracy:", accuracy)


In [None]:
# 5. Make Predictions
# NOTE(review): `x_test` and `y_test` are not assigned in any cell visible in
# this notebook; on a fresh kernel this raises NameError — confirm where the
# test split is created.
test_input_ids, test_attention_masks, test_labels = preprocess_data(x_test, y_test)
# Run the fine-tuned model on the tokenized test texts
predictions = model.predict([test_input_ids, test_attention_masks])