In [None]:
import tensorflow_datasets as tfds
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import string

# Fetch the NLTK stop-word corpus, then keep the English list as a set
# so membership checks during text cleaning are constant-time.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import tensorflow_datasets as tfds
import pandas as pd

# Pull the IMDB reviews from TensorFlow Datasets.
# `split` is a list, so `imdb_data` comes back as [train, test] in that order;
# `as_supervised=True` makes each element a (text, label) pair.
imdb_data, info = tfds.load(
    name='imdb_reviews',
    split=['train', 'test'],
    as_supervised=True,
    with_info=True,
)

# Convert the data to a Pandas DataFrame
def convert_to_dataframe(tf_dataset):
    """Materialise an iterable of ``(text, label)`` pairs as a DataFrame.

    Args:
        tf_dataset: Iterable yielding ``(text, label)`` pairs whose elements
            expose ``.numpy()``. The text value is expected to be ``bytes``
            and the label something ``int()`` accepts.

    Returns:
        pandas.DataFrame with two columns: ``review`` (decoded ``str``) and
        ``sentiment`` (``int``), one row per element of ``tf_dataset``.
    """
    texts = []
    labels = []
    for text, label in tf_dataset:
        raw_text = text.numpy()
        if isinstance(raw_text, bytes):
            # Decode instead of str(): str(b"...") yields the bytes *repr*
            # ("b'...'", with escaped quotes and non-ASCII bytes), which then
            # pollutes the first token of every review downstream.
            # errors="replace" keeps one malformed byte from aborting the load.
            raw_text = raw_text.decode('utf-8', errors='replace')
        else:
            raw_text = str(raw_text)
        texts.append(raw_text)
        labels.append(int(label.numpy()))  # Convert label Tensor to integer
    return pd.DataFrame({'review': texts, 'sentiment': labels})

# Materialise both splits as DataFrames; `imdb_data` holds exactly the
# two requested splits, in (train, test) order.
train_df, test_df = map(convert_to_dataframe, imdb_data)

# Sanity-check the first few training rows
print(train_df.head())


Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.F30FAX_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.F30FAX_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.F30FAX_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.
                                              review  sentiment
0  b"This was an absolutely terrible movie. Don't...          0
1  b'I have been known to fall asleep during film...          0
2  b'Mann photographs the Alberta Rocky Mountains...          0
3  b'This is the kind of film for a snowy Sunday ...          1
4  b'As others have mentioned, all the women that...          1


In [None]:
# Preprocess the text: remove punctuation, stopwords, etc.
# Translation table that deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text, stopword_set=None):
    """Lowercase a review, drop stop words, and strip punctuation.

    Stop words are matched *before* inner punctuation is removed. Stripping
    punctuation first turns contractions such as "don't" into "dont", which
    no longer matches a stop-word list that stores them with the apostrophe,
    so those words would leak into the cleaned text.

    Args:
        text: The raw review string.
        stopword_set: Optional collection of lowercase stop words. Defaults
            to the notebook-level ``stop_words`` set.

    Returns:
        The remaining tokens joined by single spaces ("" if nothing is left).
    """
    if stopword_set is None:
        stopword_set = stop_words

    kept = []
    for token in text.lower().split():
        # Trim only the punctuation around the token ("movie." -> "movie",
        # "(don't)" -> "don't") so contractions can still be recognised.
        trimmed = token.strip(string.punctuation)
        if trimmed in stopword_set:
            continue

        # Now remove any punctuation left inside the token ("well-known").
        cleaned = trimmed.translate(_PUNCT_TABLE)
        if cleaned and cleaned not in stopword_set:
            kept.append(cleaned)

    # Join the words back into a single string
    return ' '.join(kept)

# Add a cleaned copy of every review next to the raw text, for both splits
train_df['cleaned_review'] = train_df['review'].map(preprocess_text)
test_df['cleaned_review'] = test_df['review'].map(preprocess_text)

# Show raw vs. cleaned text side by side for a quick visual check
print(train_df.loc[:, ['review', 'cleaned_review']].head())


                                              review  \
0  b"This was an absolutely terrible movie. Don't...   
1  b'I have been known to fall asleep during film...   
2  b'Mann photographs the Alberta Rocky Mountains...   
3  b'This is the kind of film for a snowy Sunday ...   
4  b'As others have mentioned, all the women that...   

                                      cleaned_review  
0  bthis absolutely terrible movie dont lured chr...  
1  bi known fall asleep films usually due combina...  
2  bmann photographs alberta rocky mountains supe...  
3  bthis kind film snowy sunday afternoon rest wo...  
4  bas others mentioned women go nude film mostly...  


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the cleaned reviews into TF-IDF feature matrices.
# The vocabulary is capped at 5000 terms and is learned from the training
# split only; the test split is projected onto that same vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(raw_documents=train_df['cleaned_review'])
X_test_tfidf = vectorizer.transform(raw_documents=test_df['cleaned_review'])

# Report (n_documents, n_features) for each matrix
print(
    f"Training data shape: {X_train_tfidf.shape}",
    f"Test data shape: {X_test_tfidf.shape}",
    sep="\n",
)


Training data shape: (25000, 5000)
Test data shape: (25000, 5000)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Build and fit the logistic-regression classifier on the TF-IDF features.
# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression(max_iter=200).fit(X_train_tfidf, train_df['sentiment'])

# Predict sentiment labels for the held-out test reviews
y_pred = model.predict(X_test_tfidf)

# Overall accuracy on the test split
accuracy = accuracy_score(y_true=test_df['sentiment'], y_pred=y_pred)
print(f"Test Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1 breakdown
print(classification_report(y_true=test_df['sentiment'], y_pred=y_pred))


Test Accuracy: 0.88
              precision    recall  f1-score   support

           0       0.88      0.88      0.88     12500
           1       0.88      0.89      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Build and fit a linear-kernel SVM on the same TF-IDF features.
# fit() returns the estimator itself, so construction and training chain.
svm_model = SVC(kernel='linear').fit(X_train_tfidf, train_df['sentiment'])

# Predict sentiment labels for the held-out test reviews
y_pred_svm = svm_model.predict(X_test_tfidf)

# Overall accuracy on the test split
accuracy_svm = accuracy_score(y_true=test_df['sentiment'], y_pred=y_pred_svm)
print(f"SVM Test Accuracy: {accuracy_svm:.2f}")

# Per-class precision / recall / F1 breakdown
print("SVM Classification Report:")
print(classification_report(y_true=test_df['sentiment'], y_pred=y_pred_svm))

SVM Test Accuracy: 0.88
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.88      0.88     12500
           1       0.88      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000

