# Problem Statement: IMDb Movie Review Sentiment Analysis

In [None]:
import re
import numpy as np
from collections import Counter

In [None]:
# Function to preprocess text data
def preprocess_text(text):
    """Normalise a raw review into lowercase, letters-and-whitespace-only text.

    Args:
        text: Raw review string (may contain HTML markup such as ``<br />``).

    Returns:
        The lowercased string with HTML tags replaced by a space and every
        remaining character that is not ``a-z`` or whitespace removed.
    """
    text = text.lower()
    # Replace HTML tags with a space *before* stripping punctuation. Otherwise
    # "great.<br /><br />the" collapses to "greatbr br the", creating bogus
    # "br" tokens and gluing neighbouring words together. The tag must start
    # with a letter (optionally after "/") so text like "<3" is not a tag.
    text = re.sub(r'</?[a-z][^>]*>', ' ', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    return text

In [None]:
# Function to create a Bag of Words representation
def create_bow(corpus):
    """Build the vocabulary lookup used by the Bag of Words features.

    Args:
        corpus: Iterable of document strings; each one is tokenised with
            ``str.split()``.

    Returns:
        Dict mapping every distinct token to a consecutive integer index,
        numbered in order of first appearance across the corpus.
    """
    vocabulary = {}
    for document in corpus:
        for token in document.split():
            # setdefault only inserts unseen tokens, so the current size of
            # the dict is always the next free index.
            vocabulary.setdefault(token, len(vocabulary))
    return vocabulary

In [None]:
# Function to convert text data into Bag of Words features
def text_to_bow(text, word_to_index):
    """Turn one document into a Bag of Words count vector.

    Args:
        text: Document string, tokenised with ``str.split()``.
        word_to_index: Dict mapping vocabulary tokens to vector positions.

    Returns:
        1-D float NumPy array of length ``len(word_to_index)`` holding the
        number of occurrences of each vocabulary token; tokens that are not
        in the vocabulary are ignored.
    """
    counts = np.zeros(len(word_to_index))
    for token in text.split():
        position = word_to_index.get(token)
        if position is None:
            # Out-of-vocabulary token: contributes nothing to the vector.
            continue
        counts[position] += 1
    return counts

In [None]:
# Logistic Regression Model
class LogisticRegression:
    """Binary logistic-regression classifier trained with batch gradient descent.

    The model has no intercept term: the predicted probability is
    ``sigmoid(X @ weights)``.

    Attributes:
        learning_rate: Step size applied to each gradient update.
        epochs: Number of full-batch gradient steps performed by ``fit``.
        weights: 1-D array with one coefficient per feature; created by ``fit``.
    """

    def __init__(self, learning_rate=0.01, epochs=1000):
        self.learning_rate = learning_rate
        self.epochs = epochs

    def sigmoid(self, z):
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise.

        The input is clipped to [-500, 500] before exponentiation so that
        ``np.exp`` cannot overflow for extreme logits; inside that range the
        result is identical to the unclipped formula, and outside it the
        output is already within ~1e-217 of 0 or 1.
        """
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        """Learn ``self.weights`` from training data.

        Args:
            X: 2-D array of shape (m, n) with one row per sample.
            y: Length-m array of 0/1 labels.
        """
        m, n = X.shape
        # Start from all-zero weights, i.e. a 0.5 probability for every sample.
        self.weights = np.zeros(n)
        for _ in range(self.epochs):
            z = np.dot(X, self.weights)
            predictions = self.sigmoid(z)
            # Gradient of the mean log-loss with respect to the weights.
            gradient = np.dot(X.T, (predictions - y)) / m
            self.weights -= self.learning_rate * gradient

    def predict(self, X):
        """Return hard 0.0/1.0 predictions for the rows of ``X``.

        Probabilities are rounded with ``np.round`` (round-half-to-even), so a
        probability of exactly 0.5 is classified as 0.0.
        """
        z = np.dot(X, self.weights)
        predictions = self.sigmoid(z)
        return np.round(predictions)

In [None]:
%pip install --upgrade kaggle


Collecting kaggle
  Downloading kaggle-1.6.4.tar.gz (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.6/84.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
  Created wheel for kaggle: filename=kaggle-1.6.4-py3-none-any.whl size=111926 sha256=2a0de9e8fc440bfd1ca15e546ab80cb2b3d5cf35e95b2cb4523aaba503e5d902
  Stored in directory: /root/.cache/pip/wheels/96/ea/6e/1ef402d5911f0536091cd6d9357682c64214efa960eab358b4
Successfully built kaggle
Installing collected packages: kaggle
  Attempting uninstall: kaggle
    Found existing installation: kaggle 1.5.16
    Uninstalling kaggle-1.5.16:
      Successfully uninstalled kaggle-1.5.16
Successfully installed kaggle-1.6.4


In [None]:
# Kaggle credentials: upload the API token file through Colab's file-upload helper.
from google.colab import files
uploaded = files.upload()

# Report the name and size (in bytes) of every uploaded file.
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

# Create ~/.kaggle/, move kaggle.json into it and restrict it to owner read/write.
# NOTE(review): assumes the upload was saved as "kaggle.json" in the current
# working directory, and that ~/.kaggle/ is where the kaggle CLI looks for it — confirm.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

KeyboardInterrupt: 

In [None]:
# Download the "imdb-dataset-of-50k-movie-reviews" dataset archive with the Kaggle CLI.
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
  0% 0.00/25.7M [00:00<?, ?B/s] 51% 13.0M/25.7M [00:00<00:00, 130MB/s]
100% 25.7M/25.7M [00:00<00:00, 172MB/s]


In [None]:
from zipfile import ZipFile
file_name = "/content/imdb-dataset-of-50k-movie-reviews.zip"
# Extract the downloaded archive into the current working directory.
# The handle is named `archive` rather than `zip`: binding `zip` at cell level
# would shadow the built-in zip() for every later cell in the kernel.
with ZipFile(file_name, 'r') as archive:
  archive.extractall()
  print('Done')

Done


In [None]:
# Load the dataset
import pandas as pd
df = pd.read_csv("/content/IMDB Dataset.csv")

# Taking first 22% rows of the dataframe.
# .copy() makes the subset an independent frame: assigning columns on a bare
# slice triggers pandas' SettingWithCopyWarning and leaves it ambiguous whether
# the write reaches the original data.
df = df[:(int(0.22 * len(df)))].copy()

# Preprocess the text data
df['review'] = df['review'].apply(preprocess_text)

# Map 'positive' to 1 and 'negative' to 0
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Split the data into training and validation sets (80-20 split).
# The split is positional (no shuffling): the first 80% of rows train the
# model and the remaining 20% are held out.
train_size = int(0.8 * len(df))
train_data, val_data = df[:train_size], df[train_size:]

# Create Bag of Words representation for training set.
# The vocabulary is built from the training reviews only.
word_to_index = create_bow(train_data['review'])
X_train = np.array([text_to_bow(text, word_to_index) for text in train_data['review']])
y_train = train_data['sentiment'].values

In [None]:
# Vectorise the held-out reviews with the vocabulary learned from the training
# split; words that only occur in the validation reviews are ignored.
X_val = np.array([text_to_bow(review, word_to_index) for review in val_data['review']])
y_val = val_data['sentiment'].to_numpy()

In [None]:
# Fit the logistic-regression classifier on the training features
lr_model = LogisticRegression(epochs=1000, learning_rate=0.01)
lr_model.fit(X_train, y_train)

# Predict a 0/1 label for every validation review
predictions = lr_model.predict(X_val)

# Accuracy is the share of validation predictions that equal the true label
accuracy = (predictions == y_val).mean()
print(f'Accuracy: {accuracy:.2%}')

Accuracy: 82.14%
