In [2]:
# Import necessary libraries for NLP and data processing
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sujith\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sujith\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sujith\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:

df = pd.read_csv('IMDB Dataset.csv')
df=df.iloc[:10000]


In [18]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [19]:

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = re.sub(f'[{string.punctuation}]', ' ', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove HTML elements
    text = re.sub(r'<[^>]+>', '', text)
    
    # Remove non-alphabetic characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove words containing numbers
    text = re.sub(r'\b\w*\d+\w*\b', '', text)
    # Remove single characters
    text = re.sub(r'\b\w\b', '', text)
    # Remove extra whitespace again after cleaning
    text = re.sub(r'\s+', ' ', text).strip()
   
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return tokens

# Apply preprocessing to text column
# Replace 'text_column' with your actual column name containing text
df['processed_text'] = df['review'].apply(preprocess_text)

# Display basic information about the dataset
print("Dataset shape:", df.shape)
print("\nFirst few rows of processed text:")
print(df['processed_text'].head())

Dataset shape: (10000, 3)

First few rows of processed text:
0    [one, reviewer, mentioned, watching, oz, episo...
1    [wonderful, little, production, br, br, filmin...
2    [thought, wonderful, way, spend, time, hot, su...
3    [basically, family, little, boy, jake, think, ...
4    [petter, mattei, love, time, money, visually, ...
Name: processed_text, dtype: object


In [20]:
# Display the first 3 reviews with their sentiment
for i in range(3):
    print(f"\nReview {i+1}:")
    print("-" * 50)
    print(f"Sentiment: {df['sentiment'].iloc[i]}")
    print(f"Original text:\n{df['review'].iloc[i]}")
    print(f"Processed tokens:\n{df['processed_text'].iloc[i]}")
    print("-" * 50)



Review 1:
--------------------------------------------------
Sentiment: positive
Original text:
One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady ag

In [21]:
df.head()

Unnamed: 0,review,sentiment,processed_text
0,One of the other reviewers has mentioned that ...,positive,"[one, reviewer, mentioned, watching, oz, episo..."
1,A wonderful little production. <br /><br />The...,positive,"[wonderful, little, production, br, br, filmin..."
2,I thought this was a wonderful way to spend ti...,positive,"[thought, wonderful, way, spend, time, hot, su..."
3,Basically there's a family where a little boy ...,negative,"[basically, family, little, boy, jake, think, ..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[petter, mattei, love, time, money, visually, ..."


In [22]:
# Create separate y array for sentiment labels
y = df['sentiment'].map({'positive': 1, 'negative': 0}).values

# Drop the review and sentiment columns from dataframe
df = df.drop(['review', 'sentiment'], axis=1)

# Display the updated dataframe and y array
print("Updated dataframe shape:", df.shape)
print("y array shape:", y.shape)
print("\nFirst few rows of the updated dataframe:")
print(df.head())
print("\nFirst few elements of y array:")
print(y[:5])


Updated dataframe shape: (10000, 1)
y array shape: (10000,)

First few rows of the updated dataframe:
                                      processed_text
0  [one, reviewer, mentioned, watching, oz, episo...
1  [wonderful, little, production, br, br, filmin...
2  [thought, wonderful, way, spend, time, hot, su...
3  [basically, family, little, boy, jake, think, ...
4  [petter, mattei, love, time, money, visually, ...

First few elements of y array:
[1 1 1 0 1]


In [23]:
df.head()

Unnamed: 0,processed_text
0,"[one, reviewer, mentioned, watching, oz, episo..."
1,"[wonderful, little, production, br, br, filmin..."
2,"[thought, wonderful, way, spend, time, hot, su..."
3,"[basically, family, little, boy, jake, think, ..."
4,"[petter, mattei, love, time, money, visually, ..."


In [24]:
# Create vocabulary from all documents
vocabulary = set()
for doc in df['processed_text']:
    vocabulary.update(doc)

# Convert to sorted list for consistent ordering
vocabulary = sorted(list(vocabulary))

# Print vocabulary statistics
print(f"Vocabulary size: {len(vocabulary)}")
print("\nFirst 20 words in vocabulary:")
print(vocabulary[:20])
print("\nLast 20 words in vocabulary:")
print(vocabulary[-20:])


Vocabulary size: 46273

First 20 words in vocabulary:
['aa', 'aaa', 'aaaaahhhh', 'aaaarrgh', 'aaah', 'aaall', 'aaargh', 'aaaugh', 'aag', 'aage', 'aaghh', 'aahed', 'aaip', 'aak', 'aaliyah', 'aames', 'aamir', 'aamto', 'aankhen', 'aap']

Last 20 words in vocabulary:
['zucco', 'zuccon', 'zucher', 'zucker', 'zucovic', 'zudina', 'zues', 'zukor', 'zula', 'zulu', 'zuniga', 'zurich', 'zurn', 'zwart', 'zwick', 'zz', 'zzzz', 'zzzzip', 'zzzzzzzzzzzz', 'zzzzzzzzzzzzzzzzzz']


In [25]:
# Count the number of positive (1) and negative (0) sentiments
unique, counts = np.unique(y, return_counts=True)
sentiment_counts = dict(zip(unique, counts))

print("Sentiment distribution:")
print(f"Positive (1): {sentiment_counts[1]}")
print(f"Negative (0): {sentiment_counts[0]}")


Sentiment distribution:
Positive (1): 5028
Negative (0): 4972


In [26]:
# Create a list to store word frequencies for each document
document_word_frequencies = []

# Iterate through each document in df['processed_text']
for doc in df['processed_text']:
    # Create a dictionary to store word frequencies for current document
    word_freq = {}
    
    # Count frequency of each word in the document
    for word in doc:
        word_freq[word] = word_freq.get(word, 0) + 1
    
    # Append the word frequencies dictionary to our list
    document_word_frequencies.append(word_freq)

# Convert the list of dictionaries to a DataFrame
word_freq_df = pd.DataFrame(document_word_frequencies)

# Fill NaN values with 0 (words that don't appear in a document)
word_freq_df = word_freq_df.fillna(0)

# Display the shape and first few rows of the word frequency DataFrame
print("Word frequency DataFrame shape:", word_freq_df.shape)
print("\nFirst few rows of word frequencies:")
print(word_freq_df.head())


Word frequency DataFrame shape: (10000, 46273)

First few rows of word frequencies:
   one  reviewer  mentioned  watching   oz  episode  hooked  right  exactly  \
0  1.0       1.0        1.0       2.0  6.0      2.0     1.0    2.0      1.0   
1  1.0       0.0        0.0       1.0  0.0      0.0     0.0    0.0      0.0   
2  1.0       0.0        0.0       1.0  0.0      0.0     0.0    1.0      0.0   
3  0.0       0.0        0.0       0.0  0.0      0.0     0.0    0.0      0.0   
4  6.0       0.0        0.0       0.0  0.0      0.0     0.0    0.0      1.0   

   happened  ...  bitty  cheapjack  webbed  victimhood  seppaku  rakkie  \
0       1.0  ...    0.0        0.0     0.0         0.0      0.0     0.0   
1       0.0  ...    0.0        0.0     0.0         0.0      0.0     0.0   
2       0.0  ...    0.0        0.0     0.0         0.0      0.0     0.0   
3       0.0  ...    0.0        0.0     0.0         0.0      0.0     0.0   
4       0.0  ...    0.0        0.0     0.0         0.0      0.0   

In [27]:
class logistic_regression:
    def __init__(self, learning_rate=0.01, num_iterations=1000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        
        for _ in range(self.num_iterations):
            linear_pred = np.dot(X, self.weights) + self.bias
            predictions = self.sigmoid(linear_pred)
            
            dw = (1 / n_samples) * np.dot(X.T, (predictions - y))
            db = (1 / n_samples) * np.sum(predictions - y)
            
            self.weights -= self.learning_rate * dw     
            self.bias -= self.learning_rate * db
            
    def predict(self, X):
        linear_pred = np.dot(X, self.weights) + self.bias
        y_pred = self.sigmoid(linear_pred)
        y_pred_class = [1 if i >= 0.5 else 0 for i in y_pred]
        return y_pred_class
    
    def accuracy(self, y_true, y_pred):
        y_true=list(y_true)
        y_pred=list(y_pred)
        accuracy = np.sum(y_true == y_pred) / len(y_true)
        return accuracy 
    

In [29]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

# Convert sentiment to numeric (positive=1, negative=0)


# Split the data with 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(word_freq_df, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = logistic_regression(learning_rate=0.01, num_iterations=1000)
model.fit(X_train, y_train)




Model accuracy on test set: 0.8180

Example predictions:


AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [32]:
# Make predictions on test set

# Print some example predictions
print("\nExample predictions:")
for i in range(5):
    print(f"True: {y_test[i]}, Predicted: {y_pred[i]}")


Example predictions:
True: 0, Predicted: 0
True: 0, Predicted: 1
True: 0, Predicted: 0
True: 1, Predicted: 1
True: 1, Predicted: 1
