**Dataset Review**

In [1]:
import pandas as pd

df = pd.read_csv("/kaggle/input/imdb-reviews-dataset/IMDB Dataset.csv")
df.shape
df.head(5)
df.drop(columns = ['sentiment'])


Unnamed: 0,review
0,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...
2,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is..."
...,...
49995,I thought this movie did a down right good job...
49996,"Bad plot, bad dialogue, bad acting, idiotic di..."
49997,I am a Catholic taught in parochial elementary...
49998,I'm going to have to disagree with the previou...


**Preprocessing the dataset for Better Results** 

In [2]:
import pandas as pd
import spacy

# Load dataset
df = pd.read_csv("/kaggle/input/imdb-reviews-dataset/IMDB Dataset.csv")

# Load SpaCy model with unnecessary components disabled
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])  

# Function to preprocess text in chunks
def preprocess_texts(texts):
    cleaned_texts = []
    for doc in nlp.pipe(texts, batch_size=1000, n_process=4):  # Process in parallel
        tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
        cleaned_texts.append(" ".join(tokens))
    return cleaned_texts

# Split dataset into smaller chunks (e.g., 10,000 reviews per batch)
chunk_size = 10000
num_chunks = len(df) // chunk_size + 1

processed_chunks = []
for i in range(num_chunks):
    chunk = df.iloc[i * chunk_size : (i + 1) * chunk_size].copy()
    chunk['cleaned_review'] = preprocess_texts(chunk['review'])
    processed_chunks.append(chunk)
    print(f"✅ Processed chunk {i + 1}/{num_chunks}")

# Combine all processed chunks
df_cleaned = pd.concat(processed_chunks, ignore_index=True)
#print(df_cleaned.head(5))

# Save cleaned dataset
df_cleaned.to_csv("cleaned_IMDB_reviews.csv", index=False)

#print("✅ All chunks processed and saved successfully!")


✅ Processed chunk 1/6
✅ Processed chunk 2/6
✅ Processed chunk 3/6
✅ Processed chunk 4/6
✅ Processed chunk 5/6
✅ Processed chunk 6/6


**Applying Sentiment Analysis**

In [8]:
#installing textblob 

!pip install textblob



In [None]:
import pandas as pd 
import spacy
from textblob import TextBlob

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Function to determine sentiment
def getSentiment(text):
    doc = nlp(text)  # Process text with spaCy
    sentiment_score = TextBlob(doc.text).sentiment.polarity  # Get polarity score
    if sentiment_score > 0:
        return "positive"
    elif sentiment_score < 0:
        return "negative"
    else:
        return "neutral"

# Process dataset in chunks
chunksize = 10000
chunk_list = []  # Corrected list name

for chunk in pd.read_csv("/kaggle/input/moviereviews-cleaneddata/cleaned_IMDB_reviews.csv", chunksize=chunksize):
    # Drop 'review' column if it exists
    if "review" in chunk.columns:
        chunk = chunk.drop(columns=["review"])
    
    # Apply sentiment function
    chunk["sentiment"] = chunk["cleaned_review"].apply(getSentiment)

    # Append processed chunk
    chunk_list.append(chunk)

# Merge all chunks into a single DataFrame
sentiments_df = pd.concat(chunk_list, ignore_index=True)
sentiments_df.to_csv("/kaggle/working/IMDB_sentiments.csv", index = False)

