# Importing Libraries

In [1]:
import nltk
import spacy
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Read the IMDB movie-review CSV from the Kaggle input directory into a DataFrame.

df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
# Count the missing values in each column

df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [None]:
# Count how often each review text occurs; a count above 1 means the same
# review appears more than once. (The statement was previously duplicated
# verbatim, printing the identical table twice.)
print(df['review'].value_counts())

In [8]:
# Initializing objects for data pre-processing

# NLTK WordNet lemmatizer.
# NOTE(review): `lemma` is not referenced by any cell visible in this notebook — confirm whether it is still needed.
lemma = WordNetLemmatizer()
# spaCy 'en_core_web_sm' pipeline; process() calls it to tokenize and lemmatize text.
nlp = spacy.load('en_core_web_sm')

In [11]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, built once and cached."""
    return frozenset(stopwords.words('english'))


def process(text):
    """
    Pre-process raw text for sentiment prediction.

    The text is run through the spaCy pipeline ``nlp``. Each token is replaced
    by its lemma, and tokens are dropped when their lemma is an NLTK English
    stop word or when the token is punctuation or whitespace.

    Parameters:
    text (str): Input text for sentiment prediction

    Returns:
    str: pre-processed text (the remaining lemmas joined by single spaces)
    """
    # Look the stop words up once per call. The previous version rebuilt
    # set(stopwords.words('english')) for every single token.
    stop_words = _english_stopwords()
    doc = nlp(text)
    # NOTE(review): the stop-word comparison is case-sensitive, so capitalised
    # lemmas such as "I" are kept — confirm whether that is intended.
    refined_text = [
        token.lemma_
        for token in doc
        if token.lemma_ not in stop_words
        and not token.is_punct
        and not token.is_space
    ]
    return ' '.join(refined_text)

In [12]:
# Try process() on a sample paragraph that repeats inflected forms
# (studied/studying, teaches/taught) so the effect of lemmatization is easy to see.

text = """The eager student studied programming diligently. Students in the advanced programming class were studying various programming languages, while other students focused on data analysis. The dedicated professor taught programming concepts enthusiastically. She teaches both basic and advanced topics, making learning enjoyable for all students. The university library provided excellent study resources, and many students studied there daily. Programming assignments challenged students greatly, yet dedicated studying helped them succeed. The professor's teaching style made complex concepts simpler. Some students preferred morning study sessions, while others studied better at night. The university's programming department organized extra study groups, where advanced students helped beginners learn programming basics. Teaching assistants provided additional support, teaching small groups effectively. The library's quiet environment enhanced studying efficiency. Students who studied consistently showed better results than students who only studied occasionally. The programming projects required dedicated effort, but the professor's teaching approach made learning manageable. Several students achieved excellent results through diligent studying and programming practice. The university's teaching methods proved successful as students mastered programming concepts progressively. Daily studying and programming exercises strengthened student understanding significantly."""
txt = process(text)

In [13]:
# Show the pre-processed version of the sample paragraph.
print(txt)

eager student study programming diligently student advanced programming class study various programming language student focus data analysis dedicated professor teach programming concept enthusiastically teach basic advanced topic make learn enjoyable student university library provide excellent study resource many student study daily programming assignment challenge student greatly yet dedicate studying help succeed professor 's teaching style make complex concept simple student prefer morning study session study well night university 's programming department organize extra study group advanced student help beginner learn program basic teach assistant provide additional support teach small group effectively library 's quiet environment enhance study efficiency student study consistently show well result student study occasionally programming project require dedicated effort professor 's teaching approach make learn manageable several student achieve excellent result diligent study pr

In [14]:
# Encode the sentiment labels as integers: positive -> 1, negative -> 0.
# Values that are not keys of the mapping (for example labels already encoded
# by an earlier run of this cell) pass through unchanged, so re-running the
# cell no longer turns the whole column into NaN as a plain dict .map() would.

sentiment_codes = {
    'positive' : 1,
    'negative' : 0
}
df['sentiment'] = df['sentiment'].map(lambda label: sentiment_codes.get(label, label))
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [15]:
# Class balance check: number of rows per encoded sentiment label.
df['sentiment'].value_counts()

sentiment
1    25000
0    25000
Name: count, dtype: int64

In [20]:
# applying the pre-processing techniques 

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# Run process() over every review and keep the cleaned text in a new
# "review_pre" column, which is what the vectorizers are trained on.

df['review_pre'] = df['review'].map(process)

In [21]:
# Display the frame to inspect the new "review_pre" column next to the raw reviews.
df

Unnamed: 0,review,sentiment,review_pre
0,One of the other reviewers has mentioned that ...,1,one reviewer mention watch 1 Oz episode hook r...
1,A wonderful little production. <br /><br />The...,1,wonderful little production < br /><br />the f...
2,I thought this was a wonderful way to spend ti...,1,I think wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,0,basically family little boy Jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,Petter Mattei 's love Time money visually stun...
...,...,...,...
49995,I thought this movie did a down right good job...,1,I think movie right good job creative original...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0,bad plot bad dialogue bad acting idiotic direc...
49997,I am a Catholic taught in parochial elementary...,0,I Catholic teach parochial elementary school n...
49998,I'm going to have to disagree with the previou...,0,I go disagree previous comment side Maltin one...


In [32]:
# Remove the raw "review" column; the cleaned text lives in "review_pre".
# errors='ignore' keeps the cell re-runnable: when the column is already gone,
# a second run is a no-op instead of raising KeyError.

df = df.drop(columns='review', errors='ignore')

In [64]:
# performing train test split (20% of the rows held out for testing, fixed random_state)

# train_test_split is not imported by any other visible cell; without this
# import the cell raises NameError on a fresh kernel.
from sklearn.model_selection import train_test_split

X_train , X_test ,Y_train,Y_test = train_test_split(df.review_pre,df.sentiment, test_size=0.2,  random_state=42)
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(40000,)
(40000,)
(10000,)
(10000,)


In [67]:
# Bag-of-words model: unigram + bigram counts feeding a logistic regression

clf = Pipeline(steps=[
    ('vectorize_bow', CountVectorizer(ngram_range=(1, 2))),
    # Logistic regression with the iteration cap set to 1000
    ('log_reg', LogisticRegression(max_iter=1000)),
])
# fit() returns the fitted pipeline itself, so the prediction can be chained
y_pred = clf.fit(X_train, Y_train).predict(X_test)

In [68]:
# Fraction of held-out reviews the bag-of-words pipeline labels correctly
accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9003


In [76]:
# pipeline for TF-IDF 
from sklearn.feature_extraction.text import TfidfVectorizer

clf_tfidf = Pipeline(steps=[
    # Turn the text into TF-IDF features (default vectorizer settings)
    ('tfidf', TfidfVectorizer()),
    # Logistic regression with the iteration cap raised to 1500
    ('classifier', LogisticRegression(max_iter=1500)),
])

In [77]:
# Train the TF-IDF pipeline and predict the held-out reviews in one chain
# (fit() returns the fitted pipeline itself)
y_pred_tfidf = clf_tfidf.fit(X_train, Y_train).predict(X_test)

In [78]:
# Fraction of held-out reviews the TF-IDF pipeline labels correctly
accuracy_tfidf = accuracy_score(y_true=Y_test, y_pred=y_pred_tfidf)
print("Accuracy:", accuracy_tfidf)

Accuracy: 0.8918


In [98]:
# predicting single long review 

def predict_sentiment(text, pipeline):
    """
    Classify one piece of text as positive or negative.

    The text is cleaned with ``process`` and handed to the pipeline as a
    one-element batch.

    Parameters:
    text (str): Input text for sentiment prediction
    pipeline: Trained sklearn pipeline with vectorizer and classifier

    Returns:
    str: 'positive' when the predicted label equals 1, otherwise 'negative'
    """
    # The pipeline expects a collection of documents, so wrap the single text
    batch = [process(text)]
    label = pipeline.predict(batch)[0]

    if label == 1:
        return "positive"
    return "negative"

In [106]:
# For single text (mixed positive and negative but generally negative)
text = "I really wanted to love this movie, especially considering the stellar cast and the massive $200 million budget that went into its production. The trailers promised an epic sci-fi adventure with groundbreaking special effects and an emotional storyline. Unfortunately, what we got was a disappointing mess of half-baked ideas and wasted potential. While the visual effects were admittedly impressive in some scenes, particularly during the space battle sequences, they couldn't make up for the incredibly confusing plot and poor character development. The first hour dragged painfully slow, with unnecessary subplots that went nowhere and dialogue that felt forced and unnatural. The main character's motivation made absolutely no sense, and the supposedly emotional scenes fell completely flat. Even the talented lead actors seemed lost in their roles, probably due to the inconsistent script and chaotic direction. The ending was particularly frustrating, leaving multiple plot threads unresolved and setting up an obvious sequel that I honestly have no interest in seeing. The soundtrack was generic and forgettable, unlike the composer's previous work. Sure, there were a few decent moments scattered throughout, and some of the action sequences were entertaining, but these brief highlights only served to remind me of what this film could have been. After three hours in the theater, I left feeling exhausted and disappointed. Save your money and wait for it to show up on streaming services, if you watch it at all. This is definitely one of the biggest letdowns of the year."
# Classify the review with the bag-of-words pipeline `clf`.
sentiment = predict_sentiment(text, clf)
# Echo the full review before the predicted label.
print(f"Text: {text}")

# Visual separator between the echoed review and the prediction.
print("----------------------------------------------------------------------------------------------------------------")

print(f"Sentiment: {sentiment}")

Text: I really wanted to love this movie, especially considering the stellar cast and the massive $200 million budget that went into its production. The trailers promised an epic sci-fi adventure with groundbreaking special effects and an emotional storyline. Unfortunately, what we got was a disappointing mess of half-baked ideas and wasted potential. While the visual effects were admittedly impressive in some scenes, particularly during the space battle sequences, they couldn't make up for the incredibly confusing plot and poor character development. The first hour dragged painfully slow, with unnecessary subplots that went nowhere and dialogue that felt forced and unnatural. The main character's motivation made absolutely no sense, and the supposedly emotional scenes fell completely flat. Even the talented lead actors seemed lost in their roles, probably due to the inconsistent script and chaotic direction. The ending was particularly frustrating, leaving multiple plot threads unreso

In [94]:
# predicting for several reviews

def predict_multiple_sentiments(texts, vectorizer, clf):
    """
    Predict sentiments for multiple texts.

    Each text is cleaned with ``process`` and the cleaned strings are passed
    straight to ``clf.predict``.

    Parameters:
    texts (list): List of raw texts to predict
    vectorizer: Unused. The cleaned strings go directly to ``clf``, which must
        therefore do its own vectorization (e.g. a Pipeline that starts with a
        vectorizer). The parameter is kept only so existing three-argument
        calls keep working.
    clf: Trained estimator whose ``predict`` accepts a list of pre-processed strings

    Returns:
    list: List of predictions ('positive' or 'negative'), one per input text;
        an empty list when ``texts`` is empty
    """
    # Process all texts
    processed_texts = [process(text) for text in texts]

    # Nothing to classify: return early instead of handing an empty batch to the estimator
    if not processed_texts:
        return []

    # Make predictions
    predictions = clf.predict(processed_texts)

    return ['positive' if pred == 1 else 'negative' for pred in predictions]

In [108]:
# For multiple texts
texts = [
    "This movie was amazing!",
    "I really hated this show so much",
    "The product works as expected"
]
# predict_multiple_sentiments never reads its vectorizer argument (the pipeline
# vectorizes internally), so None is passed. The name `vecorizer` used here
# before is not defined in any visible cell and raised NameError on a fresh kernel.
results = predict_multiple_sentiments(texts, None, clf_tfidf)
for text, sentiment in zip(texts, results):
    print(f"Text: {text}")
    print(f"Sentiment: {sentiment}")
    print('------------------------------')

Text: This movie was amazing!
Sentiment: positive
------------------------------
Text: I really hated this show so much
Sentiment: negative
------------------------------
Text: The product works as expected
Sentiment: positive
------------------------------
