#### **Project Title:**  ***"Text Sentiment Analysis"***
##### **Dataset:** *" IMDB Reviews Dataset"*
##### **Step 01:** Loading Important Libraries

In [None]:
#For data manipulation
import pandas as pd
#For natural language processing tasks.
import nltk
#Tokenizer for splitting text into words.
from nltk.tokenize import word_tokenize
#List of common words to remove (e.g., "the", "and").  
from nltk.corpus import stopwords  
#Lemmatizer to normalize words (e.g., "running" -> "run").
from nltk.stem import WordNetLemmatizer  
#For removing punctuation.
import string  
#To convert text data into numerical features using TF-IDF weighting.
from sklearn.feature_extraction.text import TfidfVectorizer
#To split the dataset into training and testing sets.
from sklearn.model_selection import train_test_split
#To convert text data into numerical features using TF-IDF.
from sklearn.feature_extraction.text import TfidfVectorizer
#To build a logistic regression classifier for sentiment analysis.
from sklearn.linear_model import LogisticRegression
#To build a Naive Bayes classifier for text classification.
from sklearn.naive_bayes import MultinomialNB
#To evaluate the model's performance (precision, recall, F1-score).
from sklearn.metrics import classification_report

##### **Step 02:** Loading Dataset

In [4]:
#Loading the IMDB Reviews dataset.
df = pd.read_csv("IMDB Reviews.csv")
#Displaying the first few rows of IMDB Reviews dataset.
print(df.head())

   Unnamed: 0       Movie                                        Review Text  \
0           0  Ex Machina  Intelligent Movie.\nThis movie is obviously al...   
1           1  Ex Machina  Extraordinary and thought-provoking.\n'Ex mach...   
2           2  Ex Machina  Poor story, only reasonable otherwise.\nIf I h...   
3           3  Ex Machina  Had Great Potential.\nThis movie is one of the...   
4           4    Eternals  Amazing visuals and philosophical concepts!\n\...   

   IMDb Rating  
0            9  
1           10  
2            3  
3            1  
4           10  


##### **Step 03:** Text Preprocessing

In [None]:
#Downloading necessary NLTK resources.
#Word tokenizer dataset.
nltk.download("punkt")  
#Stopwords dataset.
nltk.download("stopwords")  
#WordNet dataset for lemmatization.
nltk.download("wordnet")  
#For advanced tokenization.
nltk.download('punkt_tab')
#Initializing lemmatizer and stopwords list.
lemmatizer = WordNetLemmatizer()
#Converting stopwords to a set for faster lookup.
stop_words = set(stopwords.words("english"))  
def preprocess_text(text):
    #Converting text to lowercase and tokenize into words.
    tokens = word_tokenize(text.lower())
    #Removing stopwords and punctuation.
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]
    #Performing lemmatization (normalize words to their base form).
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    #Returning processed text as a single string.
    return " ".join(tokens)
#Applying preprocessing function to the "Review Text" column.
df["Processed Review"] = df["Review Text"].apply(preprocess_text)
#Displaying first few rows of processed reviews.
print(df[["Movie", "Processed Review", "IMDb Rating"]].head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wajeeha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wajeeha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\wajeeha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\wajeeha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


        Movie                                   Processed Review  IMDb Rating
0  Ex Machina  intelligent movie movie obviously allegorical ...            9
1  Ex Machina  extraordinary thought-provoking 'ex machina su...           10
2  Ex Machina  poor story reasonable otherwise realised alex ...            3
3  Ex Machina  great potential movie one countless come holly...            1
4    Eternals  amazing visuals philosophical concept eternals...           10


##### **Step 04:** Text Preprocessing

In [None]:
#Initializing the TF-IDF Vectorizer.
#Adjusting max_features if needed to control the number of features.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
#Fit the vectorizer on the processed reviews and transform the text data into a TF-IDF matrix.
X = tfidf_vectorizer.fit_transform(df["Processed Review"])
#Printing the shape of the resulting TF-IDF matrix.
print("TF-IDF matrix shape:", X.shape)

TF-IDF matrix shape: (6, 572)


##### **Step 05:** Model Training

In [None]:
#Creating binary sentiment labels:
#Here, reviews with an IMDb Rating >= 5 are labeled as positive (1) and < 5 as negative (0).
df['Sentiment'] = df['IMDb Rating'].apply(lambda x: 1 if x >= 5 else 0)
#Initializing the TF-IDF Vectorizer (using the already preprocessed text).
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(df["Processed Review"])
y = df['Sentiment']
#Splitting the data into training and testing sets (e.g., 67% train, 33% test).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
#Logistic Regression
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_logreg))
# Option 2: Naive Bayes (MultinomialNB)
nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_nb))

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       2.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0

Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       2.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


##### **Step 06:** Model Evaluation

In [None]:
#Evaluating Logistic Regression model.
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_logreg))
#Evaluating Naive Bayes model.
print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_nb))

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       2.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0

Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       2.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### **Conclusion**
In this project, we built a **sentiment analysis pipeline** using the *IMDB Reviews* dataset. The process involved several key stages:
- **Text Preprocessing:**  
  - **Tokenization:** Splitting text into individual words  
  - **Stopword Removal:** Filtering out common, non-informative words  
  - **Lemmatization:** Converting words to their base form
- **Feature Engineering:**  
  The preprocessed text was transformed into numerical features using **TF-IDF**, which highlights the importance of words within the corpus.
- **Model Training:**  
  Both **Logistic Regression** and **Naive Bayes** classifiers were trained to predict sentiment from the numerical features.
- **Model Evaluation:**  
  The models were evaluated using metrics like precision, recall, and F1-score. Although the initial performance was low—likely due to limited data and class imbalances—it provided valuable insights into areas for improvement.

This project lays a solid foundation for further experimentation with advanced techniques and model optimization. Future work may include expanding the dataset, addressing class imbalances and exploring alternative feature extraction methods.
***