In [1]:
pip install scikit-learn nltk



In [27]:
import nltk                                             #Natural language tooolkit for NLP tasks/Preprocessing text data
nltk.download('punkt')
import string                                           # import string module for string operations
import re                                               # Import regular expressions module

# Import stopwords corpus from nltk to remove stopwords (the,is,are etc) during preprocessing
from nltk.corpus import stopwords

# Import TfidfVectorizer class from the sklearn.feature_extraction.text module
# TfidfVectorizer is a utility class in SciKit-learn(sklearn) module for converting text into a matrix of TF-IDF features
# TfidfVectorizer converts each doc into feature vector cz ML understand no. not text.
# TfidfVectorizer is commonly used as a preprocessing step in machine learning pipelines
from sklearn.feature_extraction.text import TfidfVectorizer

# for splitting the dataset into train and test sets to evaluate performance
from sklearn.model_selection import train_test_split

# Pipeline chains multiple processing steps (like data preprocessing, feature extraction, and model training)together into a single object
from sklearn.pipeline import Pipeline

# Import Special vector classification class from sklearn.svm
# Supervised learning algorithm used for classification tasks.
from sklearn.svm import SVC

# import the RandomForestClassifier class from the sklearn.ensemble module
# Random Forest is an ensemble learning method that constructs multiple decision trees during training
# Its output is the mode of the classes (classification) or the mean prediction (regression) of individual trees.
from sklearn.ensemble import RandomForestClassifier

# To calculate the accuarcy and print a report
from sklearn.metrics import accuracy_score, classification_report

# import movie_reviews corpus from nltk
from nltk.corpus import movie_reviews

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [28]:
#Loading the dataset movie_reviews
nltk.download ('movie_reviews')

# Nested loop. Outer loop iterating over (file id, and category) using the movie_reviews.categories() function.
# This function returns a list of categories, which in this case are 'pos' (positive) and 'neg' (negative).

#Inner loop that iterates over each file id associated with the current category movie_reviews.fileids(category) function
# returns a list of file IDs for a given category.
documents = [(movie_reviews.raw(fileid), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]

# Seperate the reviews and labels

# Creates a list [array] X containing only the raw text of the movie reviews from the documents list
X = [review for review, _ in documents]

# Creates a list Y containing only the categories (labels) of the movie reviews from the documents list
Y = [category for _, category in documents]


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [29]:
nltk.download('wordnet')       #Used for lemmatization
#Downloads the stopwords corpus to be used in preprocessing
nltk.download('stopwords')

# Text preprocessing function
def preprocess_text(text):
    # Tokenization: Split the text into words or tokens
    # This is an assignemtn statement. Tokens is a variable.
    # This function call will invoke tokenization method word_tokenize() in the nltk
    tokens = word_tokenize(text)

    # Lowercasing: Convert all tokens to lowercase
    # This is an assignemtn statement. Tokens is again a variable
    # This is a list [] comprehension statement, which iterates over each element of token in tokens list
    # It calls the lower() method on each token
    tokens = [token.lower() for token in tokens]

    # Remove punctuation: Remove punctuation tokens
    # Assignment statement which iterates over each token in tokens list and includes it in the new tokens lists if they are not punctuation marks
    tokens = [token for token in tokens if token not in string.punctuation]

    # Remove stopwords: Remove common stopwords

    # Calls the words() function in the stopwords module in nltk.
    # this function returns a list of english keywords
    stopwords_list = set(stopwords.words('english'))

    # Assignment st. iterating over each token in tokens list and includes the tokens in new list only if they are not stopwords
    tokens = [token for token in tokens if token not in stopwords_list]

    # Lemmatization: Convert tokens to their base form
    # wordnetlemmatizer is a class. We are creating an object or instance 'lemmatizer' for the class
    lemmatizer = WordNetLemmatizer()
    # Assignment st that iterates on each token in tokens list.
    # it calls lemmatizer() function for each token
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join tokens back into text
    # Join() is a function. We are joining each token into a single string seperated by a single space ' '
    preprocessed_text = ' '.join(tokens)

    # returns the single string just created
    return preprocessed_text

# Preprocess the text data
# this is list comprehension, a concise way to apply an operation to each element in the list.
# it is iterating over each text in list X asking to implement each preprocessing step defined above
X = [preprocess_text(text) for text in X]

# Split the data into train and test set
#Train_test_split() is a function. We have passed X, Y lists into it. Test dateset size is 20%.
# We are then assigning the 4 split datasets to 4 variables.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Define the pipeline
# initializes Pipeline() object, which applies one step sequencially and feeds the output as an input to the next step
# tfidf is the first step. It consists of TfidfVectorizer() function which converts text data into matrix of tf-idf features.
# Text Feature extraction.
# clf is the next step. it calls special vector classifier SVC() fucntion with linear kernel
# The classifier component (clf) of the pipeline is responsible for learning patterns from the input features (TF-IDF vectors)
# and making predictions or decisions based on those patterns.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000)),
    ('clf', SVC(kernel='linear'))
])

# Train the model
# Calling fit() method in the pipeline object. it applies each step in the pipeline to the passed variables
pipeline.fit(X_train, Y_train)

# Make predictions
# Calling predict() method in pipeline object. It applies trained pipeline to X_test and generate patterns based on learned patterns.
Y_pred = pipeline.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(Y_test, Y_pred)
print ("Accuracy=", accuracy)

# Display classification report
print ("Classification report:")
print (classification_report(Y_test, Y_pred))




[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy= 0.8375
Classification report:
              precision    recall  f1-score   support

         neg       0.85      0.82      0.83       199
         pos       0.83      0.86      0.84       201

    accuracy                           0.84       400
   macro avg       0.84      0.84      0.84       400
weighted avg       0.84      0.84      0.84       400



In [15]:
from collections import Counter

# Count the number of positive and negative reviews
review_counts = Counter(Y)
print("Review Distribution:", review_counts)

Review Distribution: Counter({'neg': 1000, 'pos': 1000})


In [30]:
import nltk
import string
import re
from nltk.corpus import stopwords, movie_reviews
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Download the movie_reviews dataset
nltk.download('movie_reviews')

# Load the movie reviews
documents = [(movie_reviews.raw(fileid), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Separate the reviews and labels
X = [review for review, _ in documents]
y = [category for _, category in documents]

# Text preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove stopwords
    stopwords_list = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stopwords_list)
    # Remove any additional whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Preprocess the text data
X = [preprocess_text(text) for text in X]

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000)),
    ('clf', SVC())
])

# Define the hyperparameters grid for grid search
param_grid = {
    'clf__C': [0.1, 1, 10],
    'clf__kernel': ['linear', 'rbf']
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters found by grid search
print("Best Parameters:", grid_search.best_params_)

# Make predictions on the test set using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Display classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Best Parameters: {'clf__C': 1, 'clf__kernel': 'linear'}
Accuracy: 0.84
Classification Report:
              precision    recall  f1-score   support

         neg       0.85      0.83      0.84       199
         pos       0.83      0.85      0.84       201

    accuracy                           0.84       400
   macro avg       0.84      0.84      0.84       400
weighted avg       0.84      0.84      0.84       400

