In [None]:
# Terminal commands

# pip install nltk 
# pip install spacy
# python -m spacy download en_core_web_sm

In [1]:
# Importing all the necessary libraries for simple pre-processing text

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
import spacy

In [2]:
# Download required NLTK data files
# Both the classic resource ids and the ids looked up by newer NLTK releases
# (`punkt_tab`, `averaged_perceptron_tagger_eng`) are requested, so that
# word_tokenize / nltk.pos_tag below work regardless of the installed version.
NLTK_RESOURCES = (
    'punkt',                           # tokenizer models (word_tokenize)
    'punkt_tab',                       # tokenizer tables used by newer NLTK releases
    'stopwords',                       # stopwords.words('english')
    'wordnet',                         # WordNetLemmatizer
    'averaged_perceptron_tagger',      # nltk.pos_tag
    'averaged_perceptron_tagger_eng',  # tagger id used by newer NLTK releases
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

[nltk_data] Downloading package punkt to /home/viddesh/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/viddesh/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/viddesh/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/viddesh/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [3]:
# Load Spacy model
# `en_core_web_sm` is the pipeline fetched by the `python -m spacy download`
# terminal command listed at the top of the notebook; `nlp` is reused further
# down for the spaCy-based lemmatization.
nlp = spacy.load('en_core_web_sm')

In [4]:
# Sample sentence that every preprocessing step below operates on
text = (
    "This is a sample text for demonstrating "
    "various text preprocessing techniques."
)

In [5]:
# Tokenization
# Split the sample sentence into word and punctuation tokens with NLTK;
# `tokens` is the input of every NLTK preprocessing step that follows.
tokens = word_tokenize(text)
print("Tokens:", tokens)

Tokens: ['This', 'is', 'a', 'sample', 'text', 'for', 'demonstrating', 'various', 'text', 'preprocessing', 'techniques', '.']


In [6]:
# Lowercase
# Case-fold every token down to lower case (punctuation tokens are unaffected)
lowercase_tokens = list(map(str.lower, tokens))
print("Lowercase Tokens:", lowercase_tokens)

Lowercase Tokens: ['this', 'is', 'a', 'sample', 'text', 'for', 'demonstrating', 'various', 'text', 'preprocessing', 'techniques', '.']


In [7]:
# Uppercase
# Convert every token to upper case (punctuation tokens are unaffected)
uppercase_tokens = list(map(str.upper, tokens))
print("Uppercase Tokens:", uppercase_tokens)

Uppercase Tokens: ['THIS', 'IS', 'A', 'SAMPLE', 'TEXT', 'FOR', 'DEMONSTRATING', 'VARIOUS', 'TEXT', 'PREPROCESSING', 'TECHNIQUES', '.']


In [8]:
# Remove Stop Words
# English stop-word list as a set for O(1) membership checks
stop_words = set(stopwords.words('english'))
# Keep a token only if its lower-cased form is not a stop word
filtered_tokens = list(
    filter(lambda word: word.lower() not in stop_words, tokens)
)
print("Filtered Tokens (Stop Words Removed):", filtered_tokens)


Filtered Tokens (Stop Words Removed): ['sample', 'text', 'demonstrating', 'various', 'text', 'preprocessing', 'techniques', '.']


In [9]:
# Remove Noise (non-alphabetic characters)
# Strip everything that is not an ASCII letter from each token, then drop the
# tokens that end up empty (e.g. pure punctuation such as '.').
stripped_tokens = (re.sub(r'[^a-zA-Z]', '', token) for token in tokens)
noise_removed_tokens = list(filter(None, stripped_tokens))
print("Noise Removed Tokens:", noise_removed_tokens)

Noise Removed Tokens: ['This', 'is', 'a', 'sample', 'text', 'for', 'demonstrating', 'various', 'text', 'preprocessing', 'techniques']


In [10]:
# Parts of Speech Tagging
# nltk.pos_tag returns one (token, tag) pair per input token; the tags that
# appear in the output below ('DT', 'NN', 'VBG', ...) are part-of-speech codes.
pos_tags = nltk.pos_tag(tokens)
print("POS Tags:", pos_tags)

POS Tags: [('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('sample', 'JJ'), ('text', 'NN'), ('for', 'IN'), ('demonstrating', 'VBG'), ('various', 'JJ'), ('text', 'JJ'), ('preprocessing', 'NN'), ('techniques', 'NNS'), ('.', '.')]


In [11]:
# Stemming
# Reduce each token to its Porter stem (stems need not be dictionary words,
# e.g. 'sample' -> 'sampl')
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, tokens))
print("Stemmed Tokens:", stemmed_tokens)

Stemmed Tokens: ['thi', 'is', 'a', 'sampl', 'text', 'for', 'demonstr', 'variou', 'text', 'preprocess', 'techniqu', '.']


In [12]:
# Lemmatization
# WordNet lemmatizer called without a part-of-speech argument, so verb forms
# such as 'is' / 'demonstrating' are left unchanged in the output below.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, tokens))
print("Lemmatized Tokens (NLTK):", lemmatized_tokens)

Lemmatized Tokens (NLTK): ['This', 'is', 'a', 'sample', 'text', 'for', 'demonstrating', 'various', 'text', 'preprocessing', 'technique', '.']


In [13]:
# Lemmatization using Spacy
# Run the full spaCy pipeline on the raw sentence and read each token's lemma
doc = nlp(text)
spacy_lemmatized_tokens = [
    word.lemma_
    for word in doc
]
print("Lemmatized Tokens (Spacy):", spacy_lemmatized_tokens)

Lemmatized Tokens (Spacy): ['this', 'be', 'a', 'sample', 'text', 'for', 'demonstrate', 'various', 'text', 'preprocesse', 'technique', '.']


In [14]:
# Terminal commands

# pip install numpy pandas scikit-learn

In [23]:
# Multinomial Naive Bayes classifier

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support

# Toy sentiment corpus: each review is kept next to its label
labelled_reviews = [
    ('I love this movie', 'positive'),
    ('This movie is great', 'positive'),
    ('I hate this movie', 'negative'),
    ('This movie is not good', 'negative'),
    ('I enjoyed the movie', 'positive'),
    ('The movie was terrible', 'negative'),
    ('What a fantastic film', 'positive'),
    ('Not a fan of this movie', 'negative'),
    ('It was an okay movie', 'neutral'),
    ('The film was superb', 'positive'),
    ('Loved the plot', 'positive'),
    ('Terrible acting', 'negative'),
    ('Great direction', 'positive'),
    ('The story was captivating', 'positive'),
    ('Worst movie ever', 'negative'),
    ('It was a wonderful experience', 'positive'),
    ('Not worth watching', 'negative'),
    ('Absolutely fantastic', 'positive'),
    ('Could have been better', 'neutral'),
    ('Mediocre film', 'neutral'),
]
data = {
    'text': [review for review, _label in labelled_reviews],
    'label': [_label for _review, _label in labelled_reviews],
}

# Tabular view of the corpus
df = pd.DataFrame(data)

# Features (raw review text) and target (sentiment label)
X, y = df['text'], df['label']

# One stratified 80/20 shuffle split: the class proportions of `y` are kept
# in both the training and the test portion
strat_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(strat_split.split(X, y))

X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_test, y_test = X.iloc[test_index], y.iloc[test_index]

# Bag-of-words counts -> TF-IDF weights -> Multinomial Naive Bayes
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Fit on the training split, then predict the held-out reviews
text_clf.fit(X_train, y_train)
y_pred = text_clf.predict(X_test)

# Headline metrics (weighted across classes; undefined ratios count as 0)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test,
    y_pred,
    average='weighted',
    zero_division=0,
)

print(f"Accuracy: {accuracy}")
print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))
print("F1 Score: {:.2f}".format(f1))

print()

# Per-class breakdown
report = classification_report(y_test, y_pred, zero_division=0)
print("Classification Report:")
print(report)

print()

# Classify a single unseen sentence
new_text = "I love this movie"
predicted_label_nb = text_clf.predict([new_text])
print(f"Naive Bayes Prediction for '{new_text}': {predicted_label_nb[0]}")

Accuracy: 0.75
Precision: 0.58
Recall: 0.75
F1 Score: 0.65

Classification Report:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         1
     neutral       0.00      0.00      0.00         1
    positive       0.67      1.00      0.80         2

    accuracy                           0.75         4
   macro avg       0.56      0.67      0.60         4
weighted avg       0.58      0.75      0.65         4


Naive Bayes Prediction for 'I love this movie': positive


In [25]:
# Decision Tree classifier

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support

# Sample data (increased size): 20 short movie reviews and their sentiment labels
data = {
    'text': [
        'I love this movie', 'This movie is great', 'I hate this movie',
        'This movie is not good', 'I enjoyed the movie', 'The movie was terrible',
        'What a fantastic film', 'Not a fan of this movie', 'It was an okay movie',
        'The film was superb', 'Loved the plot', 'Terrible acting', 'Great direction',
        'The story was captivating', 'Worst movie ever', 'It was a wonderful experience',
        'Not worth watching', 'Absolutely fantastic', 'Could have been better', 'Mediocre film'
    ],
    'label': [
        'positive', 'positive', 'negative', 'negative', 'positive', 'negative',
        'positive', 'negative', 'neutral', 'positive', 'positive', 'negative',
        'positive', 'positive', 'negative', 'positive', 'negative', 'positive',
        'neutral', 'neutral'
    ]
}

# Create DataFrame
df = pd.DataFrame(data)

# Features and Labels
X = df['text']
y = df['label']

# Stratified split: keeps the class proportions of `y` in both the training
# and the test portion (80/20, single split, fixed seed for reproducibility)
strat_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(strat_split.split(X, y))

X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Create a pipeline that combines vectorization and the Decision Tree classifier
text_clf = Pipeline([
    ('vect', CountVectorizer()),            # Convert text to token counts
    ('tfidf', TfidfTransformer()),          # Transform token counts to TF-IDF features
    ('clf', DecisionTreeClassifier(random_state=42)),  # Decision Tree classifier
])

# Train the model
text_clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = text_clf.predict(X_test)

# Evaluate the model (weighted averages; undefined ratios are reported as 0)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted', zero_division=0)

print(f"Accuracy: {accuracy}")
print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))
print("F1 Score: {:.2f}".format(f1))

print()

# Detailed Classification Report
report = classification_report(y_test, y_pred, zero_division=0)
print("Classification Report:")
print(report)

print()

# Predict on new text
# The result is stored under a Decision-Tree-specific name and labelled as
# such, so it is not mistaken for (and does not overwrite) the Naive Bayes
# prediction made earlier in the notebook.
new_text = "I hate this movie"
predicted_label_dt = text_clf.predict([new_text])
print(f"Decision Tree Prediction for '{new_text}': {predicted_label_dt[0]}")


Accuracy: 0.25
Precision: 0.25
Recall: 0.25
F1 Score: 0.25

Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         1
     neutral       0.00      0.00      0.00         1
    positive       0.50      0.50      0.50         2

    accuracy                           0.25         4
   macro avg       0.17      0.17      0.17         4
weighted avg       0.25      0.25      0.25         4


Naive Bayes Prediction for 'I hate this movie': negative


In [None]:
# Sentiment analysis using TextBlob and VADER (terminal commands)
# pip install textblob
# pip install vaderSentiment

In [32]:
from textblob import TextBlob

def get_textblob_sentiment(text):
    """Map the TextBlob polarity of ``text`` to a sentiment label.

    Returns 'positive' for a polarity above zero, 'negative' for a polarity
    below zero, and 'neutral' otherwise.
    """
    score = TextBlob(text).sentiment.polarity
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    return 'neutral'


# Sample text data
texts = [
    'I love this movie!',
    'This movie is terrible...',
    'I enjoyed the movie very much.',
    'Not a fan of this movie.',
    'The plot was fantastic and the acting superb!',
    'Worst movie ever.',
    'Absolutely fantastic!',
    'Could have been better.',
    'It was an okay movie.',
    'Loved the plot and the characters.'
]

# Perform sentiment analysis
# The loop variable is `review` rather than `text` so that the sample `text`
# defined for the preprocessing cells is not overwritten; the raw scores and
# the derived label also get distinct names instead of sharing `sentiment`.
for review in texts:
    blob_sentiment = TextBlob(review).sentiment
    print(f"Text: {review}\nPolarity: {blob_sentiment.polarity}, Subjectivity: {blob_sentiment.subjectivity}")
    sentiment_label = get_textblob_sentiment(review)
    print(f"Sentiment: {sentiment_label}\n")


Text: I love this movie!
Polarity: 0.625, Subjectivity: 0.6
Sentiment: positive

Text: This movie is terrible...
Polarity: -1.0, Subjectivity: 1.0
Sentiment: negative

Text: I enjoyed the movie very much.
Polarity: 0.38, Subjectivity: 0.48
Sentiment: positive

Text: Not a fan of this movie.
Polarity: 0.0, Subjectivity: 0.0
Sentiment: neutral

Text: The plot was fantastic and the acting superb!
Polarity: 0.4666666666666666, Subjectivity: 0.6333333333333333
Sentiment: positive

Text: Worst movie ever.
Polarity: -1.0, Subjectivity: 1.0
Sentiment: negative

Text: Absolutely fantastic!
Polarity: 0.5, Subjectivity: 0.9
Sentiment: positive

Text: Could have been better.
Polarity: 0.5, Subjectivity: 0.5
Sentiment: positive

Text: It was an okay movie.
Polarity: 0.5, Subjectivity: 0.5
Sentiment: positive

Text: Loved the plot and the characters.
Polarity: 0.7, Subjectivity: 0.8
Sentiment: positive



In [33]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def get_vader_sentiment(text, analyzer=None):
    """Label ``text`` as 'positive', 'negative' or 'neutral' using VADER.

    Args:
        text: The string to score.
        analyzer: Optional object exposing ``polarity_scores(text)`` that
            returns a mapping with a ``'compound'`` entry. When omitted, a
            single ``SentimentIntensityAnalyzer`` is created on first use and
            reused by later calls instead of being rebuilt for every text.

    Returns:
        'positive' if the compound score is >= 0.05, 'negative' if it is
        <= -0.05, otherwise 'neutral'.
    """
    if analyzer is None:
        # Lazily build one shared analyzer and cache it on the function object
        analyzer = getattr(get_vader_sentiment, '_shared_analyzer', None)
        if analyzer is None:
            analyzer = SentimentIntensityAnalyzer()
            get_vader_sentiment._shared_analyzer = analyzer

    compound = analyzer.polarity_scores(text)['compound']
    if compound >= 0.05:
        return 'positive'
    elif compound <= -0.05:
        return 'negative'
    else:
        return 'neutral'

# Initialize the VADER sentiment analyzer used to print the raw scores
analyzer = SentimentIntensityAnalyzer()

# Sample text data
texts = [
    'I love this movie!',
    'This movie is terrible...',
    'I enjoyed the movie very much.',
    'Not a fan of this movie.',
    'The plot was fantastic and the acting superb!',
    'Worst movie ever.',
    'Absolutely fantastic!',
    'Could have been better.',
    'It was an okay movie.',
    'Loved the plot and the characters.'
]

# Perform sentiment analysis
# The loop variable is `review` rather than `text` so that the sample `text`
# defined for the preprocessing cells is not overwritten by this loop.
for review in texts:
    scores = analyzer.polarity_scores(review)
    print(f"Text: {review}\nScores: {scores}")
    sentiment = get_vader_sentiment(review)
    print(f"Sentiment: {sentiment}\n")


Text: I love this movie!
Scores: {'neg': 0.0, 'neu': 0.4, 'pos': 0.6, 'compound': 0.6696}
Sentiment: positive

Text: This movie is terrible...
Scores: {'neg': 0.508, 'neu': 0.492, 'pos': 0.0, 'compound': -0.4767}
Sentiment: negative

Text: I enjoyed the movie very much.
Scores: {'neg': 0.0, 'neu': 0.602, 'pos': 0.398, 'compound': 0.5106}
Sentiment: positive

Text: Not a fan of this movie.
Scores: {'neg': 0.282, 'neu': 0.718, 'pos': 0.0, 'compound': -0.2411}
Sentiment: negative

Text: The plot was fantastic and the acting superb!
Scores: {'neg': 0.0, 'neu': 0.429, 'pos': 0.571, 'compound': 0.8398}
Sentiment: positive

Text: Worst movie ever.
Scores: {'neg': 0.672, 'neu': 0.328, 'pos': 0.0, 'compound': -0.6249}
Sentiment: negative

Text: Absolutely fantastic!
Scores: {'neg': 0.0, 'neu': 0.193, 'pos': 0.807, 'compound': 0.6352}
Sentiment: positive

Text: Could have been better.
Scores: {'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}
Sentiment: positive

Text: It was an okay m