<a href="https://colab.research.google.com/github/SobiaNoorAI/Data-Seekho-Data-Science-Mastery-Program/blob/main/Module%205%3A%20NLP/Sentiment_Analysis_by_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis

# 1) Text Cleaning & Preprocessing

## Remove Unnecessary Text


*   Non-Alphabet Characters
*   URLs
*   Extra Spaces



In [19]:
# import re module for regular Expression
import re

def clean_text(text):
    # Remove non-alphabet characters (except spaces)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # convert text to lowercase
    text = text.lower()

    return text

text = ("Welcome to Wallerobot 2025."," Visit us  https://www.wallerobot.com and http://sobianoor.net .","  This is an   NLP  book. ","We sold 550 Book this year. ", "We are not listed in top 100 tech giant.","We do not have much followers.","We use special characters like !@#$%^&*()_+=-`~[]\{}|;':\",./<>?","we journey from Wasteland to Wonderland.", "We are exploring the fields of AI, Robotics, and Data Science."," Join our world to tour the Future of Tech.")
text = ' '.join(text) # Join the elements of the tuple with a space

cleaned_text = clean_text(text)
cleaned_text


'welcome to wallerobot visit us and this is an nlp book we sold book this year we are not listed in top tech giant we do not have much followers we use special characters like we journey from wasteland to wonderland we are exploring the fields of ai robotics and data science join our world to tour the future of tech'

## Tokenization

splits the text into smaller units called tokens

In [20]:
# import NLP Toolkit
import nltk
# model
nltk.download('punkt_tab')
# import word tokenizer
from nltk.tokenize import word_tokenize

# apply tokenizer on text
tokens = word_tokenize(cleaned_text)
print(tokens)

['welcome', 'to', 'wallerobot', 'visit', 'us', 'and', 'this', 'is', 'an', 'nlp', 'book', 'we', 'sold', 'book', 'this', 'year', 'we', 'are', 'not', 'listed', 'in', 'top', 'tech', 'giant', 'we', 'do', 'not', 'have', 'much', 'followers', 'we', 'use', 'special', 'characters', 'like', 'we', 'journey', 'from', 'wasteland', 'to', 'wonderland', 'we', 'are', 'exploring', 'the', 'fields', 'of', 'ai', 'robotics', 'and', 'data', 'science', 'join', 'our', 'world', 'to', 'tour', 'the', 'future', 'of', 'tech']


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# 2) Convert Text into Features (BoW & TF-IDF)

Machine Learning Models work only on numeric data

## Bag of Words (BoW)

A text modeling technique that counts the frequency of each word in a document



*   It follow no order of words like things in a bag



In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer object
vectorizer = CountVectorizer()

# Fit and transform the cleaned text
X = vectorizer.fit_transform([cleaned_text])

# BoW representation as a dense array
print(X.toarray())

# Feature names (words)
print(vectorizer.get_feature_names_out())

[[1 1 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 2 2 2 3 1 1
  1 1 1 1 1 6 1 1 1 1]]
['ai' 'an' 'and' 'are' 'book' 'characters' 'data' 'do' 'exploring'
 'fields' 'followers' 'from' 'future' 'giant' 'have' 'in' 'is' 'join'
 'journey' 'like' 'listed' 'much' 'nlp' 'not' 'of' 'our' 'robotics'
 'science' 'sold' 'special' 'tech' 'the' 'this' 'to' 'top' 'tour' 'us'
 'use' 'visit' 'wallerobot' 'wasteland' 'we' 'welcome' 'wonderland'
 'world' 'year']


## TF-IDF (Term Frequency – Inverse Document Frequency)

A text modeling technique measures the importance of each word in a document


*   TF-IDF gives more weight to unique words

In [5]:

from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TfidfVectorizer object
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the cleaned text
tfidf_matrix = tfidf_vectorizer.fit_transform([cleaned_text])

# TF-IDF representation as a dense array
print(tfidf_matrix.toarray())

# Feature names (words)
print(tfidf_vectorizer.get_feature_names_out())

[[0.09407209 0.09407209 0.18814417 0.18814417 0.18814417 0.09407209
  0.09407209 0.09407209 0.09407209 0.09407209 0.09407209 0.09407209
  0.09407209 0.09407209 0.09407209 0.09407209 0.09407209 0.09407209
  0.09407209 0.09407209 0.09407209 0.09407209 0.09407209 0.18814417
  0.18814417 0.09407209 0.09407209 0.09407209 0.09407209 0.09407209
  0.18814417 0.18814417 0.18814417 0.28221626 0.09407209 0.09407209
  0.09407209 0.09407209 0.09407209 0.09407209 0.09407209 0.56443252
  0.09407209 0.09407209 0.09407209 0.09407209]]
['ai' 'an' 'and' 'are' 'book' 'characters' 'data' 'do' 'exploring'
 'fields' 'followers' 'from' 'future' 'giant' 'have' 'in' 'is' 'join'
 'journey' 'like' 'listed' 'much' 'nlp' 'not' 'of' 'our' 'robotics'
 'science' 'sold' 'special' 'tech' 'the' 'this' 'to' 'top' 'tour' 'us'
 'use' 'visit' 'wallerobot' 'wasteland' 'we' 'welcome' 'wonderland'
 'world' 'year']


# 3) Train and Test a Model for Sentiment Analysis

## Split Dataset
Split the data into two parts:
* 70% for training
* 30% for testing

In [7]:

from sklearn.model_selection import train_test_split
# X from BoW is input, labels is Output
# 1 = Positive, 0 = Negative Sentence
labels = [1, 1, 0, 0,0,0,1,1,1,1]

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

X_test.shape
X_train.shape

ValueError: Found input variables with inconsistent numbers of samples: [1, 10]

In [12]:
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

text = ("Welcome to Wallerobot 2025."," Visit us  https://www.wallerobot.com and http://sobianoor.net .","  This is an   NLP  book. ","We sold 550 Book this year. ", "We are not listed in top 100 tech giant.","We do not have much followers.","We use special characters like !@#$%^&*()_+=-`~[]\{}|;':\",./<>?","we journey from Wasteland to Wonderland.", "We are exploring the fields of AI, Robotics, and Data Science."," Join our world to tour the Future of Tech.")

# Split the text into sentences
sentences = sent_tokenize(' '.join(text))  # Join the elements of the tuple with a space

# Clean each sentence
cleaned_sentences = [clean_text(sentence) for sentence in sentences]

# Apply CountVectorizer to the cleaned sentences
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(cleaned_sentences)  # Now X will have 10 rows, one for each sentence

# Rest of the code remains the same
from sklearn.model_selection import train_test_split
# 1 = Positive, 0 = Negative Sentence
# Adjust the number of labels to match the number of sentences - X has shape (10, n_features) and labels has 10 elements
labels = [1, 1, 0, 0, 0, 0, 1, 1, 1, 1,0]  # Adjusted the number of labels to 10 to match the number of sentences

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

X_test.shape
X_train.shape

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


(8, 46)

## Train Model

In [16]:
from sklearn.linear_model import LogisticRegression # Import the LogisticRegression class

# Train Model
model = LogisticRegression()
model.fit(X_train, y_train)


## Evaluate Model

In [17]:
# Predict
lr_predictions = model.predict(X_test)
print(lr_predictions)

[1 1 1]


In [18]:
from sklearn.metrics import accuracy_score, classification_report

# Accuracy
accuracy = accuracy_score(y_test, lr_predictions)
from sklearn.metrics import accuracy_score, classification_report

# Accuracy
accuracy = accuracy_score(y_test, lr_predictions)
print("Accuracy:", accuracy)
# Classification report
print("Classification Report:\n", classification_report(y_test, lr_predictions))

Accuracy: 0.6666666666666666
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.67      1.00      0.80         2

    accuracy                           0.67         3
   macro avg       0.33      0.50      0.40         3
weighted avg       0.44      0.67      0.53         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
