# Task 7: Introduction to Natural Language (Text) Processing

## Section 1: Setup and Sample Dataset

### **Task 1**: Import Libraries and Sample Data
*Instruction*: Import the necessary libraries and define a sample dataset for sentiment classification.

In [1]:
import pandas as pd
import numpy as np
import nltk
import string

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

nltk.download('punkt')
nltk.download('stopwords')

# Sample data
data = {
    'text': [
        'I love this movie. It was fantastic!',
        'Terrible acting and horrible plot.',
        'An excellent film with great characters.',
        'Worst movie I have ever seen.',
        'Absolutely wonderful! A must-watch.',
        'It was okay, nothing special.',
        'Bad movie, waste of time.',
        'Pretty good, I liked it.',
        'Not great, but not terrible.',
        'Awful! Never again.'
    ],
    'label': [1, 0, 1, 0, 1, 1, 0, 1, 0, 0]  # 1 = positive, 0 = negative
}

df = pd.DataFrame(data)
df.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,text,label
0,I love this movie. It was fantastic!,1
1,Terrible acting and horrible plot.,0
2,An excellent film with great characters.,1
3,Worst movie I have ever seen.,0
4,Absolutely wonderful! A must-watch.,1


## Section 2: Text Preprocessing

### **Task 2**: Clean the Text

*Instruction*: Lowercase the text, remove punctuation, stopwords, and tokenize the sentences.


In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk

# Assuming you have the 'cleaned_sentences' from the previous step
# If not, you'll need to run the text cleaning code first.
# For demonstration, let's create a sample cleaned_sentences list:
cleaned_sentences = [['hello', 'sample', 'sentence', 'punctuation', 'common', 'words', 'remove'],
                     ['quite', 'interesting']]

# --- 1. Bag of Words (BoW) Vectorization ---
# First, we need to flatten the list of lists of words back into a list of strings (sentences)
# for the vectorizers.
flat_cleaned_sentences = [" ".join(sentence) for sentence in cleaned_sentences]

# Initialize the CountVectorizer
bow_vectorizer = CountVectorizer()

# Fit and transform the cleaned sentences
bow_features = bow_vectorizer.fit_transform(flat_cleaned_sentences)

# Get the feature names (words in the vocabulary)
bow_feature_names = bow_vectorizer.get_feature_names_out()

# --- Output for Bag of Words ---
print("--- Bag of Words (BoW) Vectorization ---")
print("\nVocabulary (Feature Names):")
print(bow_feature_names)

print("\nBoW Features (Sparse Matrix):")
print(bow_features)

print("\nBoW Features (Dense Array - for better readability, but be cautious with large datasets):")
print(bow_features.toarray())

# --- 2. TF-IDF Vectorization ---
# Initialize the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the cleaned sentences
tfidf_features = tfidf_vectorizer.fit_transform(flat_cleaned_sentences)

# Get the feature names (words in the vocabulary)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# --- Output for TF-IDF ---
print("\n\n--- TF-IDF Vectorization ---")
print("\nVocabulary (Feature Names):")
print(tfidf_feature_names)

print("\nTF-IDF Features (Sparse Matrix):")
print(tfidf_features)

print("\nTF-IDF Features (Dense Array - for better readability, but be cautious with large datasets):")
print(tfidf_features.toarray())

--- Bag of Words (BoW) Vectorization ---

Vocabulary (Feature Names):
['common' 'hello' 'interesting' 'punctuation' 'quite' 'remove' 'sample'
 'sentence' 'words']

BoW Features (Sparse Matrix):
<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 9 stored elements and shape (2, 9)>
  Coords	Values
  (0, 1)	1
  (0, 6)	1
  (0, 7)	1
  (0, 3)	1
  (0, 0)	1
  (0, 8)	1
  (0, 5)	1
  (1, 4)	1
  (1, 2)	1

BoW Features (Dense Array - for better readability, but be cautious with large datasets):
[[1 1 0 1 0 1 1 1 1]
 [0 0 1 0 1 0 0 0 0]]


--- TF-IDF Vectorization ---

Vocabulary (Feature Names):
['common' 'hello' 'interesting' 'punctuation' 'quite' 'remove' 'sample'
 'sentence' 'words']

TF-IDF Features (Sparse Matrix):
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9 stored elements and shape (2, 9)>
  Coords	Values
  (0, 1)	0.3779644730092272
  (0, 6)	0.3779644730092272
  (0, 7)	0.3779644730092272
  (0, 3)	0.3779644730092272
  (0, 0)	0.3779644730092272
  (0, 8)	0.377964473

## Section 3: Text Vectorization

### **Task 3**: Convert Text to Numerical Features

*Instruction*: Use both Bag of Words and TF-IDF vectorization to convert the cleaned text.


In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Assume you have cleaned sentences as a list of strings
# If your cleaned sentences are a list of lists of words,
# uncomment and run the following to convert them:
# cleaned_sentences_lists = [['hello', 'sample', 'sentence', 'punctuation', 'common', 'words', 'remove'],
#                            ['quite', 'interesting']]
# cleaned_sentences = [" ".join(sentence) for sentence in cleaned_sentences_lists]

# For demonstration, let's use some sample cleaned sentences as a list of strings
cleaned_sentences = [
    "hello sample sentence punctuation common words remove",
    "quite interesting"
]

# --- Bag of Words (BoW) Vectorization ---
# Initialize the CountVectorizer
bow_vectorizer = CountVectorizer()

# Fit and transform the cleaned sentences
bow_features = bow_vectorizer.fit_transform(cleaned_sentences)

# Get the feature names (words in the vocabulary)
bow_feature_names = bow_vectorizer.get_feature_names_out()

# --- Output for Bag of Words ---
print("--- Bag of Words (BoW) Vectorization ---")
print("\nVocabulary (Feature Names):")
print(bow_feature_names)

print("\nBoW Features (Sparse Matrix):")
print(bow_features)

print("\nBoW Features (Dense Array - for better readability, but use .toarray() with caution on large datasets):")
print(bow_features.toarray())

# --- TF-IDF Vectorization ---
# Initialize the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the cleaned sentences
tfidf_features = tfidf_vectorizer.fit_transform(cleaned_sentences)

# Get the feature names (words in the vocabulary)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# --- Output for TF-IDF ---
print("\n\n--- TF-IDF Vectorization ---")
print("\nVocabulary (Feature Names):")
print(tfidf_feature_names)

print("\nTF-IDF Features (Sparse Matrix):")
print(tfidf_features)

print("\nTF-IDF Features (Dense Array - for better readability, but use .toarray() with caution on large datasets):")
print(tfidf_features.toarray())

--- Bag of Words (BoW) Vectorization ---

Vocabulary (Feature Names):
['common' 'hello' 'interesting' 'punctuation' 'quite' 'remove' 'sample'
 'sentence' 'words']

BoW Features (Sparse Matrix):
<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 9 stored elements and shape (2, 9)>
  Coords	Values
  (0, 1)	1
  (0, 6)	1
  (0, 7)	1
  (0, 3)	1
  (0, 0)	1
  (0, 8)	1
  (0, 5)	1
  (1, 4)	1
  (1, 2)	1

BoW Features (Dense Array - for better readability, but use .toarray() with caution on large datasets):
[[1 1 0 1 0 1 1 1 1]
 [0 0 1 0 1 0 0 0 0]]


--- TF-IDF Vectorization ---

Vocabulary (Feature Names):
['common' 'hello' 'interesting' 'punctuation' 'quite' 'remove' 'sample'
 'sentence' 'words']

TF-IDF Features (Sparse Matrix):
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9 stored elements and shape (2, 9)>
  Coords	Values
  (0, 1)	0.3779644730092272
  (0, 6)	0.3779644730092272
  (0, 7)	0.3779644730092272
  (0, 3)	0.3779644730092272
  (0, 0)	0.3779644730092272
  (0, 

## Section 4: Train a Classifier

### **Task 4**: Sentiment Classification with Naive Bayes

*Instruction*: Split the dataset, train a classifier using both feature sets, and evaluate the performance.

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Assume you have the following from the previous step:
# bow_features, tfidf_features, bow_feature_names, tfidf_feature_names

# --- Create Sample Labels (REPLACE with your actual labels) ---
# Make sure the number of labels matches the number of sentences
labels = np.array([1, 0]) # 1 for positive, 0 for negative

# --- 1. Sentiment Classification with Bag of Words Features ---

print("--- Sentiment Classification with Bag of Words ---")

# Split the data into training and testing sets
X_train_bow, X_test_bow, y_train, y_test = train_test_split(
    bow_features, labels, test_size=0.3, random_state=42
)

# Initialize the Naive Bayes classifier (MultinomialNB is suitable for text data)
bow_model = MultinomialNB()

# Train the model
bow_model.fit(X_train_bow, y_train)

# Make predictions on the test set
y_pred_bow = bow_model.predict(X_test_bow)

# Evaluate the model's performance
bow_accuracy = accuracy_score(y_test, y_pred_bow)
print(f"Accuracy (BoW): {bow_accuracy}")

print("\nClassification Report (BoW):")
print(classification_report(y_test, y_pred_bow))

# --- 2. Sentiment Classification with TF-IDF Features ---

print("\n--- Sentiment Classification with TF-IDF ---")

# Split the data into training and testing sets
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
    tfidf_features, labels, test_size=0.3, random_state=42
)

# Initialize the Naive Bayes classifier (MultinomialNB is suitable for text data)
tfidf_model = MultinomialNB()

# Train the model
tfidf_model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred_tfidf = tfidf_model.predict(X_test_tfidf)

# Evaluate the model's performance
tfidf_accuracy = accuracy_score(y_test, y_pred_tfidf)
print(f"Accuracy (TF-IDF): {tfidf_accuracy}")

print("\nClassification Report (TF-IDF):")
print(classification_report(y_test, y_pred_tfidf))

--- Sentiment Classification with Bag of Words ---
Accuracy (BoW): 0.0

Classification Report (BoW):
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0


--- Sentiment Classification with TF-IDF ---
Accuracy (TF-IDF): 0.0

Classification Report (TF-IDF):
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Section 5: Mini Challenge – Classify Your Own Text

### **Task 5**:  User Input Prediction

*Instruction*: Write a function that allows the user to enter a text and receive a prediction from the trained model.


In [None]:
import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import numpy as np

# Download necessary NLTK resources if not already downloaded
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
try:
    sent_tokenize("This is a test.")
except LookupError:
    nltk.download('punkt')

def clean_text(text):
    """
    Cleans the input text by:
    1. Converting to lowercase.
    2. Removing punctuation.
    3. Removing stopwords.
    """
    text_lower = text.lower()
    text_no_punctuation = ''.join([char for char in text_lower if char not in string.punctuation])
    words = text_no_punctuation.split()
    stop_words = set(stopwords.words('english'))
    words_no_stopwords = [word for word in words if word not in stop_words]
    return " ".join(words_no_stopwords)

# --- Simulate a Trained Model and Vectorizer ---
# In a real application, you would load these from saved files.
# For this example, we'll train a very simple one on some dummy data.

# Dummy training data
train_texts = [
    "This movie is fantastic!",
    "I really enjoyed the performance.",
    "The service was terrible.",
    "What a disappointing experience.",
    "Highly recommended to everyone.",
    "Absolutely the worst."
]
train_labels = np.array([1, 1, 0, 0, 1, 0]) # 1 for positive, 0 for negative

# Clean the training texts
cleaned_train_texts = [clean_text(text) for text in train_texts]

# Use TfidfVectorizer for this example
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(cleaned_train_texts)

# Train a Multinomial Naive Bayes model
model = MultinomialNB()
model.fit(X_train_vectorized, train_labels)

# Define the class labels
class_labels = ['negative', 'positive']

# --- Prediction Function ---

def predict_sentiment(text, model, vectorizer, class_labels):
    """
    Predicts the sentiment of the input text.

    Args:
        text (str): The input text from the user.
        model: The trained sentiment classification model.
        vectorizer: The fitted vectorizer.
        class_labels (list): A list of strings representing the class labels.

    Returns:
        str: The predicted sentiment label.
    """
    cleaned_text = clean_text(text)
    text_vectorized = vectorizer.transform([cleaned_text])
    prediction = model.predict(text_vectorized)
    predicted_class_index = prediction[0]
    predicted_class_label = class_labels[predicted_class_index]
    return f"The predicted sentiment is: {predicted_class_label}"

# --- Get User Input and Make Prediction ---

# Code
user_input = input("Enter the text you want to analyze: ")

# Make the prediction
predicted_sentiment = predict_sentiment(user_input, model, vectorizer, class_labels)

# Output the prediction
print(predicted_sentiment)

# Text (Example Interactions)
print("\n--- Example Interactions ---")

# Example 1
user_text_1 = "I had a wonderful time at the party."
prediction_1 = predict_sentiment(user_text_1, model, vectorizer, class_labels)
print(f"Input: '{user_text_1}' -> {prediction_1}")

# Example 2
user_text_2 = "This is absolutely the worst service I've ever received."
prediction_2 = predict_sentiment(user_text_2, model, vectorizer, class_labels)
print(f"Input: '{user_text_2}' -> {prediction_2}")

# Example 3
user_text_3 = "It was okay, nothing special."
prediction_3 = predict_sentiment(user_text_3, model, vectorizer, class_labels)
print(f"Input: '{user_text_3}' -> {prediction_3}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
