# Task 7: Introduction to Natural Language (Text) Processing

## Section 1: Setup and Sample Dataset

### **Task 1**: Import Libraries and Sample Data
*Instruction*: Import the necessary libraries and define a sample dataset for sentiment classification.

In [None]:
import pandas as pd
import numpy as np
import nltk
import string

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

# Fetch the NLTK resources needed for tokenizing and stopword filtering.
# Recent NLTK releases make word_tokenize load 'punkt_tab', while older
# releases load 'punkt', so both tokenizer packages are requested.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Sample data: (review text, label) pairs, where 1 = positive, 0 = negative
labelled_reviews = [
    ('I love this movie. It was fantastic!', 1),
    ('Terrible acting and horrible plot.', 0),
    ('An excellent film with great characters.', 1),
    ('Worst movie I have ever seen.', 0),
    ('Absolutely wonderful! A must-watch.', 1),
    ('It was okay, nothing special.', 1),
    ('Bad movie, waste of time.', 0),
    ('Pretty good, I liked it.', 1),
    ('Not great, but not terrible.', 0),
    ('Awful! Never again.', 0),
]

# Split the pairs into the column-oriented dict that the DataFrame is built from
review_texts, review_labels = zip(*labelled_reviews)
data = {'text': list(review_texts), 'label': list(review_labels)}

df = pd.DataFrame(data)
df.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,text,label
0,I love this movie. It was fantastic!,1
1,Terrible acting and horrible plot.,0
2,An excellent film with great characters.,1
3,Worst movie I have ever seen.,0
4,Absolutely wonderful! A must-watch.,1


## Section 2: Text Preprocessing

### **Task 2**: Clean the Text

*Instruction*: Lowercase the text, remove punctuation and stopwords, and tokenize the sentences.


In [None]:
from functools import lru_cache

# These two names are used by preprocess() but were never imported.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

df = pd.DataFrame(data)  # Create DataFrame

# Translation table that deletes every ASCII punctuation character.
# Built once here instead of on every preprocess() call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@lru_cache(maxsize=None)
def _english_stop_words():
    """
    Return NLTK's English stopwords as a frozenset.

    The result is cached so the corpus is read only once, and a set makes
    each membership test O(1) instead of a scan over the whole list.
    """
    return frozenset(stopwords.words('english'))


def preprocess(text):
    """
    Lowercase, remove punctuation & stopwords, and tokenize text.

    Args:
        text (str): The input text.

    Returns:
        str: The preprocessed text, i.e. the remaining tokens joined by
            single spaces.
    """
    # NOTE: punctuation is stripped before stopword filtering, so a
    # contraction such as "don't" becomes "dont" and will not match the
    # stopword entry "don't".
    text = text.lower().translate(_PUNCTUATION_TABLE)
    stop_words = _english_stop_words()
    filtered_tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(filtered_tokens)


df['cleaned'] = df['text'].apply(preprocess)  # Apply preprocessing
df[['text', 'cleaned']].head()  # Display results

## Section 3: Text Vectorization

### **Task 3**: Convert Text to Numerical Features

*Instruction*: Use both Bag of Words and TF-IDF vectorization to convert the cleaned text.


In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Example documents, already cleaned and tokenized as in the previous step
docs = [
    "natural language processing nlp fascinating field",
    "enables computers understand interpret generate human language",
    "applications include chatbots search engines translation tools"
]

# Bag of Words (BoW): fit the vocabulary and count terms per document
bow_vectorizer = CountVectorizer()
bow_features = bow_vectorizer.fit_transform(docs)

# TF-IDF: fit the vocabulary and compute TF-IDF weights per document
tfidf_vectorizer = TfidfVectorizer()
tfidf_features = tfidf_vectorizer.fit_transform(docs)

# Report both representations in the same layout:
# the learned vocabulary first, then the dense document-term matrix.
for names_heading, matrix_heading, vectorizer, features in (
    ("Bag of Words Feature Names:", "BoW Matrix:\n", bow_vectorizer, bow_features),
    ("\nTF-IDF Feature Names:", "TF-IDF Matrix:\n", tfidf_vectorizer, tfidf_features),
):
    print(names_heading)
    print(vectorizer.get_feature_names_out())
    print(matrix_heading, features.toarray())


Bag of Words Feature Names:
['applications' 'chatbots' 'computers' 'enables' 'engines' 'fascinating'
 'field' 'generate' 'human' 'include' 'interpret' 'language' 'natural'
 'nlp' 'processing' 'search' 'tools' 'translation' 'understand']
BoW Matrix:
 [[0 0 0 0 0 1 1 0 0 0 0 1 1 1 1 0 0 0 0]
 [0 0 1 1 0 0 0 1 1 0 1 1 0 0 0 0 0 0 1]
 [1 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 1 1 0]]

TF-IDF Feature Names:
['applications' 'chatbots' 'computers' 'enables' 'engines' 'fascinating'
 'field' 'generate' 'human' 'include' 'interpret' 'language' 'natural'
 'nlp' 'processing' 'search' 'tools' 'translation' 'understand']
TF-IDF Matrix:
 [[0.         0.         0.         0.         0.         0.42339448
  0.42339448 0.         0.         0.         0.         0.32200242
  0.42339448 0.42339448 0.42339448 0.         0.         0.
  0.        ]
 [0.         0.         0.38988801 0.38988801 0.         0.
  0.         0.38988801 0.38988801 0.         0.38988801 0.29651988
  0.         0.         0.         0.   

## Section 4: Train a Classifier

### **Task 4**: Sentiment Classification with Naive Bayes

*Instruction*: Split the dataset, train a classifier using both feature sets, and evaluate the performance.

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Sample data: Text and corresponding sentiment (1 for positive, 0 for negative)
data = {
    'text': [
        "I love this product, it's amazing!",  # Positive
        "Worst purchase I ever made.",         # Negative
        "Absolutely fantastic service!",       # Positive
        "I hate this, it's terrible.",         # Negative
        "Would buy again, highly recommend!",  # Positive
        "Not worth the money, very disappointed." # Negative
    ],
    'sentiment': [1, 0, 1, 0, 1, 0]  # 1=positive, 0=negative
}

# Convert to a DataFrame
df = pd.DataFrame(data)

# ----------------------------
# Split dataset into train and test sets
# ----------------------------
X = df['text']
y = df['sentiment']
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# ----------------------------
# Feature extraction using Bag of Words (BoW)
# ----------------------------
# The vectorizer is fitted on the training split only; the test split is
# transformed with that fitted vocabulary.
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# ----------------------------
# Feature extraction using TF-IDF
# ----------------------------
# Same pattern as above: fit on train, transform-only on test.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# ----------------------------
# Train Naive Bayes classifier
# ----------------------------
# Using BoW features
nb_bow = MultinomialNB()
nb_bow.fit(X_train_bow, y_train)

# Using TF-IDF features
nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_tfidf, y_train)

# ----------------------------
# Predictions and Evaluation
# ----------------------------
# Predict using BoW model
y_pred_bow = nb_bow.predict(X_test_bow)
# Predict using TF-IDF model
y_pred_tfidf = nb_tfidf.predict(X_test_tfidf)

# Evaluate the performance.
# With so few test samples a class may never be predicted, which leaves its
# precision undefined. zero_division=0 reports those scores as 0.0 without
# emitting an UndefinedMetricWarning for each one.
print("Naive Bayes with Bag of Words (BoW):")
print("Accuracy:", accuracy_score(y_test, y_pred_bow))
print(classification_report(y_test, y_pred_bow, zero_division=0))

print("\nNaive Bayes with TF-IDF:")
print("Accuracy:", accuracy_score(y_test, y_pred_tfidf))
print(classification_report(y_test, y_pred_tfidf, zero_division=0))


Naive Bayes with Bag of Words (BoW):
Accuracy: 0.5
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2


Naive Bayes with TF-IDF:
Accuracy: 0.5
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Section 5: Mini Challenge – Classify Your Own Text

### **Task 5**:  User Input Prediction

*Instruction*: Write a function that lets the user enter a piece of text and receive a sentiment prediction from the trained model.


In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Sample data: (text, sentiment) pairs, where 1 = positive and 0 = negative
labelled_samples = [
    ("I love this product, it's amazing!", 1),
    ("Worst purchase I ever made.", 0),
    ("Absolutely fantastic service!", 1),
    ("I hate this, it's terrible.", 0),
    ("Would buy again, highly recommend!", 1),
    ("Not worth the money, very disappointed.", 0),
]
sample_texts, sample_sentiments = zip(*labelled_samples)
data = {'text': list(sample_texts), 'sentiment': list(sample_sentiments)}

# Tabular view of the samples
df = pd.DataFrame(data)

# ----------------------------
# Feature extraction using TF-IDF (or BoW)
# ----------------------------
# Targets are the sentiment column; features are TF-IDF vectors of every text.
y = df['sentiment']
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df['text'])

# Train Naive Bayes classifier (fit() returns the fitted estimator itself)
nb = MultinomialNB().fit(X, y)

# ----------------------------
# Prediction function
# ----------------------------
def predict_sentiment(user_input):
    """
    Classify a single piece of text with the trained Naive Bayes model.

    Relies on the module-level ``tfidf_vectorizer`` and ``nb`` objects
    defined earlier in this file.

    Args:
        user_input (str): The raw text to classify.

    Returns:
        str: "Positive Sentiment" if the model predicts class 1,
            otherwise "Negative Sentiment".
    """
    # transform() takes a collection of documents, so the single input is
    # wrapped in a one-item list.
    user_input_tfidf = tfidf_vectorizer.transform([user_input])

    # predict() returns one label per input row. Take the single label out
    # explicitly rather than relying on the truth value of a one-element array.
    prediction = nb.predict(user_input_tfidf)[0]

    return "Positive Sentiment" if prediction == 1 else "Negative Sentiment"

# ----------------------------
# Example of user input and prediction
# ----------------------------
user_text = input("Enter a text for sentiment prediction: ")
print(predict_sentiment(user_text))
