# Task 7: Introduction to Natural Language (Text) Processing

## Section 1: Setup and Sample Dataset

### **Task 1**: Import Libraries and Sample Data
*Instruction*: Import the necessary libraries and define a sample dataset for sentiment classification.

In [None]:
import pandas as pd
import numpy as np
import nltk
import string

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

# Fetch the NLTK data used later for tokenization and stopword removal.
# Newer NLTK releases make word_tokenize load the 'punkt_tab' resource
# instead of the pickled 'punkt' models, so both are downloaded to keep the
# notebook working on old and new installations alike.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Toy sentiment corpus: one (review, label) pair per row,
# where label 1 = positive and 0 = negative.
labelled_reviews = [
    ('I love this movie. It was fantastic!', 1),
    ('Terrible acting and horrible plot.', 0),
    ('An excellent film with great characters.', 1),
    ('Worst movie I have ever seen.', 0),
    ('Absolutely wonderful! A must-watch.', 1),
    ('It was okay, nothing special.', 1),
    ('Bad movie, waste of time.', 0),
    ('Pretty good, I liked it.', 1),
    ('Not great, but not terrible.', 0),
    ('Awful! Never again.', 0),
]

# Column-oriented view of the same rows, in the shape pandas expects.
data = {
    'text': [review for review, _ in labelled_reviews],
    'label': [sentiment for _, sentiment in labelled_reviews],
}

df = pd.DataFrame(data)
df.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,text,label
0,I love this movie. It was fantastic!,1
1,Terrible acting and horrible plot.,0
2,An excellent film with great characters.,1
3,Worst movie I have ever seen.,0
4,Absolutely wonderful! A must-watch.,1


## Section 2: Text Preprocessing

### **Task 2**: Clean the Text

*Instruction*: Lowercase the text, remove punctuation and stopwords, and tokenize the sentences.


In [None]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Stopword sets keyed by language, filled lazily so the corpus file is read
# once instead of on every clean_text() call.
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stopword set for *language*, loading it only once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def clean_text(text):
    """Normalize a raw string into a list of content tokens.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, tokenize with NLTK's ``word_tokenize``, and drop
    English stopwords.

    Args:
        text (str): Raw input text.

    Returns:
        list[str]: The remaining tokens, in their original order.

    Raises:
        LookupError: If required NLTK data (e.g. the tokenizer models) has
            not been downloaded.
    """
    # Lowercasing
    text = text.lower()

    # Remove punctuation in a single pass via a deletion table
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords (the set is cached across calls)
    stop_words = _get_stop_words('english')
    return [token for token in tokens if token not in stop_words]

# Quick demonstration of clean_text on a sentence that mixes upper-case
# words, punctuation, and stopwords.
text = (
    "This is a sample sentence, with some punctuation! "
    "And some STOP words."
)
cleaned_tokens = clean_text(text)
print(cleaned_tokens)

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


## Section 3: Text Vectorization

### **Task 3**: Convert Text to Numerical Features

*Instruction*: Use both Bag of Words and TF-IDF vectorization to convert the cleaned text.


In [None]:
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
    import pandas as pd

    # Small demo corpus used to illustrate both vectorizers.
    text_data = [
        "This is the first document.",
        "This is the second document.",
        "And this is the third one.",
        "Is this the first document?"
    ]

    # --- Bag of Words ---

    # Create a CountVectorizer object
    vectorizer = CountVectorizer()

    # Fit the vectorizer on the text data (learns the vocabulary)
    vectorizer.fit(text_data)

    # Get the vocabulary: a mapping of each unique word to its column index
    vocabulary = vectorizer.vocabulary_
    print(vocabulary)

    # Transform the text data into a document-term count matrix
    vector_matrix = vectorizer.transform(text_data)

    # Convert the matrix to a Pandas DataFrame for easier visualization.
    # NOTE(review): this rebinds `df`, shadowing any DataFrame of the same
    # name created in an earlier cell.
    df = pd.DataFrame(vector_matrix.toarray(), columns=vectorizer.get_feature_names_out())
    print(df)

    # --- TF-IDF ---

    # Same corpus, but each count is re-weighted so that terms appearing in
    # many documents contribute less than terms specific to a few documents.
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(text_data)

    # Tabular view of the TF-IDF weights, rounded for readability
    tfidf_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=tfidf_vectorizer.get_feature_names_out(),
    )
    print(tfidf_df.round(3))


{'this': 8, 'is': 3, 'the': 6, 'first': 2, 'document': 1, 'second': 5, 'and': 0, 'third': 7, 'one': 4}
   and  document  first  is  one  second  the  third  this
0    0         1      1   1    0       0    1      0     1
1    0         1      0   1    0       1    1      0     1
2    1         0      0   1    1       0    1      1     1
3    0         1      1   1    0       0    1      0     1


## Section 4: Train a Classifier

### **Task 4**: Sentiment Classification with Naive Bayes

*Instruction*: Split the dataset, train a classifier using both feature sets, and evaluate the performance.

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# --- 1. Data Preparation ---

# Sample dataset (replace with your actual data)
data = {
    'text': [
        "This movie was absolutely fantastic!", "I hated this movie.", "It was okay, nothing special.",
        "The acting was terrible.", "I loved this film!", "This was a great movie.",
        "The plot was confusing.", "I enjoyed it.", "Not a bad movie.",
        "This movie was boring."
    ],
    'sentiment': [
        'positive', 'negative', 'neutral', 'negative', 'positive',
        'positive', 'negative', 'positive', 'neutral', 'negative'
    ]
}

# Convert data to lists
text_data = data['text']
sentiment_data = data['sentiment']

# --- 2. Data Splitting ---

# Split into training and testing sets.
# NOTE: 20% of 10 samples leaves only 2 test documents, so the metrics below
# are illustrative only; use a larger dataset for meaningful evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    text_data, sentiment_data, test_size=0.2, random_state=42
)

# --- 3. Feature Extraction ---

# Both vectorizers are fitted on the training split only and then applied to
# the test split, so no test vocabulary leaks into training.

# Using CountVectorizer (Bag-of-words)
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Using TfidfVectorizer (TF-IDF weights)
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# --- 4. Model Training ---

# One Multinomial Naive Bayes classifier per feature set
model = MultinomialNB()
model.fit(X_train_vectorized, y_train)

tfidf_model = MultinomialNB()
tfidf_model.fit(X_train_tfidf, y_train)

# --- 5. Model Prediction ---

# Make predictions on the test set
y_pred = model.predict(X_test_vectorized)
y_pred_tfidf = tfidf_model.predict(X_test_tfidf)

# --- 6. Model Evaluation ---

# zero_division=0 reports 0.00 for classes with no true or predicted samples
# without emitting an UndefinedMetricWarning for each of them.

print("Bag of Words features")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred, zero_division=0))

print("TF-IDF features")
tfidf_accuracy = accuracy_score(y_test, y_pred_tfidf)
print(f"Accuracy: {tfidf_accuracy}")
print(classification_report(y_test, y_pred_tfidf, zero_division=0))

Accuracy: 0.0
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       1.0
     neutral       0.00      0.00      0.00       1.0
    positive       0.00      0.00      0.00       0.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Section 5: Mini Challenge – Classify Your Own Text

### **Task 5**:  User Input Prediction

*Instruction*: Write a function that lets the user enter text and receive a prediction from the trained model.


In [None]:
import joblib
import pandas as pd

def predict_with_user_input(model_path, user_input):
    """
    Load a saved model and classify one piece of user-entered text.

    Args:
        model_path (str): Path to the trained model file (e.g., .pkl).
            The model is assumed to be a text pipeline (vectorizer plus
            classifier) whose ``predict`` accepts raw strings -- TODO confirm
            against the artifact you actually saved.
        user_input (str): The text input from the user.

    Returns:
        The first element of the model's prediction on success; otherwise a
        string starting with "Error during prediction: " that describes the
        failure.
    """
    try:
        # SECURITY: joblib.load unpickles the file, which can execute
        # arbitrary code. Only load model files from a trusted source.
        model = joblib.load(model_path)

        # Pass the text as a one-element list of documents. Text vectorizers
        # iterate over their input item by item, and iterating a DataFrame
        # yields its column labels, so wrapping the text in a DataFrame would
        # classify the column name instead of what the user typed.
        prediction = model.predict([user_input])[0]
        return prediction
    except Exception as e:
        # Deliberately best-effort: the caller prints whatever is returned,
        # so failures are reported as a message instead of a traceback.
        return f"Error during prediction: {e}"


# --- Demo: classify one line of text typed by the user ---

# Placeholder location of the serialized model; point this at a real file.
model_file_path = "your_model.pkl"

# Read the text to classify from standard input
# (swap in another input source when embedding this in an application).
user_input_text = input("Enter the text: ")

# Run the prediction; the helper returns either a label or an error message.
result = predict_with_user_input(
    model_path=model_file_path,
    user_input=user_input_text,
)

# Show the outcome
print(f"Prediction: {result}")

Enter the text: 2
Prediction: Error during prediction: [Errno 2] No such file or directory: 'your_model.pkl'
