In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
nltk.download('wordnet')
nltk.download('stopwords')

# NOTE: Ensure you have downloaded the required NLTK resources:
# ```python
# import nltk
# nltk.download('wordnet')
# nltk.download('stopwords')
# ```

# # --- 1. GPU/Device Setup ---
# **PURPOSE:** Check for CUDA availability to utilize GPU acceleration, defaulting to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}\n")
# ----------------------------------------------------------------------

# # --- Dataset Definition ---
# --- 1. Data Setup (42 Examples: 30 Original + 12 Easy) ---
data = {
    "text": [
        # --- Original Varied Examples (30) ---
        "I love this product, it's perfect and exceeded my expectations!",
        "This application is absolutely terrible and unusable, a complete waste of time.",
        "It works fine most of the time, no major issues, just average performance.",
        "I’m so disappointed with the lack of features and constant bugs.",
        "Absolutely fantastic experience, top-notch support and incredibly quick resolution!",
        "Horrible service! I waited over an hour for a response and got no help.",
        "Not bad at all, could be better but it serves its basic purpose well.",
        "Worst thing ever, I'm canceling my subscription right now, I'm furious.",
        "Great help from support, they were very prompt, efficient, and friendly.",
        "Okay I guess, nothing special about it, quite neutral actually.",
        "The service was bad and my issue wasn't fixed.",
        "I have no opinion on the matter, it just exists.",
        "The user interface is clean, easy to navigate, and highly intuitive.",
        "It crashes every time I open the settings menu—completely broken software.",
        "It performs the core function, but the load times are truly unacceptable.",
        "I'm cautiously optimistic about the new features; they seem promising.",
        "I'm giving this a neutral score because I haven't used it enough to form an opinion.",
        "The price is a bit high for what it offers, making it a marginal value.",
        "Honestly, it's the best software update I've seen all year. Flawless!",
        "It was merely adequate; I encountered several minor inconveniences but nothing major.",
        "The customer service representative was rude, arrogant, and unhelpful.",
        "I found a bug, but otherwise, the experience was quite positive and speedy.",
        "This is highly functional, completely reliable, and I recommend it to everyone.",
        "I am so angry; the data I spent hours collecting was completely wiped out by the crash.",
        "The setup process was slightly confusing, leading to some early frustration.",
        "After a few hours of tinkering, it turned out to be exactly what I needed. Solid purchase.",
        "It's loud, bulky, and poorly designed. I regret this purchase.",
        "I was pleasantly surprised by the quality, which was much better than I expected.",
        "The documentation is non-existent, making it impossible to debug any problems.",
        "It’s totally fine, not the best, but I can't complain for the low price.",
        "This is amazing.",                                  # 1.0
        "I hate this.",                                     # -1.0
        "It works perfectly.",                              # 1.0
        "A total failure.",                                 # -1.0
        "I am so happy with the results.",                  # 1.0
        "This product is trash.",                           # -1.0
        "Excellent.",                                       # 1.0
        "Worst ever.",                                      # -1.0
        "Simply the best purchase.",                        # 1.0
        "A complete disaster.",                             # -1.0
        "I'm thrilled!",                                    # 1.0
        "Total waste of money.",                            # -1.0
    ],
    "label": [
        1.0, -0.9, 0.3, -0.85, 1.0, -1.0, 0.4, -1.0, 0.85, 0.0, -0.8, 0.0,
        0.95, -0.95, -0.5, 0.55, 0.0, -0.25, 1.0, 0.1, -0.75, 0.6, 0.9, -0.99,
        -0.4, 0.7, -0.8, 0.8, -0.7, 0.25,
        1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0
    ]
}
df = pd.DataFrame(data)

print(f"Total messages loaded: {len(df)}")
print("-" * 50)



Using device: cuda

Total messages loaded: 42
--------------------------------------------------


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\swaro\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\swaro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
# # --- 2. Text Preprocessing: Lemmatization and N-grams ---

# **Parameter:** Lemmatizer initialization (NLTK WordNet)
lemmatizer = WordNetLemmatizer()
# **Parameter:** Define common English stop words for filtering
stop_words = set(stopwords.words('english'))

def lemmatize_tokenizer(text):
    """
    Custom tokenizer for TfidfVectorizer.
    - **Cleaning:** Removes punctuation and converts to lowercase.
    - **Filtering:** Removes common stop words and single-letter tokens.
    - **Lemmatization:** Reduces words to their base or root form (e.g., 'waiting' -> 'wait').
    """
    original_text = text
    
    # 1. Cleaning and Lowercasing
    text = re.sub(r'[^\w\s]', '', text.lower())
    
    # 2. Tokenization, Stop Word Removal, and Lemmatization
    tokens_filtered = []
    for w in text.split():
        if w not in stop_words and len(w) > 1: # Filter common stop words and single letters
            # **Lemmatization:** Assuming verb tense ('v') for aggressive normalization
            tokens_filtered.append(lemmatizer.lemmatize(w, pos='v'))
            
    # Detailed Print Statements for NLP Concept Stages:
    if len(df['text']) > 0 and original_text == df['text'].iloc[0]:
        print(f" NLP Pipeline Example (First Message): '{original_text}'")
        print(f"  - **Initial Tokens:** {text.split()[:10]}...")
        print(f"  - **Lemmatization & Filtering (Unigrams):** {tokens_filtered[:10]}...")
        print("-" * 50)
        
    return tokens_filtered

# 1. Initialize the vectorizer
# **HYPERPARAMETER/TOOL:** TfidfVectorizer (Term Frequency-Inverse Document Frequency)
vectorizer = TfidfVectorizer(
    # **Hyperparameter:** max_features - Limits vocabulary size, managing model complexity.
    max_features=1000, 
    # **Parameter:** tokenizer - Uses our custom function for cleaning/lemmatization.
    tokenizer=lemmatize_tokenizer,
    # **Hyperparameter:** ngram_range - Includes single words (1), two-word phrases (2), and three-word phrases (3).
    # This captures **context** like "not bad" or "waste of time".
    ngram_range=(1, 3), 
    preprocessor=None 
)

# 2. **Vectorization:** Fit and transform text data to numerical features
X = vectorizer.fit_transform(df["text"]).toarray()
y = df["label"].values

# Detailed Print Statement for Vectorization
print(" **Vectorization (TF-IDF) Concept with N-grams:**")
feature_names = vectorizer.get_feature_names_out()
print(f"  - **Final Vocabulary Size (Input Dim):** {X.shape[1]}")
print(f"  - **Example Vocabulary (Features):**")
example_features = [f for f in feature_names if len(f.split()) > 1][:3] + [f for f in feature_names if len(f.split()) == 1][:3]
print(f"    - N-grams Examples: {example_features}")

# Example features from the first message
sample_vector = X[0]
non_zero_indices = np.nonzero(sample_vector)[0]
sample_features = {feature_names[i]: sample_vector[i] for i in non_zero_indices if len(feature_names[i].split()) > 1}
print(f"  - **Example Bigram/Trigram Features in First Message:** {sample_features}")
print("-" * 50)

# 3. **Data Split**
# **Hyperparameter:** test_size=0.2 (20% of data reserved for testing/evaluation)
# **Parameter:** random_state=42 (Ensures split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

INPUT_DIM = X_train.shape[1]
print(f" **Data Split and Dimension:**")
print(f"  - Training data shape: {X_train.shape}")
print(f"  - Testing data shape: {X_test.shape}")
print("-" * 50)


 **Vectorization (TF-IDF) Concept with N-grams:**
  - **Final Vocabulary Size (Input Dim):** 515
  - **Example Vocabulary (Features):**
    - N-grams Examples: ['absolutely fantastic', 'absolutely fantastic experience', 'absolutely terrible', 'absolutely', 'actually', 'adequate']
  - **Example Bigram/Trigram Features in First Message:** {'exceed expectations': 0.29098067449497644, 'love product': 0.29098067449497644, 'love product perfect': 0.29098067449497644, 'perfect exceed': 0.29098067449497644, 'perfect exceed expectations': 0.29098067449497644, 'product perfect': 0.29098067449497644, 'product perfect exceed': 0.29098067449497644}
--------------------------------------------------
 **Data Split and Dimension:**
  - Training data shape: (33, 515)
  - Testing data shape: (9, 515)
--------------------------------------------------


In [23]:

# # --- 3. Neural Network Definition and Setup ---

class SentimentNet(nn.Module):
    """
    Simple Feedforward Network for Sentiment Regression.
    - Takes a sparse TF-IDF vector as input.
    - Outputs a single scalar sentiment score between -1.0 and 1.0.
    """
    # **Parameter:** input_dim - The size of the input vector (vocabulary size from TF-IDF).
    # **Hyperparameter:** hidden_dims_list - List defining hidden layer node counts.
    def __init__(self, input_dim, hidden_dims_list):
        super(SentimentNet, self).__init__()
        
        layer_dims = [input_dim] + hidden_dims_list + [1]
        self.layers = nn.ModuleList()
        
        for i in range(len(layer_dims) - 1):
            # **Component:** nn.Linear (Fully Connected Layer)
            self.layers.append(nn.Linear(layer_dims[i], layer_dims[i+1]))
            
    def forward(self, x):
        # Pass through hidden layers
        for layer in self.layers[:-1]:
            # **Activation Function (Hidden Layers):** F.relu (Rectified Linear Unit)
            x = F.relu(layer(x))
            
        # **Activation Function (Output Layer):** torch.tanh (Hyperbolic Tangent)
        # This function squashes the final output into the desired [-1, 1] range.
        return torch.tanh(self.layers[-1](x))

# **Hyperparameter:** Hidden Layer Sizes (Architecture)
HIDDEN_DIMS = [128, 64] # Two hidden layers
model = SentimentNet(INPUT_DIM, HIDDEN_DIMS)

print(" **Model Architecture and Hyperparameters:**")
print(f"  - **Input Dimension (Features):** {INPUT_DIM}")
print(f"  - **Hidden Layers (Nodes):** {HIDDEN_DIMS}")
print("-" * 50)



 **Model Architecture and Hyperparameters:**
  - **Input Dimension (Features):** 515
  - **Hidden Layers (Nodes):** [128, 64]
--------------------------------------------------


In [24]:
# # --- 4. Training Setup and Loop ---

# **Hyperparameter:** EPOCHS - Number of complete passes over the training data.
EPOCHS = 2000
# Convert and move data to the selected device (CPU/GPU)
X_train_tensor = torch.FloatTensor(X_train).to(device)
y_train_tensor = torch.FloatTensor(y_train).unsqueeze(1).to(device) 

model.to(device)

# **Loss Function (Criterion):** nn.MSELoss (Mean Squared Error)
# Suitable for regression tasks where we predict a continuous score.
criterion = nn.MSELoss()
# **Optimizer:** optim.Adam (Adaptive Moment Estimation)
# **Hyperparameter:** lr (Learning Rate) - Controls the step size of weight updates.
optimizer = optim.Adam(model.parameters(), lr=0.005)

# Training Loop
loss_history = []
print(f" **Starting training for {EPOCHS} epochs on {device}...**")

for epoch in range(EPOCHS):
    model.train() # Set model to training mode
    optimizer.zero_grad() # Clear gradients from previous iteration

    output = model(X_train_tensor)
    loss = criterion(output, y_train_tensor)
    
    loss.backward() # Compute gradient of the loss
    optimizer.step() # Update model weights
    
    loss_history.append(loss.item())
    
    if (epoch + 1) % 20 == 0:
        print(f"  - Epoch [{epoch+1}/{EPOCHS}], Loss: {loss.item():.4f}")

print(" **Training complete.**")
print(f"  - Final training loss: {loss_history[-1]:.4f}")
print("-" * 50)


 **Starting training for 2000 epochs on cuda...**
  - Epoch [20/2000], Loss: 0.0220
  - Epoch [40/2000], Loss: 0.0097
  - Epoch [60/2000], Loss: 0.0037
  - Epoch [80/2000], Loss: 0.0025
  - Epoch [100/2000], Loss: 0.0011
  - Epoch [120/2000], Loss: 0.0004
  - Epoch [140/2000], Loss: 0.0001
  - Epoch [160/2000], Loss: 0.0001
  - Epoch [180/2000], Loss: 0.0000
  - Epoch [200/2000], Loss: 0.0000
  - Epoch [220/2000], Loss: 0.0000
  - Epoch [240/2000], Loss: 0.0000
  - Epoch [260/2000], Loss: 0.0000
  - Epoch [280/2000], Loss: 0.0000
  - Epoch [300/2000], Loss: 0.0000
  - Epoch [320/2000], Loss: 0.0000
  - Epoch [340/2000], Loss: 0.0000
  - Epoch [360/2000], Loss: 0.0000
  - Epoch [380/2000], Loss: 0.0000
  - Epoch [400/2000], Loss: 0.0000
  - Epoch [420/2000], Loss: 0.0000
  - Epoch [440/2000], Loss: 0.0000
  - Epoch [460/2000], Loss: 0.0000
  - Epoch [480/2000], Loss: 0.0000
  - Epoch [500/2000], Loss: 0.0000
  - Epoch [520/2000], Loss: 0.0000
  - Epoch [540/2000], Loss: 0.0000
  - Epoch

In [29]:

# # --- 5. Inference/Testing ---
model.eval() # Set model to evaluation mode (disables dropout, etc.)

# Example texts
new_text_pos = "The like how things are going perfectlyly in the company and I am thrilled"
new_text_neg = "I am not upset with how my device is getting restarted and not charging fast!"
new_text_neu = "The sky is blue and the night is fresh."


def predict_sentiment(text, vectorizer, model, device):
    """Helper function to preprocess and predict sentiment."""
    # The new text must be transformed using the *same* fitted vectorizer.
    new_vec = vectorizer.transform([text]).toarray()
    new_tensor = torch.FloatTensor(new_vec).to(device)

    with torch.no_grad(): # Disable gradient calculation for efficiency
        prediction = model(new_tensor).item()
    return prediction

# Get and print predictions
prediction_pos = predict_sentiment(new_text_pos, vectorizer, model, device)
prediction_neg = predict_sentiment(new_text_neg, vectorizer, model, device)
prediction_neu = predict_sentiment(new_text_neu, vectorizer, model, device)

print(" **Inference Predictions (Score range: -1.0 to 1.0):**")

print(f"\n  Message: '{new_text_pos}'")
print(f"  Predicted Sentiment: {prediction_pos:.4f} (Expected: Positive)")

print(f"\n  Message: '{new_text_neg}'")
print(f"  Predicted Sentiment: {prediction_neg:.4f} (Expected: Negative)")

print(f"\n  Message: '{new_text_neu}'")
print(f"  Predicted Sentiment: {prediction_neu:.4f} (Expected: Neutral/Zero)")
print("-" * 50)

 **Inference Predictions (Score range: -1.0 to 1.0):**

  Message: 'The like how things are going perfectlyly in the company and I am thrilled'
  Predicted Sentiment: 0.9978 (Expected: Positive)

  Message: 'I am not upset with how my device is getting restarted and not charging fast!'
  Predicted Sentiment: -0.9148 (Expected: Negative)

  Message: 'The sky is blue and the night is fresh.'
  Predicted Sentiment: -0.1030 (Expected: Neutral/Zero)
--------------------------------------------------
