In [None]:
!pip install nltk scikit-learn

# Import libraries
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

print("Environment ready!")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Environment ready!


In [None]:
# STEP 2: Build a tiny hand-labelled sentiment corpus
#
# Each sentence is written right next to its label (1 = positive,
# 0 = negative); the pairs are then unzipped into the two parallel lists
# `texts` and `labels`.
texts, labels = map(list, zip(
    ("I love this movie, it was amazing!", 1),
    ("This film was terrible, I hated it.", 0),
    ("What a great experience, I enjoyed every moment.", 1),
    ("This is the worst thing I have ever watched.", 0),
    ("Absolutely fantastic! Highly recommended.", 1),
    ("Really boring and bad acting.", 0),
    ("I am so happy with this product!", 1),
    ("I am very disappointed, this was a waste of time.", 0),
    ("This made my day, I feel wonderful!", 1),
    ("I regret buying this, very poor quality.", 0),
))

print("Dataset loaded! Total samples:", len(texts))


Dataset loaded! Total samples: 10


In [None]:
# STEP 3: Basic text cleaning

import re

def clean_text(text):
    """Normalize raw text to lowercase ASCII letters separated by single spaces.

    Apostrophes are dropped so a contraction stays one token
    ("don't" -> "dont"). Every other character that is neither a letter nor
    whitespace is replaced by a space instead of being deleted, so words that
    are joined only by punctuation or digits are not fused together
    ("great,but" -> "great but").

    Args:
        text: The raw input string.

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    # 1. Lowercase
    text = text.lower()

    # 2. Drop apostrophes (straight and curly) without leaving a gap
    text = re.sub(r"['\u2019]", '', text)

    # 3. Turn remaining punctuation, digits and symbols into word separators
    #    (the text is already lowercase, so a-z covers every kept letter)
    text = re.sub(r'[^a-z\s]', ' ', text)

    # 4. Collapse runs of whitespace and trim the ends
    return re.sub(r'\s+', ' ', text).strip()

# Run the cleaner over the whole corpus
cleaned_texts = list(map(clean_text, texts))

cleaned_texts


['i love this movie it was amazing',
 'this film was terrible i hated it',
 'what a great experience i enjoyed every moment',
 'this is the worst thing i have ever watched',
 'absolutely fantastic highly recommended',
 'really boring and bad acting',
 'i am so happy with this product',
 'i am very disappointed this was a waste of time',
 'this made my day i feel wonderful',
 'i regret buying this very poor quality']

In [None]:
# STEP 4: Tokenization + Stopword removal + Lemmatization
import nltk

# Download the tokenizer, stopword and WordNet resources (same order as before)
for package in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(package)

print("All required NLTK data downloaded!")

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Built once here and looked up by preprocess() on every call
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Turn a cleaned string into a list of lemmatized, non-stopword tokens.

    The text is split with ``word_tokenize``; tokens found in the
    module-level ``stop_words`` set are dropped, and every remaining token is
    passed through ``lemmatizer.lemmatize``.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    ]

# Tokenize, filter and lemmatize each cleaned sentence
processed_texts = list(map(preprocess, cleaned_texts))

processed_texts


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


All required NLTK data downloaded!


[['love', 'movie', 'amazing'],
 ['film', 'terrible', 'hated'],
 ['great', 'experience', 'enjoyed', 'every', 'moment'],
 ['worst', 'thing', 'ever', 'watched'],
 ['absolutely', 'fantastic', 'highly', 'recommended'],
 ['really', 'boring', 'bad', 'acting'],
 ['happy', 'product'],
 ['disappointed', 'waste', 'time'],
 ['made', 'day', 'feel', 'wonderful'],
 ['regret', 'buying', 'poor', 'quality']]

In [None]:
# TF-IDF works on strings, so glue each token list back into one sentence
joined_texts = [" ".join(tokens) for tokens in processed_texts]

from sklearn.feature_extraction.text import TfidfVectorizer

# Create the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the corpus and vectorize it in one step
X = vectorizer.fit_transform(joined_texts)

# Preview the first 5 vectors: slice the sparse matrix first so only those
# rows are converted to a dense array, not the whole matrix
X[:5].toarray()


array([[0.        , 0.        , 0.57735027, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.57735027,
        0.        , 0.        , 0.57735027, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.57735027,
        0.        , 0.        , 0.57735027, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.57735027,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
      

In [None]:
# Train a Machine Learning Model

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets.
# stratify=labels keeps the positive/negative ratio as close as possible in
# both parts; without it a split this small can end up heavily one-sided.
# NOTE(review): X comes from a TF-IDF vectorizer that was fitted on all
# samples before this split, so the test rows already influenced the
# features - confirm this is acceptable for the evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.3, random_state=42, stratify=labels
)

# Create the model
model = MultinomialNB()

# Train the model
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate accuracy (expect a noisy score: the dataset is very small)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.3333333333333333

In [None]:
#Test the model with custom sentences

def predict_sentiment(text):
    """Classify a raw sentence with the trained model.

    The input is passed through ``clean_text`` and ``preprocess``, the
    resulting tokens are joined with spaces, the string is vectorized with
    ``vectorizer.transform`` and the result is handed to ``model.predict``.

    Args:
        text: The raw sentence to classify.

    Returns:
        "Positive " when the predicted label equals 1, otherwise "Negative "
        (both strings end with a space).
    """
    tokens = preprocess(clean_text(text))
    features = vectorizer.transform([" ".join(tokens)])
    predicted_label = model.predict(features)[0]

    if predicted_label == 1:
        return "Positive "
    return "Negative "

# Try some sentences
for example in (
    "I really loved this movie!",
    "This is the worst thing ever.",
    "I feel so happy today!",
    "I am very disappointed.",
):
    print(predict_sentiment(example))


Positive 
Negative 
Positive 
Negative 
