In [86]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag

In [87]:
# Download the NLTK data packages needed below:
# 'punkt' and 'averaged_perceptron_tagger'.
# NOTE(review): newer NLTK releases may look these resources up under the
# names 'punkt_tab' and 'averaged_perceptron_tagger_eng' -- confirm against
# the installed NLTK version.

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/suhas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/suhas/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [88]:
# Sample sentence
sentence = "Suhas is learning POS tagging using Python."

# Step 1: tokenize the sentence into a list of word / punctuation tokens
tokens = word_tokenize(sentence)
print("Tokens:",tokens)

# Step 2: apply POS tagging; the result is a list of (token, tag) pairs
pos_tags = pos_tag(tokens)
print("\n POS Tags:",pos_tags)

Tokens: ['Suhas', 'is', 'learning', 'POS', 'tagging', 'using', 'Python', '.']

 POS Tags: [('Suhas', 'NNP'), ('is', 'VBZ'), ('learning', 'VBG'), ('POS', 'NNP'), ('tagging', 'VBG'), ('using', 'VBG'), ('Python', 'NNP'), ('.', '.')]


## Example of POS Tagging

In [89]:
import numpy as np
import pandas as pd
import nltk

In [90]:
# Download necessary resources.
# These are the same two packages requested by the download cell near the top
# of the notebook; nltk reports them as already up-to-date in that case.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/suhas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/suhas/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [91]:
# Dataset: ten hand-written sentences, each paired with a binary label
# (1 or 0), then repeated to obtain a 200-row frame.
# Because every sentence occurs N_REPEATS times, any random row-level split
# of this frame will contain copies of the same sentences on both sides.
labelled_examples = [
    ("I love this product.", 1),
    ("This is a bad experience.", 0),
    ("The service was great.", 1),
    ("I will never buy this again.", 0),
    ("Excellent quality and amazing service.", 1),
    ("The food was terrible.", 0),
    ("Highly recommend this item.", 1),
    ("Very disappointed with the purchase.", 0),
    ("The movie was fantastic.", 1),
    ("It was a complete waste of time.", 0),
]
N_REPEATS = 20

# Unzip the pairs into parallel columns and repeat each column
example_texts, example_labels = zip(*labelled_examples)
data = {
    "sentence": list(example_texts) * N_REPEATS,
    "label": list(example_labels) * N_REPEATS,
}

# Convert into a DataFrame with columns "sentence" and "label"
df = pd.DataFrame(data)

In [92]:
# Display the frame; the rendered output elides the middle rows.
df

Unnamed: 0,sentence,label
0,I love this product.,1
1,This is a bad experience.,0
2,The service was great.,1
3,I will never buy this again.,0
4,Excellent quality and amazing service.,1
...,...,...
195,The food was terrible.,0
196,Highly recommend this item.,1
197,Very disappointed with the purchase.,0
198,The movie was fantastic.,1


In [93]:
# Tokenize and apply pos tagging
from nltk.tokenize import word_tokenize
from nltk import pos_tag

def tokenize_and_pos_tagging(sentence):
    """Return ``sentence`` as a space-separated string of ``word_TAG`` items.

    The sentence is split with ``word_tokenize`` and the tokens are tagged
    with ``pos_tag``; each (word, tag) pair is then joined with an underscore,
    e.g. ``"I_PRP love_VBP this_DT product_NN ._."``.
    """
    tagged_words = pos_tag(word_tokenize(sentence))
    pieces = []
    for token, pos in tagged_words:
        pieces.append("{}_{}".format(token, pos))
    return " ".join(pieces)

# Apply POS tagging to every sentence and store the resulting "word_TAG ..."
# string in a new 'pos_tags' column (this adds a column to df in place).
df['pos_tags'] = df['sentence'].apply(tokenize_and_pos_tagging)

    

In [94]:
# Preview the first rows, now including the 'pos_tags' column.
df.head()

Unnamed: 0,sentence,label,pos_tags
0,I love this product.,1,I_PRP love_VBP this_DT product_NN ._.
1,This is a bad experience.,0,This_DT is_VBZ a_DT bad_JJ experience_NN ._.
2,The service was great.,1,The_DT service_NN was_VBD great_JJ ._.
3,I will never buy this again.,0,I_PRP will_MD never_RB buy_VB this_DT again_RB...
4,Excellent quality and amazing service.,1,Excellent_JJ quality_NN and_CC amazing_JJ serv...


In [95]:
# step 3: convert text to numerical features using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Split the dataset into features X (the "word_TAG ..." strings) and labels y
X = df['pos_tags']
y = df['label']

# Split the raw text BEFORE vectorizing, so the TF-IDF vocabulary and IDF
# weights are learned from the training rows only. Fitting the vectorizer on
# the full dataset first would leak information from the test rows.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Convert text to numerical features using TF-IDF:
# fit on the training text, then reuse the fitted vectorizer for the test text
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Full-dataset matrix, kept under its original name in case other cells use it.
# It is produced with transform() only, so it does not affect the fitted state.
x_vectorized = vectorizer.transform(X)

# NOTE: df repeats each sentence many times, so identical sentences still end
# up in both the training and the test split; scores computed on X_test are
# therefore optimistic.

In [96]:
# Train a Random Forest classifier and evaluate it on the test split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report
from sklearn.ensemble import RandomForestClassifier

# Random Forest is the active classifier. Its training is randomized, so the
# seed is pinned to make a fresh "Restart & Run All" reproduce the same model
# and the same metrics.
# Alternative baseline: model = LogisticRegression()
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test split
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print('accuracy:', accuracy)

# Per-class precision, recall, F1 and support
report = classification_report(y_test, y_pred)
print(report)

accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      1.00      1.00        24

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40



In [97]:
# Predict sentiment for new sentences

def predict_sentiment(sentence):
    """Classify one sentence as ``"positive"`` or ``"negative"``.

    The sentence is converted to its ``word_TAG`` representation with
    ``tokenize_and_pos_tagging``, vectorized with the notebook-level
    ``vectorizer``, and passed to the notebook-level ``model``. A predicted
    label of 1 maps to "positive"; anything else maps to "negative".
    """
    tagged_text = tokenize_and_pos_tagging(sentence)
    features = vectorizer.transform([tagged_text])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "positive"
    return "negative"

# Unseen sentences used as a quick manual check of the trained model
new_sentences = [
    "I am very happy with this service.",
    "The product quality is awful.",
    "Amazing experience!",
    "I would not recommend this.",
]

# Print each sentence followed by the sentiment the model assigns to it
for sentence in new_sentences:
    print(f"Sentence: {sentence}")
    predicted = predict_sentiment(sentence)
    print(f"Predicted Sentiment: {predicted}\n")

Sentence: I am very happy with this service.
Predicted Sentiment: positive

Sentence: The product quality is awful.
Predicted Sentiment: negative

Sentence: Amazing experience!
Predicted Sentiment: negative

Sentence: I would not recommend this.
Predicted Sentiment: positive

