In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Sample dataset
data = {
    'text': [
        "I love this product!", 
        "This is the worst thing I have ever bought.",
        "Absolutely fantastic service!", 
        "I am not happy with this item.",
        "The food was great!", 
        "Terrible experience, will not come back.",
        "Amazing performance by the actors.", 
        "Not worth the money.",
        "I am extremely satisfied with the results.", 
        "Very disappointing."
    ],
    'sentiment': [
        'positive', 
        'negative', 
        'positive', 
        'negative',
        'positive', 
        'negative', 
        'positive', 
        'negative',
        'positive', 
        'negative'
    ]
}

df = pd.DataFrame(data)
df.head(10)

[nltk_data] Downloading package punkt to /Users/binodrai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/binodrai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/binodrai/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,text,sentiment
0,I love this product!,positive
1,This is the worst thing I have ever bought.,negative
2,Absolutely fantastic service!,positive
3,I am not happy with this item.,negative
4,The food was great!,positive
5,"Terrible experience, will not come back.",negative
6,Amazing performance by the actors.,positive
7,Not worth the money.,negative
8,I am extremely satisfied with the results.,positive
9,Very disappointing.,negative


In [2]:
# Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

df['text'] = df['text'].apply(preprocess_text)
df.head(10)

Unnamed: 0,text,sentiment
0,love product,positive
1,worst thing ever bought,negative
2,absolutely fantastic service,positive
3,happy item,negative
4,food great,positive
5,terrible experience come back,negative
6,amazing performance actor,positive
7,worth money,negative
8,extremely satisfied result,positive
9,disappointing,negative


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming df['text'] contains your text data and df['sentiment'] contains the labels
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text'])
y = df['sentiment']

In [4]:
X

<10x26 sparse matrix of type '<class 'numpy.float64'>'
	with 26 stored elements in Compressed Sparse Row format>

In [5]:
y

0    positive
1    negative
2    positive
3    negative
4    positive
5    negative
6    positive
7    negative
8    positive
9    negative
Name: sentiment, dtype: object

In [6]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

In [7]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(classification_report(y_test, y_pred))

Accuracy: 0.3333333333333333
Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         2
    positive       0.33      1.00      0.50         1

    accuracy                           0.33         3
   macro avg       0.17      0.50      0.25         3
weighted avg       0.11      0.33      0.17         3



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
# Example of new text data
new_texts = [
    "The design of this product is very stylish and modern.",
    "The delivery took too long, very frustrating.",
    "Customer support was very helpful and resolved my issue quickly.",
    "I am not impressed with the durability of this item.",
    "An exceptional product, exceeded all my expectations!"
]

# Preprocess and vectorize the new text data
new_texts = [preprocess_text(text) for text in new_texts]
new_X = vectorizer.transform(new_texts)

# Make predictions
new_predictions = model.predict(new_X)
for text, sentiment in zip(new_texts, new_predictions):
    print(f'Text: {text}\nPredicted Sentiment: {sentiment}\n')

Text: design product stylish modern
Predicted Sentiment: positive

Text: delivery took long frustrating
Predicted Sentiment: positive

Text: customer support helpful resolved issue quickly
Predicted Sentiment: positive

Text: impressed durability item
Predicted Sentiment: negative

Text: exceptional product exceeded expectation
Predicted Sentiment: positive

