# SENTIMENT ANALYSIS WITH NLP Task 2

PERFORM SENTIMENT ANALYSIS ON A DATASET OF CUSTOMER REVIEWS USING TF-IDF VECTORIZATION AND LOGISTIC REGRESSION.

In [9]:
pip install pandas scikit-learn


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [11]:
import pandas as pd

# Create a sample dataset
data = {
    'review': [
        'I love this product! It works great.',
        'Terrible experience, waste of money.',
        'Absolutely fantastic service and support!',
        'Not what I expected. Very disappointed.',
        'It’s okay, not the best but not bad either.',
        'Worst product ever, do not buy!',
        'Highly recommend it. Excellent quality.',
        'The product broke after one use. Terrible.',
        'Very satisfied with my purchase.',
        'It was a decent product, could be better.'
    ],
    'sentiment': [1, 0, 1, 0, 0, 0, 1, 0, 1, 0]  # 1 = Positive, 0 = Negative
}

df = pd.DataFrame(data)


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 1. Split the dataset
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.3, random_state=42)

# 2. TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.95)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 3. Train Logistic Regression Model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# 4. Predict & Evaluate
y_pred = model.predict(X_test_tfidf)

print("🔍 Accuracy:", accuracy_score(y_test, y_pred))
print("📊 Classification Report:\n", classification_report(y_test, y_pred))


🔍 Accuracy: 0.6666666666666666
📊 Classification Report:
               precision    recall  f1-score   support

           0       0.67      1.00      0.80         2
           1       0.00      0.00      0.00         1

    accuracy                           0.67         3
   macro avg       0.33      0.50      0.40         3
weighted avg       0.44      0.67      0.53         3



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


DELIVERABLE: A JUPYTER NOTEBOOK SHOWCASING PREPROCESSING, MODELING, AND SENTIMENT EVALUATION

In [20]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


In [22]:
# Sample dataset: Replace this with your CSV file if needed
data = {
    'review': [
        'I love this product! It works great.',
        'Terrible experience, waste of money.',
        'Absolutely fantastic service and support!',
        'Not what I expected. Very disappointed.',
        'It’s okay, not the best but not bad either.',
        'Worst product ever, do not buy!',
        'Highly recommend it. Excellent quality.',
        'The product broke after one use. Terrible.',
        'Very satisfied with my purchase.',
        'It was a decent product, could be better.'
    ],
    'sentiment': [1, 0, 1, 0, 0, 0, 1, 0, 1, 0]  # 1 = Positive, 0 = Negative
}

df = pd.DataFrame(data)
df.head()


Unnamed: 0,review,sentiment
0,I love this product! It works great.,1
1,"Terrible experience, waste of money.",0
2,Absolutely fantastic service and support!,1
3,Not what I expected. Very disappointed.,0
4,"It’s okay, not the best but not bad either.",0


In [24]:
# Function to clean text
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

# Apply cleaning
df['clean_review'] = df['review'].apply(clean_text)
df.head()


Unnamed: 0,review,sentiment,clean_review
0,I love this product! It works great.,1,i love this product it works great
1,"Terrible experience, waste of money.",0,terrible experience waste of money
2,Absolutely fantastic service and support!,1,absolutely fantastic service and support
3,Not what I expected. Very disappointed.,0,not what i expected very disappointed
4,"It’s okay, not the best but not bad either.",0,its okay not the best but not bad either


In [26]:
X = df['clean_review']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [28]:
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.95)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [30]:
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)


In [32]:
y_pred = model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.6666666666666666
Classification Report:
               precision    recall  f1-score   support

           0       0.67      1.00      0.80         2
           1       0.00      0.00      0.00         1

    accuracy                           0.67         3
   macro avg       0.33      0.50      0.40         3
weighted avg       0.44      0.67      0.53         3



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
def predict_sentiment(text):
    text_clean = clean_text(text)
    text_vector = vectorizer.transform([text_clean])
    prediction = model.predict(text_vector)[0]
    return "Positive" if prediction == 1 else "Negative"

# Example
print(predict_sentiment("The product is awesome and worth the price!"))


Negative
