In [2]:
# Task 7: Sentiment Analysis on Tweets
# Tools: Python, NLTK, scikit-learn, pandas

import pandas as pd
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Download stopwords if not already
nltk.download('stopwords')
from nltk.corpus import stopwords

# ---- 1. Load Dataset ----
# Example: Small sample dataset (you can replace with real tweets CSV)
data = {
    "tweet": [
        "I love this product! It's amazing ",
        "Worst service ever. Totally disappointed.",
        "Feeling happy about the new update",
        "This app is so bad and useless",
        "Great work team, very helpful app",
        "I hate this. Waste of time!"
    ],
    "label": [1, 0, 1, 0, 1, 0]  # 1=Positive, 0=Negative
}

df = pd.DataFrame(data)

# ---- 2. Text Cleaning Function ----
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text

df['clean_tweet'] = df['tweet'].apply(clean_text)

# ---- 3. Vectorization (Bag of Words / TF-IDF) ----
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['clean_tweet'])
y = df['label']

# ---- 4. Train-Test Split ----
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# ---- 5. Model Training ----
model = MultinomialNB()   # Naive Bayes
model.fit(X_train, y_train)

# ---- 6. Predictions ----
y_pred = model.predict(X_test)

# ---- 7. Evaluation ----
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kirut\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Accuracy: 0.5

Classification Report:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
