<a href="https://colab.research.google.com/github/Rakesshreghu/datascience/blob/main/level_3_task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Step 1: Import libraries
import pandas as pd
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix


In [9]:
#  Download required NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [10]:
#  Load dataset
df = pd.read_csv("/content/3) Sentiment dataset.csv")

# Keep only required columns
df = df[['Text', 'Sentiment']]

print("Dataset Shape:", df.shape)
df.head()


Dataset Shape: (732, 2)


Unnamed: 0,Text,Sentiment
0,Enjoying a beautiful day at the park! ...,Positive
1,Traffic was terrible this morning. ...,Negative
2,Just finished an amazing workout! 💪 ...,Positive
3,Excited about the upcoming weekend getaway! ...,Positive
4,Trying out a new recipe for dinner tonight. ...,Neutral


In [11]:
# : Define text preprocessing function
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):

    text = text.lower()

    text = re.sub(r"http\S+|www\S+|https\S+", '', text)

    text = re.sub(r"@\w+|#\w+", '', text)

    text = re.sub(r"\d+", '', text)

    text = text.translate(str.maketrans('', '', string.punctuation))

    tokens = word_tokenize(text)

    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)


In [12]:
# Apply preprocessing to dataset
df['Clean_Text'] = df['Text'].apply(preprocess_text)

print("Before Processing:\n", df['Text'].head(3))
print("\nAfter Processing:\n", df['Clean_Text'].head(3))


Before Processing:
 0     Enjoying a beautiful day at the park!        ...
1     Traffic was terrible this morning.           ...
2     Just finished an amazing workout! 💪          ...
Name: Text, dtype: object

After Processing:
 0    enjoying beautiful day park
1       traffic terrible morning
2     finished amazing workout 💪
Name: Clean_Text, dtype: object


In [15]:
#  Convert text into numerical representation (TF-IDF)
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['Clean_Text'])
y = df['Sentiment']

print("Feature Matrix Shape:", X.shape)


Feature Matrix Shape: (732, 2171)


In [16]:

df['Clean_Text'] = df['Text'].apply(preprocess_text)

print("Before Processing:\n", df['Text'].head(3))
print("\nAfter Processing:\n", df['Clean_Text'].head(3))


Before Processing:
 0     Enjoying a beautiful day at the park!        ...
1     Traffic was terrible this morning.           ...
2     Just finished an amazing workout! 💪          ...
Name: Text, dtype: object

After Processing:
 0    enjoying beautiful day park
1       traffic terrible morning
2     finished amazing workout 💪
Name: Clean_Text, dtype: object


In [17]:

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['Clean_Text'])
y = df['Sentiment']

print("Feature Matrix Shape:", X.shape)


Feature Matrix Shape: (732, 2171)


In [18]:
#  Train/Test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])


Training samples: 585
Testing samples: 147


In [19]:


# Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)


In [20]:
#  Evaluate models

# Logistic Regression
print("=== Logistic Regression Report ===")
y_pred_lr = lr_model.predict(X_test)
print(classification_report(y_test, y_pred_lr))
print("Confusion Matrix (Logistic Regression):\n", confusion_matrix(y_test, y_pred_lr))

# Naive Bayes
print("\n=== Naive Bayes Report ===")
y_pred_nb = nb_model.predict(X_test)
print(classification_report(y_test, y_pred_nb))
print("Confusion Matrix (Naive Bayes):\n", confusion_matrix(y_test, y_pred_nb))


=== Logistic Regression Report ===
                        precision    recall  f1-score   support

         Acceptance          0.00      0.00      0.00         2
           Admiration        0.00      0.00      0.00         1
        Admiration           0.00      0.00      0.00         1
         Affection           0.00      0.00      0.00         1
      Ambivalence            0.00      0.00      0.00         1
         Anger               0.00      0.00      0.00         1
        Anticipation         0.00      0.00      0.00         1
        Arousal              0.00      0.00      0.00         3
                  Awe        0.00      0.00      0.00         1
         Awe                 0.00      0.00      0.00         1
                  Bad        0.00      0.00      0.00         1
             Betrayal        0.00      0.00      0.00         2
        Betrayal             0.00      0.00      0.00         1
         Bitter              0.00      0.00      0.00         1
    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
