Installing the uploaded kaggle.json file for the Kaggle CLI

In [None]:
# Create a Kaggle folder and move the file
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/

# Set permissions
!chmod 600 ~/.kaggle/kaggle.json


Downloading the Sentiment140 Twitter sentiment dataset from Kaggle

In [None]:
!kaggle datasets download -d kazanova/sentiment140


Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 95% 77.0M/80.9M [00:00<00:00, 262MB/s]
100% 80.9M/80.9M [00:00<00:00, 264MB/s]


Importing the Dependencies

In [None]:
import pandas as pd  # For handling data
import numpy as np  # For numerical operations
import re  # For text cleaning
import nltk  # NLP processing
from nltk.corpus import stopwords  # Stopword removal
from nltk.tokenize import word_tokenize  # Tokenization
from sklearn.model_selection import train_test_split  # Train-test split
from sklearn.feature_extraction.text import TfidfVectorizer  # Convert text to numerical form
from sklearn.naive_bayes import MultinomialNB  # Naïve Bayes classifier
from sklearn.metrics import accuracy_score, classification_report  # Model evaluation

# Download necessary NLTK data.
#   'stopwords' - corpus read later via stopwords.words('english') in the cleaning cell.
#   'punkt'     - tokenizer package; presumably fetched for word_tokenize, which is
#                 imported above but not called in any cell shown here (TODO confirm
#                 whether it is still needed).
# The return value of the last call is what this cell displays as its output.
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

**Loading dataset into pandas**

In [None]:
import pandas as pd

# Load dataset (make sure you've already downloaded and extracted it).
# The CSV has no header row, so column names are supplied via `names`, and `usecols`
# reads only the two columns the rest of the notebook uses. Reading them directly,
# instead of slicing a wider frame afterwards, gives a frame that owns its data, so
# the label assignment below no longer raises SettingWithCopyWarning.
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    encoding='latin-1',
    header=None,
    names=['sentiment', 'id', 'date', 'query', 'user', 'text'],
    usecols=['sentiment', 'text'],
)

# Convert sentiment labels: 0 -> Negative, 4 -> Positive (change 4 to 1)
df['sentiment'] = df['sentiment'].map({0: 0, 4: 1})

# .map() turns any label missing from the dict into NaN; fail here rather than
# letting NaN labels reach model training.
assert df['sentiment'].notna().all(), "unexpected sentiment label (expected only 0 or 4)"

# Display first few rows
df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'] = df['sentiment'].map({0: 0, 4: 1})


Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


Cleaning the dataset

In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus (skipped by NLTK when it is already up to date)
nltk.download('stopwords')
# Hold the English stopwords in a set for fast membership checks during cleaning
stop_words = {word for word in stopwords.words('english')}

# Function to clean text
def clean_text(text, stopword_set=None):
    """Normalize a raw tweet into a space-separated string of lowercase words.

    Steps, in order: strip URLs (anything starting with ``http`` up to the next
    whitespace), replace every non-ASCII-letter character with a space,
    lowercase, split on whitespace, and drop stopwords.

    Args:
        text: The raw tweet. Non-string values (e.g. ``None`` or a float NaN from
            a missing cell) are treated as empty text instead of raising.
        stopword_set: Optional collection of lowercase words to remove. Defaults
            to the module-level ``stop_words`` set.

    Returns:
        The cleaned text as a single string; ``""`` when nothing remains.
    """
    # re.sub raises TypeError on non-strings, which would abort a whole .apply() pass.
    if not isinstance(text, str):
        return ""
    if stopword_set is None:
        stopword_set = stop_words

    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-zA-Z]", " ", text)  # Remove special characters and digits
    tokens = text.lower().split()  # Lowercase, then tokenize on whitespace
    return " ".join(token for token in tokens if token not in stopword_set)

# Run every tweet through clean_text and overwrite the text column with the result
df['text'] = df['text'].map(clean_text)

# Preview the cleaned tweets
df.head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,sentiment,text
0,0,switchfoot awww bummer shoulda got david carr ...
1,0,upset update facebook texting might cry result...
2,0,kenichan dived many times ball managed save re...
3,0,whole body feels itchy like fire
4,0,nationwideclass behaving mad see


Split Data into Training & Testing Sets

In [None]:
from sklearn.model_selection import train_test_split

# X holds the cleaned tweets, y the sentiment labels (0 = Negative, 1 = Positive)
X, y = df['text'], df['sentiment']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many rows landed in each partition
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")


Train size: 1280000, Test size: 320000


Convert Text into TF-IDF Vectors

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer capped at 5000 features for efficiency
vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary and IDF weights from the training tweets only, then
# reuse that fitted vectorizer on the test tweets
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Report the dimensions of both matrices
print(
    f"Train TF-IDF shape: {X_train_tfidf.shape}, "
    f"Test TF-IDF shape: {X_test_tfidf.shape}"
)


Train TF-IDF shape: (1280000, 5000), Test TF-IDF shape: (320000, 5000)


Train the Naïve Bayes Classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Build the multinomial Naïve Bayes classifier and fit it on the training
# TF-IDF matrix in one step (fit() returns the fitted estimator itself)
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

print("Model training completed!")


Model training completed!


Evaluating Model Performance

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Label the held-out tweets with the fitted classifier
y_pred = nb_model.predict(X_test_tfidf)

# Overall fraction of test tweets labelled correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Per-class precision, recall, F1 and support
print(
    "\nClassification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
)


Model Accuracy: 0.7547

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.76      0.76    159494
           1       0.76      0.75      0.75    160506

    accuracy                           0.75    320000
   macro avg       0.75      0.75      0.75    320000
weighted avg       0.75      0.75      0.75    320000



Testing Model on New Tweets

In [None]:
# Example tweets for testing
new_tweets = [
    "I love this product! It's amazing 😊",  # Positive
    "This is the worst experience ever. So disappointed! 😡",  # Negative
    "The movie was okay, not too great, not too bad.",  # Neutral (but our model predicts only positive/negative)
]

# Apply the same cleaning that was applied to the training text (URL removal,
# letters only, lowercasing, stopword removal) so the vectorizer receives input
# in the form it was fitted on.
new_tweets_clean = [clean_text(raw_tweet) for raw_tweet in new_tweets]

# Convert the cleaned tweets to TF-IDF format using the already-fitted vectorizer
new_tweets_tfidf = vectorizer.transform(new_tweets_clean)

# Predict sentiment
predictions = nb_model.predict(new_tweets_tfidf)

# Print results next to the original (uncleaned) tweet for readability
for tweet, sentiment in zip(new_tweets, predictions):
    print(f"Tweet: {tweet} → Sentiment: {'Positive' if sentiment == 1 else 'Negative'}")


Tweet: I love this product! It's amazing 😊 → Sentiment: Positive
Tweet: This is the worst experience ever. So disappointed! 😡 → Sentiment: Negative
Tweet: The movie was okay, not too great, not too bad. → Sentiment: Positive
