In [2]:
# Step 1 – Load Dataset
import pandas as pd

# Upload your CSV file from Kaggle to Colab (or mount Google Drive)
# Dataset file: Tweets.csv
df = pd.read_csv("Tweets.csv")

# Keep only required columns
df = df[["airline_sentiment", "text"]]
df.head()


Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [7]:
import nltk
import shutil
import os

# Optional: remove the partially downloaded punkt_tab
data_dir = os.path.expanduser('~\\AppData\\Roaming\\nltk_data\\tokenizers\\punkt_tab')
if os.path.exists(data_dir):
    shutil.rmtree(data_dir)

nltk.download('punkt')
nltk.download('punkt_tab', force=True)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pugal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\pugal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [8]:
import nltk
import string
import re
from nltk.stem.porter import PorterStemmer

from nltk.corpus import stopwords
ps = PorterStemmer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'http.?://[^\s]+[\s]?', '', text)
    text = nltk.word_tokenize(text)
    y = []
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(ps.stem(i))
    return " ".join(y)

df["text_cleaned"] = df["text"].apply(clean_text)
df.head()


Unnamed: 0,airline_sentiment,text,text_cleaned
0,neutral,@VirginAmerica What @dhepburn said.,virginamerica dhepburn said
1,positive,@VirginAmerica plus you've added commercials t...,virginamerica plu 've ad commerci experi ... t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...,virginamerica n't today ... must mean need tak...
3,negative,@VirginAmerica it's really aggressive to blast...,virginamerica 's realli aggress blast obnoxi `...
4,negative,@VirginAmerica and it's a really big bad thing...,virginamerica 's realli big bad thing


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(df["text_cleaned"]).toarray()
Y = df["airline_sentiment"]


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

# --- Model 1: Multinomial Naive Bayes ---
nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))

# --- Model 2: Random Forest ---
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))


Naive Bayes Accuracy: 0.7213114754098361
Random Forest Accuracy: 0.7534153005464481
