In [8]:
import numpy as np
import pandas as pd
import re
import nltk

In [9]:
# Load the dataset
df = pd.read_csv('Tweets.csv')

In [10]:
import pandas as pd
import re

In [11]:
def clean_text(text):
    if isinstance(text, str):  # Ensure the input is a string
        text = re.sub(r"@\w+", "", text)  # Remove @mentions
        text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
        text = re.sub(r"[^a-zA-Z0-9\s.,!?']", "", text)  # Remove special characters
        text = re.sub(r"\s+", " ", text).strip()  # Normalize whitespace
        text = text.lower()  # Convert to lowercase
        return text
    return text  # Return as-is if not a string


In [12]:
df = pd.read_csv("Tweets.csv") 

In [13]:
df["cleaned_text"] = df["text"].apply(clean_text)

In [14]:
print(df[["text", "cleaned_text"]].head())
df.to_csv("cleaned_dataset.csv", index=False)

                                                text  \
0                @VirginAmerica What @dhepburn said.   
1  @VirginAmerica plus you've added commercials t...   
2  @VirginAmerica I didn't today... Must mean I n...   
3  @VirginAmerica it's really aggressive to blast...   
4  @VirginAmerica and it's a really big bad thing...   

                                        cleaned_text  
0                                         what said.  
1  plus you've added commercials to the experienc...  
2  i didn't today... must mean i need to take ano...  
3  it's really aggressive to blast obnoxious ente...  
4           and it's a really big bad thing about it  


In [15]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download("punkt")
df["tokenized_text"] = df["cleaned_text"].apply(word_tokenize)

print(df[["cleaned_text", "tokenized_text"]].head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                        cleaned_text  \
0                                         what said.   
1  plus you've added commercials to the experienc...   
2  i didn't today... must mean i need to take ano...   
3  it's really aggressive to blast obnoxious ente...   
4           and it's a really big bad thing about it   

                                      tokenized_text  
0                                    [what, said, .]  
1  [plus, you, 've, added, commercials, to, the, ...  
2  [i, did, n't, today, ..., must, mean, i, need,...  
3  [it, 's, really, aggressive, to, blast, obnoxi...  
4  [and, it, 's, a, really, big, bad, thing, abou...  


In [16]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download("punkt")
nltk.download("stopwords")

stop_words = set(stopwords.words("english"))

def tokenize_and_remove_stopwords(text):
    if isinstance(text, str):
        words = word_tokenize(text)  # Tokenization
        filtered_words = [word for word in words if word.lower() not in stop_words]  # Stopword removal
        return " ".join(filtered_words)  # Join back into a string
    return text

df["filtered_text"] = df["cleaned_text"].apply(tokenize_and_remove_stopwords)

print(df[["cleaned_text", "filtered_text"]].head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                        cleaned_text  \
0                                         what said.   
1  plus you've added commercials to the experienc...   
2  i didn't today... must mean i need to take ano...   
3  it's really aggressive to blast obnoxious ente...   
4           and it's a really big bad thing about it   

                                       filtered_text  
0                                             said .  
1  plus 've added commercials experience ... tacky .  
2   n't today ... must mean need take another trip !  
3  's really aggressive blast obnoxious entertain...  
4                            's really big bad thing  


In [17]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()

def apply_stemming(text):
    if isinstance(text, str):
        words = word_tokenize(text)  # Tokenize text
        stemmed_words = [stemmer.stem(word) for word in words]  # Apply stemming
        return " ".join(stemmed_words)  # Join words back
    return text

df["stemmed_text"] = df["filtered_text"].apply(apply_stemming)

print(df[["filtered_text", "stemmed_text"]].head())

                                       filtered_text  \
0                                             said .   
1  plus 've added commercials experience ... tacky .   
2   n't today ... must mean need take another trip !   
3  's really aggressive blast obnoxious entertain...   
4                            's really big bad thing   

                                        stemmed_text  
0                                             said .  
1             plu 've ad commerci experi ... tacki .  
2     n't today ... must mean need take anoth trip !  
3  's realli aggress blast obnoxi entertain guest...  
4                            's realli big bad thing  


In [18]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(df["stemmed_text"])  # Apply on stemmed text

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df["stemmed_text"])  # Apply on stemmed text

count_df = pd.DataFrame(count_matrix.toarray(), columns=count_vectorizer.get_feature_names_out())
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

print(tfidf_df.head())

    00  000  000ft  000lb  0011  0016  0162389030167  0162424965446  \
0  0.0  0.0    0.0    0.0   0.0   0.0            0.0            0.0   
1  0.0  0.0    0.0    0.0   0.0   0.0            0.0            0.0   
2  0.0  0.0    0.0    0.0   0.0   0.0            0.0            0.0   
3  0.0  0.0    0.0    0.0   0.0   0.0            0.0            0.0   
4  0.0  0.0    0.0    0.0   0.0   0.0            0.0            0.0   

   0162431184663  0167560070877  ...  zigzag  zip  zipper  zombi  zone  zoom  \
0            0.0            0.0  ...     0.0  0.0     0.0    0.0   0.0   0.0   
1            0.0            0.0  ...     0.0  0.0     0.0    0.0   0.0   0.0   
2            0.0            0.0  ...     0.0  0.0     0.0    0.0   0.0   0.0   
3            0.0            0.0  ...     0.0  0.0     0.0    0.0   0.0   0.0   
4            0.0            0.0  ...     0.0  0.0     0.0    0.0   0.0   0.0   

   zrh  zuke  zurich  zurichnew  
0  0.0   0.0     0.0        0.0  
1  0.0   0.0     0.0    

In [21]:
from sklearn.model_selection import train_test_split

# Define features (X) and target labels (y)
X = df["stemmed_text"]  # Using stemmed text as input
y = df["airline_sentiment"]  # Sentiment labels (positive, negative, neutral)

# Split the dataset (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Show dataset sizes
print(f"Training Set: {len(X_train)} samples")
print(f"Testing Set: {len(X_test)} samples")

Training Set: 11712 samples
Testing Set: 2928 samples


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Convert text into TF-IDF features
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df["stemmed_text"])

# Define target labels
y = df["airline_sentiment"]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Train a Naïve Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Detailed classification report
print(classification_report(y_test, y_pred))

Model Accuracy: 0.70
              precision    recall  f1-score   support

    negative       0.69      0.99      0.81      1889
     neutral       0.72      0.14      0.24       580
    positive       0.90      0.19      0.32       459

    accuracy                           0.70      2928
   macro avg       0.77      0.44      0.46      2928
weighted avg       0.73      0.70      0.62      2928

