Getting Started

In [3]:
import pandas as pd

# Load the dataset
df = pd.read_csv('Tweets.csv')

# Display the first few rows to check the data
df.head()


Unnamed: 0,tweet_id,sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,review,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [5]:
# Select only the necessary columns
df = df[['review', 'sentiment']]

# Check for missing values
print(df.isnull().sum())

# Drop rows with missing values if any
df = df.dropna()

# Clean the text data
import string

def clean_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove mentions (e.g., @user)
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags (e.g., #hashtag)
    text = re.sub(r'#\w+', '', text)
    # Convert to lowercase
    text = text.lower()
    return text

# Apply the clean_text function to the review column
df['review'] = df['review'].apply(clean_text)

# Display the cleaned data
df.head()


review       0
sentiment    0
dtype: int64


Unnamed: 0,review,sentiment
0,virginamerica what dhepburn said,neutral
1,virginamerica plus youve added commercials to ...,positive
2,virginamerica i didnt today must mean i need t...,neutral
3,virginamerica its really aggressive to blast o...,negative
4,virginamerica and its a really big bad thing a...,negative


In [6]:
# Map sentiment labels to numerical values
sentiment_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
df['sentiment'] = df['sentiment'].map(sentiment_mapping)

# Verify the mapping
df.head()


Unnamed: 0,review,sentiment
0,virginamerica what dhepburn said,1
1,virginamerica plus youve added commercials to ...,2
2,virginamerica i didnt today must mean i need t...,1
3,virginamerica its really aggressive to blast o...,0
4,virginamerica and its a really big bad thing a...,0


In [8]:
import re

# List of airline names to remove
airlines = ['americanair', 'united', 'southwestair', 'delta', 'jetblue', 'usairways', 'virginamerica']

def remove_airline_names(text):
    for airline in airlines:
        text = re.sub(r'\b' + re.escape(airline) + r'\b', '', text, flags=re.IGNORECASE)
    return text

# Apply the function to remove airline names
df['review'] = df['review'].apply(remove_airline_names)

# Verify the changes
df.head()


Unnamed: 0,review,sentiment
0,what dhepburn said,1
1,plus youve added commercials to the experienc...,2
2,i didnt today must mean i need to take anothe...,1
3,its really aggressive to blast obnoxious ente...,0
4,and its a really big bad thing about it,0


In [9]:
import string

# Function to preprocess text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    # Tokenize
    words = text.split()
    return ' '.join(words)

# Apply the function to preprocess the reviews
df['review'] = df['review'].apply(preprocess_text)

# Verify the changes
df.head()


Unnamed: 0,review,sentiment
0,what dhepburn said,1
1,plus youve added commercials to the experience...,2
2,i didnt today must mean i need to take another...,1
3,its really aggressive to blast obnoxious enter...,0
4,and its a really big bad thing about it,0


In [10]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.3, random_state=42, stratify=df['sentiment'])

# Verify the split
print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))


Training set size: 10248
Testing set size: 4392


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Define the pipeline
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=13000, stop_words='english')),
    ('smote', SMOTE(random_state=42)),
    ('classifier', MultinomialNB())
])

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Generate a classification report
print(classification_report(y_test, y_pred))

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)


Accuracy: 0.7645719489981785
              precision    recall  f1-score   support

           0       0.83      0.88      0.86      2753
           1       0.59      0.47      0.52       930
           2       0.67      0.70      0.68       709

    accuracy                           0.76      4392
   macro avg       0.70      0.68      0.69      4392
weighted avg       0.76      0.76      0.76      4392

[[2424  205  124]
 [ 370  436  124]
 [ 114   97  498]]


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the vectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer on training data
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the test data
X_test_tfidf = vectorizer.transform(X_test)


In [23]:
print(len(vectorizer.get_feature_names_out()))


12501


In [24]:
print(f"Vocabulary size: {len(vectorizer.get_feature_names_out())}")


Vocabulary size: 12501
