# Problem 2: Twitter US Airline Sentiment Classification

In [None]:
!pip install gensim
!pip install contractions

In [2]:
import pandas as pd
import numpy as np
import re
import string
import gensim.downloader as api
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import contractions
import nltk

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www.\S+', '', text)
    text = re.sub(r'@\w+|#\w+|[^\w\s]', '', text)
    text = contractions.fix(text)
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    text = ' '.join([word for word in tokens if word.isalpha()])
    return text

In [5]:
w2v_model = api.load('word2vec-google-news-300')



In [6]:
def tweet_to_vector(tweet, w2v_model):
    words = tweet.split()
    word_vectors = []
    for word in words:
        if word in w2v_model:
            word_vectors.append(w2v_model[word])
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(w2v_model.vector_size)

In [7]:
df = pd.read_csv('Tweets.csv')
df = df[['airline_sentiment', 'text']]
print(df.head())

  airline_sentiment                                               text
0           neutral                @VirginAmerica What @dhepburn said.
1          positive  @VirginAmerica plus you've added commercials t...
2           neutral  @VirginAmerica I didn't today... Must mean I n...
3          negative  @VirginAmerica it's really aggressive to blast...
4          negative  @VirginAmerica and it's a really big bad thing...


In [10]:
df['clean_text'] = df['text'].apply(preprocess_text)
df['vector'] = df['clean_text'].apply(lambda x: tweet_to_vector(x, w2v_model))

X = np.array(df['vector'].tolist())
y = df['airline_sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Test Accuracy: {accuracy:.4f}')

Test Accuracy: 0.7869


In [12]:
def predict_tweet_sentiment(model, w2v_model, tweet):
    processed = preprocess_text(tweet)
    vector = tweet_to_vector(processed, w2v_model).reshape(1, -1)
    prediction = model.predict(vector)[0]
    return prediction

In [13]:
example_tweet = 'I had a great experience with the airline today!'
predicted_sentiment = predict_tweet_sentiment(model, w2v_model, example_tweet)
print(f'Predicted Sentiment: {predicted_sentiment}')

Predicted Sentiment: positive
