In [1]:
!pip install gensim
!pip install nltk



In [2]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk


nltk.download('punkt_tab')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


def preprocess(text):
    tokens = word_tokenize(text.lower())
    return [word for word in tokens if word.isalpha() and word not in stop_words]

def vectorize(tokens, w2v_model):
    valid_vecs = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
    return np.mean(valid_vecs, axis=0) if valid_vecs else np.zeros(w2v_model.vector_size)


twitter_data = pd.read_csv('0000000000002747_test_twitter_x_test.csv')
twitter_data['Tokens'] = twitter_data['text'].apply(preprocess)


w2v_twitter = Word2Vec(twitter_data['Tokens'], vector_size=300, window=5, min_count=1, workers=4)
twitter_data['Vector'] = twitter_data['Tokens'].apply(lambda tokens: vectorize(tokens, w2v_twitter))


X_twitter = np.vstack(twitter_data['Vector'].values)
y_twitter = pd.factorize(twitter_data['airline_sentiment_gold'])[0]  # Encoding sentiments


X_twitter_train, X_twitter_test, y_twitter_train, y_twitter_test = train_test_split(X_twitter, y_twitter, test_size=0.2, random_state=42)
model_twitter = LogisticRegression(max_iter=1000, solver='liblinear', multi_class='ovr')
model_twitter.fit(X_twitter_train, y_twitter_train)
preds_twitter = model_twitter.predict(X_twitter_test)
accuracy_twitter = accuracy_score(y_twitter_test, preds_twitter)

print("USA Airline Sentiment Analysis Accuracy:", accuracy_twitter)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


USA Airline Sentiment Analysis Accuracy: 0.9959016393442623
