In [2]:
!pip install gensim

import pandas as pd
import numpy as np
import re
import string
import nltk
import zipfile
import gensim.downloader as api

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


from google.colab import drive
drive.mount('/content/drive')


zip_path = '/content/drive/MyDrive/Colab Notebooks/Tweets.csv.zip'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    print("Contents of ZIP:", zip_ref.namelist())  # Check inside ZIP
    zip_ref.extractall('/content/')  # Extracts to /content/


df = pd.read_csv('/content/Tweets.csv')[['airline_sentiment', 'text']]
df.dropna(inplace=True)


lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_tweet(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r"@[\w]+", '', text)
    text = re.sub(r"#[\w]+", '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))

    contractions = {"don't": "do not", "can't": "cannot", "i'm": "i am", "it's": "it is"}
    for k, v in contractions.items():
        text = text.replace(k, v)

    tokens = text.split()
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words and w.isalpha()]
    return tokens

df['tokens'] = df['text'].apply(preprocess_tweet)


w2v = api.load("glove-wiki-gigaword-100")  # 100D embedding


def get_vector(tokens):
    vectors = [w2v[word] for word in tokens if word in w2v]
    return np.mean(vectors, axis=0) if vectors else np.zeros(100)

df['vector'] = df['tokens'].apply(get_vector)


X = np.vstack(df['vector'].values)
y = df['airline_sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000, multi_class='multinomial')
model.fit(X_train, y_train)


y_pred = model.predict(X_test)
print("✅ Model Accuracy:", accuracy_score(y_test, y_pred))


def predict_tweet_sentiment(model, w2v_model, tweet):
    tokens = preprocess_tweet(tweet)
    vector = get_vector(tokens).reshape(1, -1)
    return model.predict(vector)[0]


example = "Flight was delayed but staff was very helpful!"
print("Prediction:", predict_tweet_sentiment(model, w2v, example))




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Contents of ZIP: ['Tweets.csv']




✅ Model Accuracy: 0.7568306010928961
Prediction: negative
