# Preparing Data for Sentiment Analysis

In [None]:
import pandas as pd 
# Read the tweets CSV into the notebook-wide frame and preview its first rows
# (head() with no argument shows five rows).
df = pd.read_csv('Tweets.csv')
df.head()

In [None]:
# Show full cell contents instead of truncating long text columns.
pd.set_option("display.max_colwidth", None)

# Seed the draw so the same five example rows come back on every
# "Restart & Run All"; an unseeded sample changes from run to run.
# NOTE(review): the original comment listed rows [1106, 4860, 6977, 8884, 9108],
# presumably from one earlier unseeded run - the seeded draw will differ.
examples_idx = df.sample(5, random_state=42).index

# Explicit copy: df_sample gets new columns later and must not alias df.
df_sample = df.loc[examples_idx].copy()

## Traditional NLP pre-processing

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
import re 
import spacy 

# Shared spaCy pipeline; the tokenizing and lemmatizing helpers in this notebook read it as a global.
nlp = spacy.load(name="en_core_web_sm")

def clean_text(text):
    """Normalize a raw tweet for downstream tokenization.

    Steps, in order:
      1. drop @mentions, #hashtags and http(s) URLs,
      2. drop every character that is neither a word character nor whitespace,
      3. lower-case,
      4. collapse whitespace runs to single spaces and trim both ends.

    Step 4 matters because removing a mention or URL leaves leading or doubled
    spaces behind; spaCy turns those into whitespace tokens that are not stop
    words and would otherwise survive into the processed text.

    Args:
        text: The raw tweet as a string.

    Returns:
        The cleaned, lower-cased string with single-space separators.
    """
    text = re.sub(r'@\w+|#\w+|https?://\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # split() with no argument discards leading, trailing and repeated whitespace.
    return ' '.join(text.lower().split())

# Clean every sampled tweet, then show raw and cleaned text side by side.
df_sample['cleaned_text'] = df_sample['text'].map(clean_text)
df_sample.loc[:, ["text", "cleaned_text"]]

In [None]:
def tokenize_and_remove_stopwords(row):
    """Tokenize a row's cleaned text and record it with and without stop words.

    Reads row['cleaned_text'], runs it through the module-level spaCy pipeline
    `nlp`, and writes three fields back onto the same row:
      - 'all_text_tokens': text of every token,
      - 'without_stop_words_tokens': text of tokens spaCy does not flag as stop words,
      - 'processed_text': the non-stop-word tokens joined by single spaces.

    Returns:
        The same row object, with the three fields added.
    """
    every_token = []
    kept_tokens = []
    for tok in nlp(row['cleaned_text']):
        every_token.append(tok.text)
        if not tok.is_stop:
            kept_tokens.append(tok.text)

    row['all_text_tokens'] = every_token
    row['without_stop_words_tokens'] = kept_tokens
    row['processed_text'] = ' '.join(kept_tokens)
    return row

# Tokenize row by row, then display the text before and after stop-word removal.
df_sample = df_sample.apply(tokenize_and_remove_stopwords, axis='columns')
token_columns = ['cleaned_text', 'all_text_tokens', 'without_stop_words_tokens', 'processed_text']
df_sample[token_columns]

In [None]:
def lemmatize_text(text):
    """Return `text` with each token replaced by its spaCy lemma.

    The text is parsed with the module-level pipeline `nlp`; the lemmas are
    joined back together with single spaces.
    """
    return ' '.join(token.lemma_ for token in nlp(text))

# Lemmatize the stop-word-free text and compare it with its input.
df_sample['final_text'] = df_sample['processed_text'].map(lemmatize_text)
df_sample.loc[:, ['processed_text', 'final_text']]

## GenAI for data augmentation

In [None]:
import matplotlib.pyplot as plt 
from datetime import datetime 

# Count tweets per (airline, sentiment) and pivot the sentiments into columns;
# combinations with no tweets become 0.
sentiment_by_airline = df.groupby(['airline', 'airline_sentiment']).size().unstack().fillna(0)

# Create the axes first and hand them to pandas. Without ax=, DataFrame.plot
# opens a figure of its own, so a preceding plt.figure(figsize=...) is left
# empty and the requested size never reaches the chart.
fig, ax = plt.subplots(figsize=(14, 6))
# Colors are applied in column order - presumably negative, neutral, positive.
sentiment_by_airline.plot(kind='bar', stacked=True, color=['red', 'yellow', 'green'], ax=ax)
ax.set_title('Sentiment Distribution by Airline')
ax.set_xlabel('Airline')
ax.set_ylabel('Number of Tweets')
ax.tick_params(axis='x', labelrotation=45)
ax.legend(title='Sentiment')
fig.tight_layout()
plt.show()

In [None]:
# Number of tweets per sentiment label.
df.airline_sentiment.value_counts()

In [None]:
from sklearn.utils import resample

# Split the tweets by sentiment label.
negative = df[df.airline_sentiment == 'negative']
neutral = df[df.airline_sentiment == 'neutral']
positive = df[df.airline_sentiment == 'positive']

# Shrink the negative class to the size of the positive class.
# replace=False: resample() bootstraps (samples WITH replacement) by default,
# which would put duplicate rows in the "downsampled" set.
# random_state: keeps the chosen rows identical between runs.
# min(): sampling without replacement cannot ask for more rows than exist.
negative_downsampled = resample(
    negative,
    replace=False,
    n_samples=min(len(negative), len(positive)),
    random_state=42,
)

# The neutral and positive rows are kept as they are.
df_downsampled = pd.concat([negative_downsampled, neutral, positive])

In [None]:
from transformers import pipeline 

# Text-generation pipeline backed by distilgpt2; augment_text() reads it as a global.
generator = pipeline(task='text-generation', model='distilgpt2')

def augment_text(text, augment_times=2):
    """Generate `augment_times` continuations of `text` with the global `generator`.

    All continuations are requested in a single pipeline call with sampling
    switched on explicitly. Relying on the model's default decoding settings
    is fragile: under greedy decoding every repeated call returns the same
    text, so the "augmented" examples would all be identical.

    Args:
        text: Prompt to continue.
        augment_times: Number of continuations to produce. Values <= 0 yield
            an empty list without calling the generator.

    Returns:
        A list of generated strings (prompt included), each stripped of
        leading and trailing whitespace.
    """
    if augment_times <= 0:
        return []

    generated = generator(
        text,
        max_length=60,  # cap on total length, prompt tokens included
        num_return_sequences=augment_times,
        do_sample=True,  # needed to get distinct sequences from one call
    )
    return [candidate['generated_text'].strip() for candidate in generated]

# Open-ended positive prompt for the generator to continue.
seed_text = "Fantastic airline service on this flight. My favorite part of the flight was"
augmented_examples = augment_text(text=seed_text)

def remove_extra_spaces(text):
    """Collapse every whitespace run in `text` to one space and trim both ends."""
    return ' '.join(text.split())

# Print each generated example, whitespace-normalized, under a separator line.
for tidy_example in map(remove_extra_spaces, augmented_examples):
    print("------\n", tidy_example)

In [None]:
# Wrap the generated texts in a frame and label every one of them 'positive'.
augmented_data = pd.DataFrame({'text': augmented_examples}).assign(airline_sentiment='positive')

# Append the generated rows to the original tweets under a fresh 0..n-1 index.
df_augmented = pd.concat([df, augmented_data], ignore_index=True)

# Performing sentiment analysis

## Building your own Machine Learning Model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer 

# Run the full pre-processing chain (clean -> tokenize / drop stop words -> lemmatize)
# over every tweet in df.
df['cleaned_text'] = df['text'].map(clean_text)
df = df.apply(tokenize_and_remove_stopwords, axis='columns')
df['final_text'] = df['processed_text'].map(lemmatize_text)

# Turn the lemmatized text into TF-IDF features, capped at 1000 terms.
# NOTE(review): the vectorizer is fitted on all rows here, before the
# train/test split made later in the notebook.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X = tfidf_vectorizer.fit_transform(df['final_text'])
y = df['airline_sentiment']

In [None]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression 

# Hold out 30% of the rows for evaluation; the seed fixes the split.
split_options = {'test_size': 0.3, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# fit() returns the estimator itself, so `model` is the fitted classifier;
# the bare name on the last line keeps the estimator displayed as cell output.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model

In [None]:
feature_names = tfidf_vectorizer.get_feature_names_out()
class_labels = model.classes_

# For each class, list the ten features whose coefficients have the largest magnitude.
for class_position, class_label in enumerate(class_labels):
    ranked = (
        pd.DataFrame({
            'Feature': feature_names,
            'Coefficient': model.coef_[class_position],
        })
        .assign(Absolute_Coefficient=lambda frame: frame['Coefficient'].abs())
        .sort_values(by='Absolute_Coefficient', ascending=False)
    )
    print(f"Class: {class_label}")
    print(ranked[['Feature', 'Coefficient']].head(10))

In [None]:
from sklearn.metrics import accuracy_score, classification_report 

# Score the held-out rows and print the per-class classification report.
y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
from sklearn.metrics import confusion_matrix 
import seaborn as sns 

# One label list fixes the row/column order of the matrix and of both tick axes.
sentiment_labels = ['negative', 'neutral', 'positive']
cm = confusion_matrix(y_test, y_pred, labels=sentiment_labels)

ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=sentiment_labels,
    yticklabels=sentiment_labels,
)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Keep only the rows that carry a gold sentiment label.
# .copy() gives gold_df its own data: a column is added below, and assigning
# to a boolean-mask slice of df triggers pandas' SettingWithCopyWarning.
gold_df = df[df['airline_sentiment_gold'].notnull()].copy()

# Vectorize with the already-fitted vectorizer and predict with the trained model.
X_gold = tfidf_vectorizer.transform(gold_df['final_text'])
y_gold = gold_df['airline_sentiment_gold']
y_gold_pred = model.predict(X_gold)

gold_df['predicted_sentiment'] = y_gold_pred

# Show the rows where the prediction disagrees with the gold label.
misclassified = gold_df[gold_df['airline_sentiment_gold'] != gold_df['predicted_sentiment']]
misclassified[['airline_sentiment_gold', 'predicted_sentiment', 'text', 'final_text', 'negativereason_gold']]

## Using pre-trained LLMs

In [None]:
from tqdm.auto import tqdm 
import time 

# Drop neutral tweets so only 'negative' and 'positive' labels remain.
filtered_df = df.loc[df['airline_sentiment'].ne('neutral')]
X = filtered_df['text']
y = filtered_df['airline_sentiment']

# Same 70/30 seeded split as before, this time on raw tweet text.
X_train_texts, X_test_texts, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Classify the test tweets one at a time and time the whole pass; labels are
# lower-cased before being stored.
start_time = time.time()
results = [
    sentiment_pipeline(text)[0]['label'].lower()
    for text in tqdm(X_test_texts, desc="Analyzing sentiments")
]
end_time = time.time()
total_time = end_time - start_time

print(f"Total time for analyzing {len(X_test_texts)} tweets: {total_time:.2f} seconds")

In [None]:
# Per-class metrics for the pre-trained model's labels on the held-out tweets.
print(classification_report(y_true=y_test, y_pred=results))

In [None]:
# One label list fixes the row/column order of the matrix and of both tick axes.
binary_labels = ['negative', 'positive']
cm = confusion_matrix(y_test, results, labels=binary_labels)

ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=binary_labels,
    yticklabels=binary_labels,
)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
ax.set_title('Confusion Matrix')
plt.show()

# Translating Sentiment into Actionable Insights

In [None]:
!pip install tweepy
import tweepy
import os

# Read the Twitter credentials from environment variables so that real keys
# never have to be pasted into (and saved with) the notebook. The placeholder
# strings remain as fallbacks when a variable is not set.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'YOUR_CONSUMER_KEY')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'YOUR_CONSUMER_SECRET')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'YOUR_ACCESS_TOKEN')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', 'YOUR_ACCESS_TOKEN_SECRET')

# Build the authenticated API client.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# Search English tweets that mention the brand handle; the query string carries
# a "-filter:retweets" operator and count=100 is passed to the search call.
query = "@YourBrandHandle -filter:retweets"
tweets = api.search_tweets(q=query, lang="en", count=100)

In [None]:
# Flatten each returned tweet into a plain record holding the four fields used here.
data = [
    dict(
        tweet_id=tweet.id,
        text=tweet.text,
        tweet_created=tweet.created_at,
        tweet_location=tweet.user.location,
    )
    for tweet in tweets
]
your_brand_df = pd.DataFrame(data)


In [None]:
nlp = spacy.load("en_core_web_sm")

# Three sample product reviews to run named-entity recognition on.
reviews = [
    "I recently purchased a sleeping bag from Optimal Hiking Gear and it exceeded my expectations.",
    "The tent I bought from Optimal Hiking was damaged on arrival. Very disappointed.",
    "The Optimal Hiking company makes a backpack that’s the best. I've been using mine for years without any issues."
]

# Collect one formatted line per entity, in review order, then print them.
entity_lines = [
    f"Entity: {ent.text}, Label: {ent.label_}"
    for review in reviews
    for ent in nlp(review).ents
]
for entity_line in entity_lines:
    print(entity_line)

In [None]:
# Number of tweets per recorded negative reason.
df['negativereason'].value_counts()

In [None]:
!pip install wordcloud

In [None]:
from wordcloud import WordCloud 

# Use a separately named vectorizer for the word cloud. Rebinding
# `tfidf_vectorizer` here would replace the vectorizer the classifier was
# trained with, so re-running an earlier cell that calls
# tfidf_vectorizer.transform(...) would feed `model` features from a
# different vocabulary.
wordcloud_vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
tfidf_matrix = wordcloud_vectorizer.fit_transform(df['final_text'])

# Sum each term's TF-IDF weight over all documents and map term -> total.
tfidf_scores = dict(zip(wordcloud_vectorizer.get_feature_names_out(), tfidf_matrix.sum(axis=0).tolist()[0]))
wordcloud_tfidf = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(tfidf_scores)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_tfidf, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
import nltk  

def extract_hashtags(text):
    """Return the hashtags found in `text`, in order, without the leading '#'."""
    return [match.group(1) for match in re.finditer(r"#(\w+)", text)]

from itertools import chain

# Flatten the per-tweet hashtag lists into one list. chain.from_iterable walks
# each list once; sum(lists, []) builds a new list per addition and is
# quadratic in the number of tweets.
hashtags = list(chain.from_iterable(df['text'].apply(extract_hashtags)))
hashtag_freq_dist = nltk.FreqDist(hashtags)
wordcloud_hashtags = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(hashtag_freq_dist)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_hashtags, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
from sklearn.decomposition import LatentDirichletAllocation 
from sklearn.feature_extraction.text import CountVectorizer 

# Bag-of-words counts over the lemmatized tweets, with max_df/min_df bounds and
# scikit-learn's built-in English stop-word list applied by the vectorizer.
count_vect = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
doc_term_matrix = count_vect.fit_transform(df['final_text'])

# Fit a 5-topic LDA model; the seed makes the topics reproducible.
LDA = LatentDirichletAllocation(n_components=5, random_state=42)
LDA.fit(doc_term_matrix)

# Fetch the vocabulary once; calling get_feature_names_out() inside the
# comprehension rebuilt the full array for every single word lookup.
vocabulary = count_vect.get_feature_names_out()

for i, topic in enumerate(LDA.components_):
    print(f"Top words for topic #{i}:")
    # argsort() is ascending, so the last ten indices are the ten
    # highest-weighted words, listed from lowest to highest weight.
    print([vocabulary[index] for index in topic.argsort()[-10:]])
    print("\n")

In [None]:
# Parse timestamps as UTC, then drop the timezone. utc=True keeps this line
# re-runnable: on a second run the column is already timezone-naive, and
# calling .dt.tz_convert on naive values raises a TypeError.
df['tweet_created'] = pd.to_datetime(df['tweet_created'], utc=True).dt.tz_convert(None)
df['date'] = df['tweet_created'].dt.date

airline_handle = "@JetBlue"
# regex=False: match the handle as literal text, not as a regular expression.
airline_tweets = df[df.text.str.contains(airline_handle, regex=False)]

# Per sentiment and day: number of tweets and total retweets.
grouped = airline_tweets.groupby(['airline_sentiment', 'date']).agg({'tweet_id':'count', 'retweet_count':'sum'}).reset_index()
positive_tweets = grouped[grouped['airline_sentiment'] == 'positive']
neutral_tweets = grouped[grouped['airline_sentiment'] == 'neutral']
negative_tweets = grouped[grouped['airline_sentiment'] == 'negative']

plt.figure(figsize=(14, 7))
scale_factor = 3  # multiplies retweet counts to get the scatter marker sizes

# The loop variable is deliberately not called `tweets`: that name holds the
# tweepy search results earlier in the notebook and must not be overwritten.
for daily_counts, sentiment, color, linestyle in zip(
    [positive_tweets, neutral_tweets, negative_tweets],
    ['Positive', 'Neutral', 'Negative'],
    ['green', 'orange', 'red'],
    ['-', '--', '-.']
):
    scaled_retweet_count = daily_counts['retweet_count'] * scale_factor
    plt.plot(daily_counts['date'], daily_counts['tweet_id'], linestyle=linestyle, label=sentiment, color=color)
    plt.scatter(daily_counts['date'], daily_counts['tweet_id'], scaled_retweet_count, color=color)

plt.title(f'Daily Sentiment Trend for {airline_handle} with Bubble Size Indicating Retweets')
plt.xlabel('Date')
plt.ylabel('Number of Tweets')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# The three calendar days to inspect, as datetime.date objects.
dates_of_interest = [pd.Timestamp(day).date() for day in ('2015-02-22', '2015-02-23', '2015-02-24')]

# Negative tweets posted on those days.
is_target_date = airline_tweets['date'].isin(dates_of_interest)
is_negative = airline_tweets['airline_sentiment'] == 'negative'
filtered_df = airline_tweets[is_target_date & is_negative]

# For each day, keep the three tweets with the highest retweet count.
top_tweets_per_date = filtered_df.groupby('date').apply(
    lambda day_group: day_group.nlargest(3, 'retweet_count')
)
top_tweets_per_date[['text', 'retweet_count', 'negativereason']]

In [None]:
!pip install folium 

In [None]:
import folium
from folium.plugins import HeatMap

import ast

# Negative tweets mentioning @JetBlue that have a coordinate value.
filtered_df = df[(df['text'].str.contains('@JetBlue') & (df['airline_sentiment'] == 'negative'))]
filtered_df = filtered_df.dropna(subset=['tweet_coord'])

valid_coords = []
for coord in filtered_df['tweet_coord']:
    try:
        # SECURITY: this cell previously called eval() on the raw CSV value,
        # which executes arbitrary code embedded in the data file.
        # ast.literal_eval only accepts Python literals.
        lat, long = ast.literal_eval(coord)
    except (ValueError, SyntaxError, TypeError):
        # Skip values that are not a literal pair. ValueError covers both
        # non-literal content and a wrong number of elements to unpack.
        continue
    valid_coords.append((lat, long))

# Center the map on the mean latitude/longitude, or on (0, 0) when nothing parsed.
if valid_coords:
    map_center = [sum(x)/len(valid_coords) for x in zip(*valid_coords)]
else:
    map_center = [0, 0]

tweet_map = folium.Map(location=map_center, zoom_start=4)
HeatMap(valid_coords).add_to(tweet_map)
tweet_map