In [None]:
import nltk
# Fetch every NLTK resource the notebook relies on, not just the stop words:
# - 'wordnet' (plus 'omw-1.4') backs WordNetLemmatizer
# - 'punkt' backs word_tokenize; newer NLTK releases look for 'punkt_tab'
#   instead, so both ids are requested. An id that the installed NLTK index
#   does not know is reported by nltk.download and otherwise ignored.
for nltk_resource in ('stopwords', 'omw-1.4', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from textblob.sentiments import PatternAnalyzer
from tqdm import tqdm

import pandas as pd
import os
import emoji
import re
import string
import matplotlib.pyplot as plt
import time

# Data Preprocessing for Sentiment Analysis

Before we dive into sentiment analysis, it's essential to preprocess our dataset to ensure optimal results. The following steps will guide us through this preprocessing phase:

## Lowercasing
All text will be converted to lowercase. This uniformity ensures words such as "Ferry", "ferry", and "FERRY" are treated identically.

**Why?** 
To eliminate case-sensitivity and maintain data consistency.

##  Remove Punctuation
Punctuation can act as noise in our sentiment analysis.

**Why?**
Punctuation does not usually carry sentiment and can introduce inconsistency in the text representation.

##  Remove Numbers and Special Characters
Unless numbers carry a specific sentiment in the context of your data, they can be safely removed or replaced.

**Why?**
Numbers, in most cases, don't convey any sentiment.

##  Remove Stop Words
Words like "and", "the", "is", etc., are prevalent and don't typically contribute to sentiment.

**Why?**
Removing stop words helps in reducing the dimensionality of the data and focusing on words that carry sentiment. However, exercise caution as, in some instances, they might alter the sentiment's context.

##  Lemmatization
Words will be reduced to their dictionary base form (lemma) using lemmatization.

**Why?**
By reducing words to their base form, lemmatization helps reduce the dimensionality of our dataset and keeps the focus on the core meaning of words.

##  Remove URLs
We will eliminate any URLs present in the reviews.

**Why?**
URLs are common in social media content but don't contribute to sentiment.

##  Remove Usernames/Handles
Any platform-specific usernames or handles will be removed.

**Why?**
Like URLs, usernames don't provide sentiment information and are more identifiers than content.

##  Spell Correction
We'll correct the spelling of the words in the reviews.

**Why?**
Misspellings can introduce noise. Correcting them ensures we're analyzing sentiments based on actual words. However, some misspelled words might have a unique sentiment value, so tread with caution.

##  Handle Emojis
Emojis will be converted to text or analyzed separately, as they can often carry sentiment.

**Why?**
Emojis are increasingly becoming a form of expression and can provide valuable sentiment insights.

##  Remove Empty Reviews
Rows that contain no text will be filtered out from the dataset.

**Why?**
Empty reviews cannot be analysed for sentiment.

## Remove Duplicate Reviews
Any repeated reviews across platforms or locations will be removed.

**Why?**
Duplicate reviews can skew the analysis by giving undue weightage to repeated sentiments.

In the following cells, we will programmatically implement these preprocessing steps.


In [None]:
# Load the English tweets that the notebook cleans, plus the raw scrape
df = pd.read_csv(filepath_or_buffer="Tweets_english.csv")
df_original = pd.read_csv(filepath_or_buffer="twitter_output.csv")

In [None]:
# Row count of the raw scrape; reused later for the before/after comparison plot
original_length = df_original.shape[0]
original_length

In [None]:
# Lift pandas' row and column display limits, then render the frame
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)
df

In [None]:
# Drop rows that have no tweet text.
# The explicit copy gives the following cells an independent frame to assign
# columns into, instead of a frame derived from the one read from disk
# (which is what triggers pandas' SettingWithCopyWarning).
df = df.dropna(subset=["Tweets_english"]).copy()

In [None]:
# Lowercase every tweet so case variants of the same word are treated alike
lowercased = df['Tweets_english'].str.lower()
df.loc[:, 'Tweets_english'] = lowercased

In [None]:
# Remove URLs, @handles and punctuation from the tweet text.
#
# URLs and handles are stripped first: string.punctuation contains '@', so
# once punctuation is gone a handle is indistinguishable from an ordinary
# word and the handle pattern used in a later cell can no longer match it.
tweets = df['Tweets_english']
tweets = tweets.str.replace(r'http\S+|www\S+', '', regex=True)
tweets = tweets.str.replace(r'@\w+', '', regex=True)

# re.escape makes every punctuation character literal inside the character
# class. Unescaped, the backslash in string.punctuation merely escapes the
# ']' that follows it and is itself never removed from the text.
punctuation_pattern = f"[{re.escape(string.punctuation)}]"
df['Tweets_english'] = tweets.str.replace(punctuation_pattern, "", regex=True)

In [None]:
# Strip digits and the symbols ! & $ * # from the tweet text
digits_and_symbols = r"[0-9!&$*#]"
df['Tweets_english'] = df['Tweets_english'].str.replace(
    pat=digits_and_symbols, repl="", regex=True
)

In [None]:
# Anything starting with "http" or "www", up to the next whitespace, counts as a URL
url_pattern = r'http\S+|www\S+'

# Blank out every URL match in the tweet text
tweets_without_urls = df['Tweets_english'].str.replace(pat=url_pattern, repl='', regex=True)
df['Tweets_english'] = tweets_without_urls

In [None]:
# A handle is '@' followed by one or more word characters
# NOTE(review): the punctuation cell earlier in the notebook runs first and
# string.punctuation includes '@', so when the cells run top to bottom this
# pattern has nothing left to match.
username_pattern = r'@\w+'

# Blank out every handle match in the tweet text
tweets_without_handles = df['Tweets_english'].str.replace(pat=username_pattern, repl='', regex=True)
df['Tweets_english'] = tweets_without_handles

In [None]:
# Replace emojis with their textual description
def convert_emojis_to_text(text):
    """Return the result of running ``emoji.demojize`` on *text*."""
    demojized = emoji.demojize(text)
    return demojized

df['Tweets_english'] = df['Tweets_english'].map(convert_emojis_to_text)

In [None]:
# Keep only rows whose tweet text is non-empty after stripping whitespace.
# The explicit copy gives the following cells an independent frame to assign
# columns into, rather than a boolean-filtered slice of the previous frame
# (which is what triggers pandas' SettingWithCopyWarning).
has_text = df['Tweets_english'].str.strip() != ''
df = df[has_text].copy()

In [None]:
# Trim leading and trailing whitespace from every tweet
df['Tweets_english'] = df['Tweets_english'].str.strip()

In [None]:
# Translate a single tweet to English (source language is auto-detected)
def translate_text(text):
    """Translate *text* to English, falling back to the input on failure.

    Args:
        text: The tweet to translate. Non-string values and blank strings
            are returned unchanged without calling the translator.

    Returns:
        The translated text, or *text* itself when it is not translatable
        or the translation call raises.

    NOTE(review): ``GoogleTranslator`` is not imported in the visible cells
    (presumably ``from deep_translator import GoogleTranslator`` — confirm
    and add the import to the first cell).
    """
    # Nothing to translate for missing values or whitespace-only strings
    if not isinstance(text, str) or not text.strip():
        return text

    # Built outside the try block on purpose: a missing import or a bad
    # language setting is a setup error and should surface immediately,
    # not be printed once per row while the text silently stays untranslated.
    translator = GoogleTranslator(source='auto', target='en')

    try:
        translated_text = translator.translate(text=text)
    except Exception as e:
        # Best effort: report the failure and keep the original text
        print(f"Error translating text: {text}. Error: {e}")
        return text

    time.sleep(0.2)  # short pause between successful requests
    return translated_text

# Register tqdm's progress_apply on pandas objects
tqdm.pandas()
# NOTE(review): this assigns the translation of the raw 'Tweets' column to
# 'Tweets_english', replacing the values that the earlier cells cleaned
# (lowercasing, punctuation/URL/handle removal, ...). Confirm whether this
# cell is meant to run before the cleaning steps, or to be skipped now that
# the data is loaded from Tweets_english.csv.
df['Tweets_english'] = df['Tweets'].progress_apply(translate_text)

In [None]:
# English stop words, held in a set for fast membership tests in remove_stopwords
english_stopword_list = stopwords.words('english')
stop = set(english_stopword_list)
def remove_stopwords(text):
    """Return *text* without stop words; non-string input is returned as is.

    The text is split on whitespace, each word is compared in lowercase
    against the module-level ``stop`` set, and the surviving words are
    re-joined with single spaces.
    """
    if not isinstance(text, str):
        return text
    kept_words = (token for token in text.split() if token.lower() not in stop)
    return ' '.join(kept_words)

# Run stop-word removal over every tweet
df['Tweets_english'] = df['Tweets_english'].map(remove_stopwords)

In [None]:
# Single WordNetLemmatizer instance, used by lemmatize_text below
lemmatizer = WordNetLemmatizer()

# Define a lemmatization function
def lemmatize_text(text):
    """Return *text* with every token replaced by its lemma.

    The text is tokenized with ``word_tokenize``, each token is passed
    through the module-level ``lemmatizer`` and the results are joined with
    single spaces.

    Non-string input is returned unchanged. This mirrors remove_stopwords,
    which runs just before this step and passes such values through.
    """
    if not isinstance(text, str):
        return text
    return ' '.join(lemmatizer.lemmatize(token) for token in word_tokenize(text))

# Lemmatize every tweet in the 'Tweets_english' column
df['Tweets_english'] = df['Tweets_english'].map(lemmatize_text)

In [None]:
# Keep only the first occurrence of each distinct tweet text
df = df.drop_duplicates(subset=['Tweets_english'])

In [None]:
# Row count after preprocessing; reused by the before/after comparison plot
new_length = len(df)
# The data here are tweets, so the message names the Twitter dataset
print(f"Twitter dataset contains: {new_length} samples")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart comparing the dataset size before and after preprocessing
labels = ['Total scraped', 'After Preprocessing']
counts = [original_length, new_length]

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=labels, y=counts, palette='viridis', ax=ax)
ax.set_title('Number of twitter posts Before and After Data Preprocessing')
ax.set_ylabel('Number of twitter posts')
plt.show()

In [None]:
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import seaborn as sns  # Ensure seaborn is imported

# Join all tweets into one lowercase string and split it into tokens.
# astype(str) keeps the join from failing on any non-string value.
all_text = ' '.join(df['Tweets_english'].dropna().astype(str)).lower()
tokens = word_tokenize(all_text)

# Load the stop-word list once, as a set. Calling stopwords.words() inside
# the comprehension would re-evaluate it for every single token.
english_stopwords = set(stopwords.words('english'))

# Keep purely alphabetic tokens that are not stop words
filtered_tokens = [word for word in tokens if word.isalpha() and word not in english_stopwords]

# Count the frequency of each word
freq_dist = nltk.FreqDist(filtered_tokens)

# Fetch the 31 most common words and drop the top-ranked one, leaving 30.
# NOTE(review): the skip is positional; the plot title assumes the dropped
# word is "ferry" — confirm that it really is the most frequent token.
common_words = freq_dist.most_common(31)[1:]

# Separate the words and their counts for plotting. An empty result cannot
# be unpacked from zip(*...), so it is handled explicitly.
if common_words:
    words, counts = zip(*common_words)
else:
    words, counts = (), ()
word_count_dict = dict(zip(words, counts))
print(word_count_dict)

# One viridis colour per plotted word
colors = sns.color_palette("viridis", len(words))

# Horizontal bar chart of the most common words
plt.figure(figsize=(12, 10))
plt.barh(words, counts, color=colors)
plt.xlabel('Frequency')
plt.ylabel('Word')
plt.title('Top 30 Most Common Words in Twitter Posts (excluding ferry)')
plt.gca().invert_yaxis()  # Invert y-axis to have the most common word on top
plt.show()


In [None]:
# Score each of the most common words with TextBlob's polarity
polarities = [TextBlob(word).sentiment.polarity for word in words]

# Plot the most common words with their sentiment
plt.figure(figsize=(12, 10))

# A single mappable on the fixed range [-1, 1] drives both the bar colours
# and the colorbar, so the two always agree. It also handles a polarity of
# exactly 1.0, which the previous 10-entry palette lookup
# (index int(5 * (polarity + 1)) == 10) could not.
mappable = plt.cm.ScalarMappable(cmap="RdBu", norm=plt.Normalize(vmin=-1, vmax=1))
colors = [mappable.to_rgba(polarity) for polarity in polarities]

plt.barh(words, counts, color=colors)
plt.xlabel('Frequency')
plt.ylabel('Word')
plt.title('Top 30 Most Common Words in Twitter posts with Sentiment Coloring')

# The mappable is not attached to any Axes, so the Axes that gives up space
# for the colorbar has to be named explicitly.
plt.colorbar(mappable, ax=plt.gca(), orientation="vertical", label="Sentiment")

plt.gca().invert_yaxis()  # Invert y-axis to have the most common word on top
plt.show()
