In [None]:
import nltk
nltk.download('stopwords')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from textblob.sentiments import PatternAnalyzer
from collections import Counter
import numpy as np
import pandas as pd
import os
import emoji
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the Facebook, Google and Twitter CSV exports (in that order) from the
# working directory into one DataFrame each.
df_fb, df_gl, df_tw = (
    pd.read_csv(csv_path)
    for csv_path in (
        "fb_scraper_output_english_final.csv",
        "google_cleaned.csv",
        "Tweets_english.csv",
    )
)

In [None]:
# Keep only the columns needed for sentiment scoring. Each selection is an
# explicit copy so that the columns added to these frames later are written to
# an independent DataFrame instead of a slice of the raw frame (assigning to
# such a slice raises pandas' SettingWithCopyWarning).
sentiment_df_fb = df_fb[["post_description_english"]].copy()
sentiment_df_gl = df_gl[["caption", "rating", "location"]].copy()
sentiment_df_tw = df_tw[["Tweets_english"]].copy()

# Drop Facebook rows whose description is a float rather than text (e.g. NaN
# placeholders for missing descriptions). The filtered result is copied for
# the same reason as above.
has_text_fb = sentiment_df_fb['post_description_english'].apply(
    lambda value: not isinstance(value, float)
)
sentiment_df_fb = sentiment_df_fb[has_text_fb].copy()

In [None]:
# Polarity scorer
def get_polarity(text):
    """Return the TextBlob sentiment polarity score of ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Subjectivity scorer
def get_subjectivity(text):
    """Return the TextBlob sentiment subjectivity score of ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.subjectivity

# Score the text column of each platform's frame, storing the results in new
# 'polarity' and 'subjectivity' columns on that same frame.
for scored_frame, text_column in (
    (sentiment_df_fb, 'post_description_english'),
    (sentiment_df_gl, 'caption'),
    (sentiment_df_tw, 'Tweets_english'),
):
    scored_frame['polarity'] = scored_frame[text_column].apply(get_polarity)
    scored_frame['subjectivity'] = scored_frame[text_column].apply(get_subjectivity)

In [None]:
# Categorizing reviews based on polarity scores directly for plotting
def _label_sentiment(score, threshold=0.05):
    """Map a polarity score to 'positive', 'negative' or 'neutral'.

    Scores strictly above ``threshold`` are positive, scores strictly below
    ``-threshold`` are negative, and everything else (including NaN, which
    fails both comparisons) is neutral.
    """
    if score > threshold:
        return 'positive'
    if score < -threshold:
        return 'negative'
    return 'neutral'


sentiments_fb = [_label_sentiment(score) for score in sentiment_df_fb['polarity']]
sentiment_gl = [_label_sentiment(score) for score in sentiment_df_gl['polarity']]
sentiment_tw = [_label_sentiment(score) for score in sentiment_df_tw['polarity']]

# Plotting the distribution of the Facebook labels.
plt.figure(figsize=(10, 6))
# The labels must be passed as the keyword argument ``x``: seaborn 0.11 warns
# when they are passed positionally, and from 0.12 the only positional
# argument of countplot is ``data``.
sns.countplot(x=sentiments_fb, order=['positive', 'neutral', 'negative'], palette='viridis')
plt.title('Distribution of Reviews by Sentiment')
plt.xlabel('Sentiment Category')
plt.ylabel('Count')
plt.show()

In [None]:
# Summary statistics of the Facebook polarity scores, computed up front so
# they can be drawn on the plot and printed afterwards.
fb_scores = sentiment_df_fb['polarity']
mean_polarity = fb_scores.mean()
median_polarity = fb_scores.median()
std_polarity = fb_scores.std()
skewness = fb_scores.skew()

# Histogram + KDE of the scores; distplot returns the Axes it drew on, so the
# remaining decoration goes through that Axes object.
plt.figure(figsize=(15, 6))
fb_ax = sns.distplot(fb_scores, bins=20, hist=True, kde=True, label='Facebook', color='r')
fb_ax.set_title("Distribution of Polarity Scores from Facebook Data")
fb_ax.set_xlabel('Polarity Score')
fb_ax.set_ylabel('Density')
fb_ax.set_xlim([-1.2, 1.2])

# Vertical reference lines at the mean (dashed black) and median (solid blue).
fb_ax.axvline(mean_polarity, color='k', linestyle='--', label=f'Mean: {mean_polarity:.2f}')
fb_ax.axvline(median_polarity, color='b', linestyle='-', label=f'Median: {median_polarity:.2f}')
fb_ax.legend()
plt.show()

# Report the same statistics as text below the figure.
print(f"Mean of Polarity: {mean_polarity}")
print(f"Median of Polarity: {median_polarity}")
print(f"Standard Deviation of Polarity: {std_polarity}")
print(f"Skewness of the polarity distribution: {skewness}")


In [None]:
# Summary statistics of the Google polarity scores, computed before plotting.
gl_scores = sentiment_df_gl['polarity']
mean_polarity = gl_scores.mean()
median_polarity = gl_scores.median()
std_polarity = gl_scores.std()
skewness = gl_scores.skew()

# Two stacked panels with a 3:1 height ratio: distribution on top, boxplot below.
fig, (ax1, ax2) = plt.subplots(
    nrows=2,
    ncols=1,
    figsize=(15, 8),
    gridspec_kw={'height_ratios': [3, 1]},
)

# Top panel: histogram + KDE with mean (dashed black) and median (solid blue) lines.
sns.distplot(gl_scores, bins=20, hist=True, kde=True, label='Google', color='g', ax=ax1)
ax1.set(
    title="Distribution of Polarity Scores from Google Reviews Data",
    xlabel='Polarity Score',
    ylabel='Density',
)
ax1.axvline(mean_polarity, color='k', linestyle='--', label=f'Mean: {mean_polarity:.2f}')
ax1.axvline(median_polarity, color='b', linestyle='-', label=f'Median: {median_polarity:.2f}')
ax1.legend()

# Bottom panel: horizontal boxplot of the same scores; the y ticks carry no
# information for a single box, so they are removed.
sns.boxplot(x=gl_scores, ax=ax2, color='g', orient='h')
ax2.set_xlabel('Polarity Score')
ax2.set_yticks([])

# Tighten the layout so the two panels do not overlap, then render.
plt.tight_layout()
plt.show()

print(f"Skewness of the polarity distribution: {skewness}")



In [None]:
from scipy.stats import skew

# Bind the sample explicitly: the only other assignment to `data` in this
# notebook happens in a later cell, so without this line the cell raises
# NameError when the notebook is run top to bottom on a fresh kernel.
data = sentiment_df_gl['polarity']

# Sample skewness of the Google polarity scores (scipy.stats.skew).
s = skew(data)
print(f"Skewness: {s}")

# An absolute skewness below 0.5 is reported as approximately symmetric.
if abs(s) < 0.5:
    print("The distribution is approximately symmetric.")
else:
    print("The distribution is not symmetric.")


In [None]:
import scipy.stats as stats

# Bind the sample explicitly: the only other assignment to `data` in this
# notebook happens in a later cell, so without this line the cell raises
# NameError when the notebook is run top to bottom on a fresh kernel.
data = sentiment_df_gl['polarity']

# Probability plot of the Google polarity scores against a normal distribution.
stats.probplot(data, dist="norm", plot=plt)
plt.show()


In [None]:
import pandas as pd

# Polarity scores of the Google reviews (requires the 'polarity' column to
# have been added to sentiment_df_gl already).
data = sentiment_df_gl['polarity']

# Polarity value at which the scores are split.
POLARITY_SPLIT = 0.30

# Both comparisons are strict, so scores exactly equal to the split value fall
# in neither group and the two counts need not add up to len(data).
left_of_0_30 = data[data < POLARITY_SPLIT]
right_of_0_30 = data[data > POLARITY_SPLIT]

# Number of scores on each side. Series.count() returns how many non-null
# entries there are, not their sum; the variable names are kept as-is so any
# later use of them keeps working.
sum_left = left_of_0_30.count()
sum_right = right_of_0_30.count()

# Print the results, labelled as counts to match what was computed.
print(f"Count of values to the left of {POLARITY_SPLIT:.2f}: {sum_left}")
print(f"Count of values to the right of {POLARITY_SPLIT:.2f}: {sum_right}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Summary statistics of the Twitter polarity scores, computed before plotting.
tw_scores = sentiment_df_tw['polarity']
mean_polarity = tw_scores.mean()
median_polarity = tw_scores.median()
std_polarity = tw_scores.std()
skewness = tw_scores.skew()

# Histogram + KDE of the scores. No x-axis limits are set here, so the axis
# range is left to matplotlib's autoscaling.
plt.figure(figsize=(15, 6))
tw_ax = sns.distplot(tw_scores, bins=20, hist=True, kde=True, label='Twitter', color='b')

# Titles and axis labels.
tw_ax.set_title("Distribution of Polarity Scores from Twitter Data")
tw_ax.set_xlabel('Polarity Score')
tw_ax.set_ylabel('Density')

# Vertical reference lines at the mean (dashed black) and median (solid blue).
tw_ax.axvline(mean_polarity, color='k', linestyle='--', label=f'Mean: {mean_polarity:.2f}')
tw_ax.axvline(median_polarity, color='b', linestyle='-', label=f'Median: {median_polarity:.2f}')
tw_ax.legend()

# Render the figure, then report the skewness as text.
plt.show()
print(f"Skewness of the polarity distribution: {skewness}")

In [None]:
from scipy.stats import gaussian_kde


# Compute the KDE for the data
def compute_kde(data, bw_method=0.5, padding=1, num_points=1000):
    """Evaluate a Gaussian kernel density estimate of ``data`` on an even grid.

    Parameters
    ----------
    data : one-dimensional sequence of numbers
        Sample whose density is estimated (list, NumPy array, pandas Series).
    bw_method : optional
        Bandwidth setting forwarded unchanged to ``scipy.stats.gaussian_kde``.
        Defaults to 0.5, the value this function previously hard-coded.
    padding : number, optional
        How far the evaluation grid extends below the sample minimum and above
        the sample maximum. Defaults to 1.
    num_points : int, optional
        Number of evenly spaced grid points. Defaults to 1000.

    Returns
    -------
    (x_vals, y_vals) : tuple of numpy.ndarray
        The grid and the estimated density at each grid point; both arrays
        have length ``num_points``.

    Raises
    ------
    ValueError
        Propagated from ``gaussian_kde`` when the sample is too small to
        estimate a density from (e.g. empty).
    """
    # Convert once so the min/max below are array reductions rather than
    # Python-level iteration over the sample.
    values = np.asarray(data, dtype=float)
    density = gaussian_kde(values, bw_method=bw_method)
    x_vals = np.linspace(values.min() - padding, values.max() + padding, num_points)
    y_vals = density(x_vals)
    return x_vals, y_vals

# KDE curve (grid, density) for the polarity column of each platform's frame.
x_vals_fb, y_vals_fb = compute_kde(sentiment_df_fb['polarity'])
x_vals_gl, y_vals_gl = compute_kde(sentiment_df_gl['polarity'])
x_vals_tw, y_vals_tw = compute_kde(sentiment_df_tw['polarity'])

# Overlay the three curves on one figure, one colour per platform.
plt.figure(figsize=(12, 6))
for curve_x, curve_y, curve_color, curve_label in (
    (x_vals_fb, y_vals_fb, 'r', 'Facebook'),
    (x_vals_gl, y_vals_gl, 'g', 'Google'),
    (x_vals_tw, y_vals_tw, 'b', 'Twitter'),
):
    plt.plot(curve_x, curve_y, color=curve_color, label=curve_label)

# Clip the x-axis to [-1.2, 1.2], then add title, labels and legend.
plt.xlim(-1.2, 1.2)
plt.title('Comparative density plot for the polarity score of the social media platforms')
plt.xlabel('Polarity Score')
plt.ylabel('Density')
plt.legend()
plt.show()
