In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import plotly.express as px
warnings.filterwarnings("ignore")

In [33]:
# Load the headlines CSV into a DataFrame and preview the first rows.
# NOTE(review): the path is relative, so the file must sit in the notebook's working directory.

df = pd.read_csv("2023_9.csv")
df.head()

Unnamed: 0,Title,Publisher,DateTime,Link,Category
0,"Chainlink (LINK) Falters, Hedera (HBAR) Wobble...",Analytics Insight,2023-08-30T06:54:49Z,https://news.google.com/articles/CBMibGh0dHBzO...,Business
1,Funds punished for owning too few Nvidia share...,ZAWYA,2023-08-30T07:15:59Z,https://news.google.com/articles/CBMigwFodHRwc...,Business
2,Crude oil prices stalled as hedge funds sold: ...,ZAWYA,2023-08-30T07:31:31Z,https://news.google.com/articles/CBMibGh0dHBzO...,Business
3,Grayscale's Bitcoin Win Is Still Only Half the...,Bloomberg,2023-08-30T10:38:40Z,https://news.google.com/articles/CBMib2h0dHBzO...,Business
4,"I'm a Home Shopping Editor, and These Are the ...",Better Homes & Gardens,2023-08-30T11:00:00Z,https://news.google.com/articles/CBMiPWh0dHBzO...,Business


In [34]:
# Check the inferred column dtypes before deriving the date/time fields.
df.dtypes

Title        object
Publisher    object
DateTime     object
Link         object
Category     object
dtype: object

In [35]:
# Parse the timestamp strings once and reuse the result for both derived
# columns: the calendar date and the time of day.
timestamps = pd.to_datetime(df["DateTime"])
df["Date"] = timestamps.dt.date
df["Time"] = timestamps.dt.time
df.head()

Unnamed: 0,Title,Publisher,DateTime,Link,Category,Date,Time
0,"Chainlink (LINK) Falters, Hedera (HBAR) Wobble...",Analytics Insight,2023-08-30T06:54:49Z,https://news.google.com/articles/CBMibGh0dHBzO...,Business,2023-08-30,06:54:49
1,Funds punished for owning too few Nvidia share...,ZAWYA,2023-08-30T07:15:59Z,https://news.google.com/articles/CBMigwFodHRwc...,Business,2023-08-30,07:15:59
2,Crude oil prices stalled as hedge funds sold: ...,ZAWYA,2023-08-30T07:31:31Z,https://news.google.com/articles/CBMibGh0dHBzO...,Business,2023-08-30,07:31:31
3,Grayscale's Bitcoin Win Is Still Only Half the...,Bloomberg,2023-08-30T10:38:40Z,https://news.google.com/articles/CBMib2h0dHBzO...,Business,2023-08-30,10:38:40
4,"I'm a Home Shopping Editor, and These Are the ...",Better Homes & Gardens,2023-08-30T11:00:00Z,https://news.google.com/articles/CBMiPWh0dHBzO...,Business,2023-08-30,11:00:00


In [36]:
import pytz  # Time zones
import re  # Regular expressions
from tqdm import tqdm  # Progress bars
import random  # Random values

import nltk  # Natural language processing
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA  # Sentiment analysis
nltk.download('vader_lexicon')  # Download VADER lexicon
from nltk.corpus import stopwords  # Stopwords
nltk.download('stopwords')  # Download stopwords corpus

from wordcloud import WordCloud  # Word cloud

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\I355833\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\I355833\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:
# Set the seaborn plotting style to 'darkgrid'.
sns.set_style('darkgrid')

# Create an instance of the SentimentIntensityAnalyzer for sentiment analysis.
sia = SIA()

# Get the NLTK stop-word list for the English language.
stpwrds = stopwords.words('english')

# Colour assigned to each sentiment class in the plots.
custom_colors_map = {
    'Negative': '#6495ED',  # Cornflower Blue
    'Neutral': '#A9A9A9',  # Dark Gray
    'Positive': '#00CED1'  # Dark Turquoise
}




## Text Transformation

In [38]:
# General transformation in the text
def transformation(df, mc, stop_words=None):
    """Clean the text column ``mc`` and add simple token features.

    Each value of ``df[mc]`` goes through these steps, in order:
    lower-casing; removal of ``@mentions``, ``#hashtags`` and ``http...``
    URLs; replacement of every run of non-word characters by one space;
    removal of a lone letter ``b``-``z`` that sits between two other tokens;
    whitespace collapsing; and stop-word removal.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is not modified: the work is done
        on a copy, so the raw text stays available and the call can be
        repeated safely.
    mc : str
        Name of the text column to clean.
    stop_words : iterable of str, optional
        Words to drop from the cleaned text. When omitted, the notebook-level
        ``stpwrds`` list is used.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which ``mc`` holds the cleaned text, plus three
        new columns: ``words`` (list of tokens), ``words_count`` (number of
        tokens) and ``length`` (character count of the cleaned text).
        Missing values in ``mc`` are treated as empty strings.
    """
    # A set gives constant-time membership tests for the per-token filter.
    stop_set = set(stpwrds if stop_words is None else stop_words)

    def _clean(text):
        # Strip @mentions, #hashtags and URLs before tokenising.
        text = re.sub(r'@[^\s]+', '', text)
        text = re.sub(r'\B#\S+', '', text)
        text = re.sub(r"http\S+", "", text)
        # Keep only runs of word characters, separated by single spaces.
        text = ' '.join(re.findall(r'\w+', text))
        # Drop a single letter surrounded by whitespace (e.g. the "s" left
        # behind by "grayscale's"). Letters at the very start or end of the
        # text are not matched because whitespace is required on both sides.
        text = re.sub(r'\s+[b-zA-Z]\s+', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return ' '.join(word for word in text.split() if word not in stop_set)

    out = df.copy()
    # fillna/astype guard against NaN or non-string cells, which re.sub
    # cannot process.
    out[mc] = out[mc].fillna('').astype(str).str.lower().map(_clean)

    out['words'] = out[mc].map(str.split)
    out['words_count'] = out['words'].map(len)
    out['length'] = out[mc].map(len)

    return out


In [39]:
# Clean the 'Title' column and add the words / words_count / length features.
data = transformation(df,'Title')
data.head()

Unnamed: 0,Title,Publisher,DateTime,Link,Category,Date,Time,words,words_count,length
0,chainlink link falters hedera hbar wobbles yet...,Analytics Insight,2023-08-30T06:54:49Z,https://news.google.com/articles/CBMibGh0dHBzO...,Business,2023-08-30,06:54:49,"[chainlink, link, falters, hedera, hbar, wobbl...",12,74
1,funds punished owning nvidia shares stunning 2...,ZAWYA,2023-08-30T07:15:59Z,https://news.google.com/articles/CBMigwFodHRwc...,Business,2023-08-30,07:15:59,"[funds, punished, owning, nvidia, shares, stun...",8,54
2,crude oil prices stalled hedge funds sold kemp,ZAWYA,2023-08-30T07:31:31Z,https://news.google.com/articles/CBMibGh0dHBzO...,Business,2023-08-30,07:31:31,"[crude, oil, prices, stalled, hedge, funds, so...",8,46
3,grayscale bitcoin win still half battle,Bloomberg,2023-08-30T10:38:40Z,https://news.google.com/articles/CBMib2h0dHBzO...,Business,2023-08-30,10:38:40,"[grayscale, bitcoin, win, still, half, battle]",6,39
4,home shopping editor miss labor day deals eyeing,Better Homes & Gardens,2023-08-30T11:00:00Z,https://news.google.com/articles/CBMiPWh0dHBzO...,Business,2023-08-30,11:00:00,"[home, shopping, editor, miss, labor, day, dea...",8,48


## Sentiment Analysis
This section scores each cleaned title with the VADER model from the NLTK library. The compound score of every title is stored in the 'sentiment_eval' column and then mapped to one of three classes, stored in the 'class_sentiment' column: 'Negative' (score below -0.25), 'Positive' (score above 0.25), or 'Neutral' (anything in between).


In [40]:
def sentiment_classification(x: float, threshold: float = 0.25) -> str:
    """Turn a sentiment score into a class label.

    Parameters
    ----------
    x : float
        Sentiment score to classify.
    threshold : float, default 0.25
        Symmetric cut-off. Scores strictly below ``-threshold`` are
        'Negative' and scores strictly above ``threshold`` are 'Positive'.

    Returns
    -------
    str
        'Negative', 'Positive' or 'Neutral'. Scores exactly on a cut-off,
        and NaN (which fails both comparisons), are 'Neutral'.
    """
    if x < -threshold:
        return 'Negative'
    if x > threshold:
        return 'Positive'
    return 'Neutral'

# Score every cleaned title with the VADER analyzer and keep only its
# 'compound' value; tqdm wraps the column to show a progress bar.
data['sentiment_eval'] = list(
    map(lambda title: sia.polarity_scores(title)['compound'], tqdm(data['Title']))
)

# Convert each compound score into its sentiment label.
data['class_sentiment'] = data['sentiment_eval'].map(sentiment_classification)

100%|██████████| 51302/51302 [00:04<00:00, 11287.02it/s]


In [41]:
# Preview the frame with the new sentiment_eval / class_sentiment columns.
data.head()

Unnamed: 0,Title,Publisher,DateTime,Link,Category,Date,Time,words,words_count,length,sentiment_eval,class_sentiment
0,chainlink link falters hedera hbar wobbles yet...,Analytics Insight,2023-08-30T06:54:49Z,https://news.google.com/articles/CBMibGh0dHBzO...,Business,2023-08-30,06:54:49,"[chainlink, link, falters, hedera, hbar, wobbl...",12,74,0.0,Neutral
1,funds punished owning nvidia shares stunning 2...,ZAWYA,2023-08-30T07:15:59Z,https://news.google.com/articles/CBMigwFodHRwc...,Business,2023-08-30,07:15:59,"[funds, punished, owning, nvidia, shares, stun...",8,54,0.2023,Neutral
2,crude oil prices stalled hedge funds sold kemp,ZAWYA,2023-08-30T07:31:31Z,https://news.google.com/articles/CBMibGh0dHBzO...,Business,2023-08-30,07:31:31,"[crude, oil, prices, stalled, hedge, funds, so...",8,46,-0.6705,Negative
3,grayscale bitcoin win still half battle,Bloomberg,2023-08-30T10:38:40Z,https://news.google.com/articles/CBMib2h0dHBzO...,Business,2023-08-30,10:38:40,"[grayscale, bitcoin, win, still, half, battle]",6,39,0.296,Positive
4,home shopping editor miss labor day deals eyeing,Better Homes & Gardens,2023-08-30T11:00:00Z,https://news.google.com/articles/CBMiPWh0dHBzO...,Business,2023-08-30,11:00:00,"[home, shopping, editor, miss, labor, day, dea...",8,48,-0.1531,Neutral


## Exploratory Data Analysis and Visualization

In [42]:
# Number of headlines per category, most frequent first.
df["Category"].value_counts()

Category
Sports           7709
Headlines        7155
Entertainment    7003
Technology       6822
Business         6800
Worldwide        6575
Health           4877
Science          4361
Name: count, dtype: int64

In [43]:
# Collect the distinct category labels; they drive the per-category split further down.
cat = df["Category"].unique()
cat

array(['Business', 'Entertainment', 'Headlines', 'Health', 'Science',
       'Sports', 'Technology', 'Worldwide'], dtype=object)

In [44]:
# Keep only the Business headlines and preview them.
# NOTE(review): data_business is not referenced again in the cells visible here;
# the per-category dict built further down contains the same subset.
data_business = data[data["Category"] == "Business"]
data_business.head()

Unnamed: 0,Title,Publisher,DateTime,Link,Category,Date,Time,words,words_count,length,sentiment_eval,class_sentiment
0,chainlink link falters hedera hbar wobbles yet...,Analytics Insight,2023-08-30T06:54:49Z,https://news.google.com/articles/CBMibGh0dHBzO...,Business,2023-08-30,06:54:49,"[chainlink, link, falters, hedera, hbar, wobbl...",12,74,0.0,Neutral
1,funds punished owning nvidia shares stunning 2...,ZAWYA,2023-08-30T07:15:59Z,https://news.google.com/articles/CBMigwFodHRwc...,Business,2023-08-30,07:15:59,"[funds, punished, owning, nvidia, shares, stun...",8,54,0.2023,Neutral
2,crude oil prices stalled hedge funds sold kemp,ZAWYA,2023-08-30T07:31:31Z,https://news.google.com/articles/CBMibGh0dHBzO...,Business,2023-08-30,07:31:31,"[crude, oil, prices, stalled, hedge, funds, so...",8,46,-0.6705,Negative
3,grayscale bitcoin win still half battle,Bloomberg,2023-08-30T10:38:40Z,https://news.google.com/articles/CBMib2h0dHBzO...,Business,2023-08-30,10:38:40,"[grayscale, bitcoin, win, still, half, battle]",6,39,0.296,Positive
4,home shopping editor miss labor day deals eyeing,Better Homes & Gardens,2023-08-30T11:00:00Z,https://news.google.com/articles/CBMiPWh0dHBzO...,Business,2023-08-30,11:00:00,"[home, shopping, editor, miss, labor, day, dea...",8,48,-0.1531,Neutral


## Count by Sentiment

In [45]:
# Function to create a custom histogram.
def senti_plot(df, category: str):
    """Build a horizontal count chart of sentiment classes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'class_sentiment' column.
    category : str
        Label inserted into the chart title.

    Returns
    -------
    plotly figure
        The configured figure. It is returned, not shown, so the caller
        decides how to display it.
    """
    fig = px.histogram(
        # Only the sentiment column is needed for the counts.
        df[['class_sentiment']],
        y="class_sentiment",
        title=f'Sentiment Analysis from {category}',
        color='class_sentiment',
        color_discrete_map=custom_colors_map,
        # Pin the class order so every chart lists the bars and legend entries
        # the same way, instead of in the order the classes first appear in
        # each subset.
        category_orders={'class_sentiment': list(custom_colors_map)},
    )

    # Customize the hover template to display the sentiment and count.
    fig.update_traces(hovertemplate='%{y}<br>Count: %{x}')

    # Update the layout with additional configurations.
    fig.update_layout(template='plotly_dark', title_x=0.5, yaxis_title='Sentiment', xaxis_title='Count', legend_title='Sentiment')

    return fig

In [47]:
# One sub-frame per category, keyed as "data_<Category>".
cat_data = {
    f"data_{name}": data.loc[data["Category"] == name]
    for name in cat
}

print(cat_data.keys())

dict_keys(['data_Business', 'data_Entertainment', 'data_Headlines', 'data_Health', 'data_Science', 'data_Sports', 'data_Technology', 'data_Worldwide'])


In [62]:
# Initialise plotly's notebook mode so figures render inline.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead of
# embedding it in the notebook — confirm against the plotly documentation.
import plotly
plotly.offline.init_notebook_mode(connected=True)

In [65]:
# Render one sentiment chart per category. The sub-frame is passed straight to
# senti_plot; assigning it to `df` would overwrite the notebook-level `df`
# that holds the full dataset.
for category, data_frame in cat_data.items():
    # cat_data keys look like "data_<Category>"; drop the prefix so the chart
    # title shows the bare category name.
    label = category[len("data_"):] if category.startswith("data_") else category
    display(senti_plot(data_frame, label))
    