In [None]:
# Reading data

In [39]:
import polars as pl

# Load the headlines with `date` kept as raw text (parse_dates=False) so the
# month label can be cut from the string before the column is parsed.
data = (
    pl.read_csv("data_update.csv", parse_dates=False)
    # Last 7 characters of a "%d-%m-%Y" string are "MM-YYYY".
    # Note: as text this label does not sort chronologically (month comes first).
    .with_columns(pl.col("date").str.slice(-7).alias("date_monthly"))
    # Replace the text column with a proper Date column.
    .with_columns(pl.col("date").str.strptime(pl.Date, fmt="%d-%m-%Y"))
)

In [40]:
# Inspect the loaded frame: `date` should now be a Date column and
# `date_monthly` the "MM-YYYY" string sliced from the original text.
data

Unnamed: 0_level_0,headline,date,date_monthly
i64,str,date,str
0,"""Over 4 Million...",2022-09-23,"""09-2022"""
1,"""American Airli...",2022-09-23,"""09-2022"""
2,"""23 Of The Funn...",2022-09-23,"""09-2022"""
3,"""The Funniest T...",2022-09-23,"""09-2022"""
4,"""Woman Who Call...",2022-09-22,"""09-2022"""
5,"""Cleaner Was De...",2022-09-22,"""09-2022"""
6,"""Reporter Gets ...",2022-09-22,"""09-2022"""
7,"""Puerto Ricans ...",2022-09-22,"""09-2022"""
8,"""How A New Docu...",2022-09-22,"""09-2022"""
9,"""Biden At UN To...",2022-09-21,"""09-2022"""


In [41]:
# Text data pre-processing

In [42]:
from cleantext import clean

def preprocess(text):
    """Normalise a single headline with cleantext's ``clean``.

    The input is coerced with ``str`` first, so non-string values are cleaned
    as their string representation.

    Args:
        text: The raw headline (any object; it is passed through ``str``).

    Returns:
        Whatever ``clean`` returns for the enabled options below.
    """
    return clean(
        str(text),
        punct=True,          # punctuation handling enabled
        extra_spaces=True,   # extra-whitespace handling enabled
        stopwords=True,      # stop-word handling enabled (uses the NLTK stop-word corpus)
        lowercase=True,      # lower-casing enabled
        numbers=True,        # digit handling enabled
    )

In [43]:
# Run `preprocess` on every headline. The expression has no alias, so the
# cleaned text replaces the `headline` column in the new frame; `data` itself
# is left untouched.
data_clean = data.with_columns(pl.col("headline").apply(preprocess))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [44]:
# Text data representation

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# calculate the compound score
def sentiment_vader(sentence):
    """Return VADER's overall (compound) sentiment score for ``sentence``.

    The ``SentimentIntensityAnalyzer`` is created on the first call and then
    reused: this function is applied once per row, and building a new analyzer
    for every headline repeats its lexicon loading for no benefit.

    Args:
        sentence: The text to score.

    Returns:
        The ``'compound'`` entry of ``polarity_scores(sentence)``.
    """
    # Cache the analyzer on the function object so no extra import or
    # module-level name is needed; re-running the cell resets the cache.
    analyzer = getattr(sentiment_vader, "_analyzer", None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        sentiment_vader._analyzer = analyzer

    sentiment_dict = analyzer.polarity_scores(sentence)

    # overall (compound) indicator
    return sentiment_dict['compound']

In [45]:
# Score each cleaned headline with VADER via Polars' row-wise apply.
# The result keeps the column name `headline`, but the column now holds the
# compound score rather than text.
# NOTE(review): the input is `data_clean`, i.e. text already lower-cased with
# punctuation and stop words removed; VADER documents using such cues, so
# confirm that scoring the cleaned (not raw) headline is intended.

sentiment = data_clean.with_columns(pl.col("headline").apply(sentiment_vader))

In [None]:
# Time-series representation

In [52]:
# aggregate over months: mean compound score per `date_monthly` label
#
# `date_monthly` is an "MM-YYYY" string, so sorting on it directly orders the
# rows by month number first (e.g. "01-2022" before "12-2021"). To get a real
# time series, carry each month's earliest parsed `date` through the
# aggregation, sort on that, and then keep only the original two columns.

timeseries = (
    sentiment.lazy()
    .groupby("date_monthly")
    .agg(
        [
            pl.avg("headline"),
            # helper sort key only; removed again by the select below
            pl.col("date").min().alias("_month_start"),
        ]
    )
    .sort("_month_start")
    .select(["date_monthly", "headline"])
    .collect()
)
