In [4]:
import re
import pandas as pd

def preprocess_text(text):
    r"""Normalize raw text for lexicon matching.

    The text is lower-cased and every character that is neither a word
    character (``\w``) nor whitespace is removed. Punctuation is deleted
    outright, not replaced by a space, so ``"naik-turun"`` becomes
    ``"naikturun"``.

    Parameters
    ----------
    text : str or None
        Raw text. Missing values (``None`` or a float NaN, which is what
        pandas yields for an empty CSV cell) are treated as empty text.
        Any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The normalized text; ``''`` for missing input.
    """
    # NaN is the only float that is not equal to itself, so this detects it
    # without needing numpy/pandas here.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    lowered = text.lower()
    return re.sub(r'[^\w\s]', '', lowered)

def load_lexicon(file_path):
    """Load a tab-separated sentiment lexicon, longest phrases first.

    The first row of the file is treated as a header and discarded; the
    columns are named ``word`` and ``weight``. Weights that cannot be parsed
    as numbers are coerced to NaN.

    Rows with a missing word or a missing/unparseable weight are dropped.
    ``read_csv`` turns literal entries such as ``null``, ``NA`` or ``nan``
    into NaN, which would otherwise crash the word-count step, and a NaN
    weight would turn every score that matches it into NaN.

    Parameters
    ----------
    file_path : str or path-like
        Location of the TSV file.

    Returns
    -------
    pandas.DataFrame
        Columns ``word``, ``weight`` and ``length`` (number of
        whitespace-separated tokens in ``word``), sorted by ``length`` in
        descending order with a fresh 0..n-1 index.
    """
    lexicon = pd.read_csv(file_path, sep='\t', header=0, names=['word', 'weight'])
    lexicon['weight'] = pd.to_numeric(lexicon['weight'], errors='coerce')

    # Discard unusable rows before any string operation touches `word`.
    lexicon = lexicon.dropna(subset=['word', 'weight']).copy()
    lexicon['word'] = lexicon['word'].astype(str)

    # Vectorized token count (same result as len(word.split()) per row).
    lexicon['length'] = lexicon['word'].str.split().str.len()
    return lexicon.sort_values(by='length', ascending=False).reset_index(drop=True)

# Load the positive and negative lexicons (positive first, then negative).
positive_lexicon, negative_lexicon = map(load_lexicon, ('positive.tsv', 'negative.tsv'))

def _lexicon_to_lookup(lexicon):
    """Return a ``{phrase: weight}`` dict; the first row wins for duplicate phrases."""
    phrases = lexicon['word'].tolist()
    weights = lexicon['weight'].tolist()
    # dict() keeps the LAST value seen for a repeated key, so feed the rows in
    # reverse order to make the FIRST row win.
    return dict(zip(reversed(phrases), reversed(weights)))


def calculate_sentiment(text, positive_lexicon, negative_lexicon):
    """Score ``text`` by greedy longest-phrase matching against two lexicons.

    The text is normalized with ``preprocess_text`` and split on whitespace.
    Scanning left to right, the longest phrase starting at the current token
    that appears in either lexicon is consumed and its weight added to the
    score; tokens with no match are skipped one at a time. For a given phrase
    the positive lexicon is consulted before the negative one, so a phrase
    present in both contributes only its positive weight.

    Each lexicon is turned into a dict once per call, so every candidate
    phrase costs a hash lookup instead of a scan of the whole DataFrame, and
    tokens already consumed by a match are never revisited.

    Parameters
    ----------
    text : str
        Raw text to score (passed to ``preprocess_text``).
    positive_lexicon, negative_lexicon : pandas.DataFrame
        Frames with at least the columns ``word`` (phrase) and ``weight``
        (numeric). When a phrase occurs more than once in a frame, the first
        row's weight is used.

    Returns
    -------
    int or float
        Sum of the weights of all matched phrases; ``0`` when nothing matches.
    """
    words = preprocess_text(text).split()
    positive = _lexicon_to_lookup(positive_lexicon)
    negative = _lexicon_to_lookup(negative_lexicon)

    sentiment_score = 0
    n_words = len(words)
    start = 0
    while start < n_words:
        next_start = start + 1  # default: no match, advance by one token
        # Try the longest candidate first, shrinking one token at a time.
        for end in range(n_words, start, -1):
            phrase = ' '.join(words[start:end])
            if phrase in positive:
                sentiment_score += positive[phrase]
            elif phrase in negative:
                sentiment_score += negative[phrase]
            else:
                continue
            next_start = end  # jump past every token the match consumed
            break
        start = next_start

    return sentiment_score

# Example usage: score a single sample sentence with both lexicons
example_text = "jalan terbuka putus tali gantung"
sentiment_score = calculate_sentiment(
    text=example_text,
    positive_lexicon=positive_lexicon,
    negative_lexicon=negative_lexicon,
)
print(f"Sentiment score: {sentiment_score}")

Sentiment score: 1


In [9]:
# Load the news data set; the scoring cell below reads its 'Title' column.
df = pd.read_csv('../../Data/News/combined_data_sorted.csv')

In [6]:
# Score every value in 'Title' with calculate_sentiment and store the result
# in a new column called 'Lexicon Sentiment Score'.
df['Lexicon Sentiment Score'] = df['Title'].apply(
    calculate_sentiment, args=(positive_lexicon, negative_lexicon)
)

In [7]:
# Display the frame to inspect the newly added 'Lexicon Sentiment Score' column.
df

Unnamed: 0,Date,Title,Lexicon Sentiment Score
0,2020-01-01,"Mengenal Lagi Naturalisasi, Cara Anies Basweda...",2
1,2020-01-01,Streaming! Upaya Sarinah Menjadi Pusat Pengemb...,7
2,2020-01-01,Terungkap! Pembeli Pejaten Village Juga Invest...,-1
3,2020-01-01,Perhatikan 5 Pertanyaan Ini Sebelum Merancang ...,2
4,2020-01-02,Saham Blue Chip Bisa Digoreng Juga?,0
...,...,...,...
5420,2024-09-25,"IHSG Diprediksi Menguat, Simak Analisis dan Re...",7
5421,2024-09-25,"Bikin Investor Happy, The Fed Diprediksi Bakal...",5
5422,2024-09-26,"Suku Bunga Turun, Kemenperin Optimis Iklim Usa...",2
5423,2024-09-26,"Kurang Darah, IHSG ke Zona Merah Pagi Ini",1


In [8]:
# Save the scored dataframe (every column of df, without the row index);
# the "Group by Date" cell reads this file back.
df.to_csv('../../Data/lexicon_sentiment_score.csv', index=False)

## Group by Date

In [13]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

# Purpose of this cell: average the per-row sentiment scores by calendar day,
# insert the days that have no rows, fill those gaps with a KNN imputer and
# write the resulting daily series to CSV.

# Load the saved dataframe
# NOTE(review): this rebinds `df`, replacing the frame built in the cells above.
df = pd.read_csv('../../Data/lexicon_sentiment_score.csv')

# Convert the Date column to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Calculate the average sentiment score for each day (Series indexed by Date)
average_sentiment_score = df.groupby('Date')['Lexicon Sentiment Score'].mean()

# Create a complete daily date range from the first to the last observed date
start_date = average_sentiment_score.index.min()
end_date = average_sentiment_score.index.max()
all_dates = pd.date_range(start=start_date, end=end_date, freq='D')

# Reindex the Series with the complete date range; dates that had no rows in
# df get NaN here and are the values the imputer fills in below
average_sentiment_filled = average_sentiment_score.reindex(all_dates)

# Create a feature based on the date (day of the year)
average_sentiment_filled = average_sentiment_filled.to_frame()  # Convert Series to DataFrame for further processing
average_sentiment_filled['day_of_year'] = average_sentiment_filled.index.dayofyear

# Normalize the day of the year feature (MinMaxScaler with default settings)
scaler = MinMaxScaler()
average_sentiment_filled['day_of_year_scaled'] = scaler.fit_transform(average_sentiment_filled[['day_of_year']])

# Prepare data for KNN imputer: column 0 is the scaled day of year, column 1
# is the daily score (NaN where it must be imputed)
# NOTE(review): day_of_year_scaled is the only other feature and it ignores the
# year, so neighbours are presumably picked by calendar position across all
# years (e.g. 1 Jan 2020 next to 1 Jan 2024) rather than by closeness in
# time — confirm this is intended.
X = average_sentiment_filled[['day_of_year_scaled', 'Lexicon Sentiment Score']].values

# Initialize and fit KNN imputer (each gap is filled from 5 neighbours)
imputer = KNNImputer(n_neighbors=5)
X_imputed = imputer.fit_transform(X)

# Create a new DataFrame with imputed values, keeping the daily date index
df_imputed = pd.DataFrame(X_imputed, columns=['day_of_year_scaled', 'Lexicon Sentiment Score'], index=average_sentiment_filled.index)

# Keep only the sentiment column and reset index (the dates become a column)
df_imputed = df_imputed[['Lexicon Sentiment Score']].reset_index()

# Rename the index column to 'Date'
df_imputed.columns = ['Date', 'Lexicon Sentiment Score']

# Save the result to a new CSV file, with dates written as YYYY-MM-DD
df_imputed.to_csv('../../Data/lexicon_average_sentiment_score_imputed.csv', index=False, date_format='%Y-%m-%d')

print(df_imputed)

           Date  Lexicon Sentiment Score
0    2020-01-01                 2.500000
1    2020-01-02                 1.666667
2    2020-01-03                 6.000000
3    2020-01-04                 0.000000
4    2020-01-05                 0.000000
...         ...                      ...
1730 2024-09-26                 1.500000
1731 2024-09-27                -0.146667
1732 2024-09-28                 1.316667
1733 2024-09-29                 1.633333
1734 2024-09-30                 5.000000

[1735 rows x 2 columns]
