In [3]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

# Fill in missing days of the per-day sentiment series with a KNN imputer
# and write the completed daily series to a new CSV.

# --- Load the per-day sentiment scores, indexed by their parsed 'Date' column ---
df = pd.read_csv(
    '../../Data/News/sentiment_score_per_day.csv',
    parse_dates=['Date'],
).set_index('Date')

# --- Build a gap-free daily calendar spanning the observed period ---
start_date, end_date = df.index.min(), df.index.max()
all_dates = pd.date_range(start=start_date, end=end_date, freq='D')

# Dates absent from the CSV become all-NaN rows; those are the gaps the
# imputer fills in below.
df_filled = df.reindex(all_dates)

# --- Derive the helper feature used to locate neighbours ---
# NOTE(review): day-of-year carries no year information, and it is the only
# feature observed on rows whose sentiment is missing, so neighbours may be
# drawn from the same calendar period of other years -- confirm this is the
# intended behaviour.
df_filled['day_of_year'] = df_filled.index.dayofyear

# Min-max scale the helper feature before handing it to the imputer.
scaler = MinMaxScaler()
df_filled['day_of_year_scaled'] = scaler.fit_transform(df_filled[['day_of_year']])

# --- Impute the missing sentiment values from the 5 nearest rows ---
feature_cols = ['day_of_year_scaled', 'sentiment']
X = df_filled[feature_cols].to_numpy()

imputer = KNNImputer(n_neighbors=5)
X_imputed = imputer.fit_transform(X)

# --- Rebuild a tidy two-column frame: Date + imputed sentiment ---
# The helper feature is dropped, and the calendar index is turned back into
# an ordinary 'Date' column.
df_imputed = (
    pd.DataFrame(X_imputed, columns=feature_cols, index=df_filled.index)
    .loc[:, ['sentiment']]
    .rename_axis('Date')
    .reset_index()
)

# --- Persist and display the completed daily series ---
df_imputed.to_csv(
    '../../Data/IndoBERT_KNN_Sentiment_Score.csv',
    index=False,
    date_format='%Y-%m-%d',
)

print(df_imputed)

           Date  sentiment
0    2020-01-01        3.0
1    2020-01-02       -2.0
2    2020-01-03        2.0
3    2020-01-04        1.0
4    2020-01-05        1.0
...         ...        ...
1730 2024-09-26        0.0
1731 2024-09-27       -1.2
1732 2024-09-28        0.6
1733 2024-09-29        0.0
1734 2024-09-30       -1.0

[1735 rows x 2 columns]
