In [1]:
# Import necessary libraries
import pandas as pd
import os

In [2]:
# Define the current date manually
CURRENT_DATE = "2024-12-20"

# Get the current working directory
current_script_dir = os.getcwd()

# Construct the daily data file path by going two folders back and locating the file
daily_data_path = os.path.join(current_script_dir, "..", "..", f"data/daily_crypto_reddit_merged/{CURRENT_DATE}/merged_crypto_reddit_data.csv")

# Normalize the path for consistency
daily_data_path = os.path.normpath(daily_data_path)

print(f"Daily data path: {daily_data_path}")

Daily data path: /Users/vanditgupta/Vandit/Github/Projects/CryptoForecastPro/data/daily_crypto_reddit_merged/2024-12-20/merged_crypto_reddit_data.csv


In [3]:
# Load data
daily_data = pd.read_csv(daily_data_path)

In [4]:
# Display basic information
print("Daily Data Overview:")
print(daily_data.info())

Daily Data Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Date             1 non-null      object 
 1   Symbol           1 non-null      object 
 2   Open             1 non-null      float64
 3   High             1 non-null      float64
 4   Low              1 non-null      float64
 5   Close            1 non-null      float64
 6   Sentiment_Score  1 non-null      float64
 7   Score            1 non-null      int64  
 8   Comments         1 non-null      int64  
 9   Title            1 non-null      object 
 10  Content          1 non-null      object 
 11  Row_Count        1 non-null      int64  
 12  Sentiment_Label  1 non-null      object 
 13  Sentiment_Lag_1  0 non-null      float64
 14  Score_Lag_1      0 non-null      float64
 15  Comments_Lag_1   0 non-null      float64
 16  Sentiment_Lag_3  0 non-null      float64
 17 

In [5]:
# Check for missing values
print("\nMissing Values in Daily Data:")
print(daily_data.isnull().sum())


Missing Values in Daily Data:
Date               0
Symbol             0
Open               0
High               0
Low                0
Close              0
Sentiment_Score    0
Score              0
Comments           0
Title              0
Content            0
Row_Count          0
Sentiment_Label    0
Sentiment_Lag_1    1
Score_Lag_1        1
Comments_Lag_1     1
Sentiment_Lag_3    1
Score_Lag_3        1
Comments_Lag_3     1
Sentiment_Lag_7    1
Score_Lag_7        1
Comments_Lag_7     1
dtype: int64


In [6]:
# Fill missing values
daily_data['Symbol'].fillna('Unknown', inplace=True)
daily_data.fillna(0, inplace=True) 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  daily_data['Symbol'].fillna('Unknown', inplace=True)


In [7]:
# Validate date column
daily_data['Date'] = pd.to_datetime(daily_data['Date'], errors='coerce')
print("\nInvalid Dates in Daily Data:")
print(daily_data[daily_data['Date'].isnull()])


Invalid Dates in Daily Data:
Empty DataFrame
Columns: [Date, Symbol, Open, High, Low, Close, Sentiment_Score, Score, Comments, Title, Content, Row_Count, Sentiment_Label, Sentiment_Lag_1, Score_Lag_1, Comments_Lag_1, Sentiment_Lag_3, Score_Lag_3, Comments_Lag_3, Sentiment_Lag_7, Score_Lag_7, Comments_Lag_7]
Index: []

[0 rows x 22 columns]


In [8]:
# Ensure numeric columns are valid
numeric_columns = ['Open', 'High', 'Low', 'Close', 'Sentiment_Score', 'Score', 'Comments']
for col in numeric_columns:
    daily_data[col] = pd.to_numeric(daily_data[col], errors='coerce')

In [9]:
# Drop rows with invalid numeric data
daily_data.dropna(subset=numeric_columns, inplace=True)

In [10]:
# Ensure Sentiment_Label contains valid categories
valid_labels = ['Positive', 'Neutral', 'Negative']
daily_data['Sentiment_Label'] = daily_data['Sentiment_Label'].where(
    daily_data['Sentiment_Label'].isin(valid_labels), 'Neutral'
)

In [11]:
# Remove duplicates
daily_data.drop_duplicates(inplace=True)

In [12]:
# Define the file paths dynamically
cleaned_daily_data_path = os.path.join(current_script_dir, "..", "..", f"data/daily_crypto_reddit_merged/{CURRENT_DATE}/cleaned_daily_data.csv")

# Normalize the path for consistency
cleaned_daily_data_path = os.path.normpath(cleaned_daily_data_path)

# Save the cleaned daily data
daily_data.to_csv(cleaned_daily_data_path, index=False)
print(f"Cleaned daily data saved to '{cleaned_daily_data_path}'")


Cleaned daily data saved to '/Users/vanditgupta/Vandit/Github/Projects/CryptoForecastPro/data/daily_crypto_reddit_merged/2024-12-20/cleaned_daily_data.csv'
