In [1]:
# Import necessary libraries
import pandas as pd
import os

In [2]:
# Get the current working directory
current_script_dir = os.getcwd()

# Define the file path dynamically
historical_data_path = os.path.join(current_script_dir, "..", "data", "historical_crypto_reddit_merged_historical", "merged_crypto_reddit_historical_data.csv")

# Normalize the path for consistency
historical_data_path = os.path.normpath(historical_data_path)

# Load historical data
historical_data = pd.read_csv(historical_data_path)

print(f"Historical data loaded from: {historical_data_path}")

Historical data loaded from: /Users/vanditgupta/Vandit/Github/Projects/CryptoForecastPro/data/historical_crypto_reddit_merged_historical/merged_crypto_reddit_historical_data.csv


In [3]:
# Load historical data
historical_data = pd.read_csv(historical_data_path)

In [4]:
# Display basic information
print("Historical Data Overview:")
print(historical_data.info())

Historical Data Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8318 entries, 0 to 8317
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Date             8318 non-null   object 
 1   Symbol           8318 non-null   object 
 2   Open             8318 non-null   float64
 3   High             8318 non-null   float64
 4   Low              8318 non-null   float64
 5   Close            8318 non-null   float64
 6   Sentiment_Score  8318 non-null   float64
 7   Score            8318 non-null   int64  
 8   Comments         8318 non-null   int64  
 9   Title            8318 non-null   object 
 10  Content          8318 non-null   object 
 11  Row_Count        8318 non-null   int64  
 12  Sentiment_Label  8318 non-null   object 
 13  Sentiment_Lag_1  8308 non-null   float64
 14  Score_Lag_1      8308 non-null   float64
 15  Comments_Lag_1   8308 non-null   float64
 16  Sentiment_Lag_3  8288 non-null   f

In [5]:
# Check for missing values
print("\nMissing Values in Historical Data:")
print(historical_data.isnull().sum())


Missing Values in Historical Data:
Date                0
Symbol              0
Open                0
High                0
Low                 0
Close               0
Sentiment_Score     0
Score               0
Comments            0
Title               0
Content             0
Row_Count           0
Sentiment_Label     0
Sentiment_Lag_1    10
Score_Lag_1        10
Comments_Lag_1     10
Sentiment_Lag_3    30
Score_Lag_3        30
Comments_Lag_3     30
Sentiment_Lag_7    70
Score_Lag_7        70
Comments_Lag_7     70
dtype: int64


In [6]:
# Fill missing values
historical_data['Symbol'].fillna('Unknown', inplace=True)
historical_data.fillna(0, inplace=True)  # Replace other missing values with 0 for simplicity


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  historical_data['Symbol'].fillna('Unknown', inplace=True)


In [7]:
# Validate date column
historical_data['Date'] = pd.to_datetime(historical_data['Date'], errors='coerce')
print("\nInvalid Dates in Historical Data:")
print(historical_data[historical_data['Date'].isnull()])


Invalid Dates in Historical Data:
Empty DataFrame
Columns: [Date, Symbol, Open, High, Low, Close, Sentiment_Score, Score, Comments, Title, Content, Row_Count, Sentiment_Label, Sentiment_Lag_1, Score_Lag_1, Comments_Lag_1, Sentiment_Lag_3, Score_Lag_3, Comments_Lag_3, Sentiment_Lag_7, Score_Lag_7, Comments_Lag_7]
Index: []

[0 rows x 22 columns]


In [8]:
# Ensure numeric columns are valid
numeric_columns = ['Open', 'High', 'Low', 'Close', 'Sentiment_Score', 'Score', 'Comments']
for col in numeric_columns:
    historical_data[col] = pd.to_numeric(historical_data[col], errors='coerce')


In [9]:
# Drop rows with invalid numeric data
historical_data.dropna(subset=numeric_columns, inplace=True)

In [10]:
# Ensure Sentiment_Label contains valid categories
valid_labels = ['Positive', 'Neutral', 'Negative']
historical_data['Sentiment_Label'] = historical_data['Sentiment_Label'].where(
    historical_data['Sentiment_Label'].isin(valid_labels), 'Neutral'
)

In [11]:
# Remove duplicates
historical_data.drop_duplicates(inplace=True)

In [12]:
# Define the output file path dynamically
cleaned_historical_data_path = os.path.join(current_script_dir, "..", "data", "historical_crypto_reddit_merged_historical", "cleaned_historical_data.csv")

# Normalize the path for consistency
cleaned_historical_data_path = os.path.normpath(cleaned_historical_data_path)

# Save the cleaned historical data
historical_data.to_csv(cleaned_historical_data_path, index=False)
print(f"Cleaned historical data saved to '{cleaned_historical_data_path}'")


Cleaned historical data saved to '/Users/vanditgupta/Vandit/Github/Projects/CryptoForecastPro/data/historical_crypto_reddit_merged_historical/cleaned_historical_data.csv'
