In [1]:
import pandas as pd

# Load the raw integrated dataset. The path is relative, so it is resolved against
# the kernel's working directory; this assumes a 'data' folder next to the notebook.
file_path = 'data/raw_integrated_data.csv'
# Parse 'timestamp' into datetimes at load time; later cells rely on the .dt
# accessor and on date comparisons against this column.
# NOTE(review): the cell output echoes this read_csv line, which looks like a
# pandas warning raised while reading (possibly mixed column dtypes or date-format
# inference) — TODO confirm and pass explicit dtype/format options if so.
df = pd.read_csv(file_path, parse_dates=['timestamp'])


  df = pd.read_csv(file_path, parse_dates=['timestamp'])


In [2]:
# Basic structure and data types.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly: wrapping it in print() appends a stray "None" line.
df.info()

# Preliminary statistics
print(df.describe())

# Quick assessment of missing values per column
print(df.isnull().sum())

# Preview the dataset
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75778 entries, 0 to 75777
Data columns (total 57 columns):
 #   Column                                              Non-Null Count  Dtype         
---  ------                                              --------------  -----         
 0   timestamp                                           75778 non-null  datetime64[ns]
 1   low                                                 75778 non-null  float64       
 2   high                                                75778 non-null  float64       
 3   open                                                75778 non-null  float64       
 4   close                                               75778 non-null  float64       
 5   volume                                              75778 non-null  float64       
 6   sma_30                                              75778 non-null  float64       
 7   rsi                                                 75778 non-null  float64       
 8   macd  

# Helper Functions

In [4]:
def first_measurement_time(df):
    """
    Finds, for each variable, the earliest timestamp at which it has a value.

    Parameters:
    df (pd.DataFrame): Dataframe with a 'timestamp' column plus one column per variable.

    Returns:
    dict: Maps a column name to the smallest 'timestamp' among the rows where that
    column is non-null. Object-dtype columns, the 'timestamp' column itself and
    columns without any non-null value are left out.
    """
    earliest_by_column = {}
    for name, values in df.items():
        # 'timestamp' is the time axis and object columns are not measurements
        if name == 'timestamp' or values.dtype == 'object':
            continue
        observed_times = df.loc[values.notnull(), 'timestamp']
        if len(observed_times) > 0:
            earliest_by_column[name] = observed_times.min()
    return earliest_by_column


In [5]:
import numpy as np

def average_interval_between_measurements(df):
    """
    Computes the mean time gap between consecutive measurements of each variable.

    For every column, the timestamps of its non-null rows are put in
    chronological order and the differences between neighbours are averaged,
    so the result does not depend on the row order of the input.

    Parameters:
    df (pd.DataFrame): Dataframe with a 'timestamp' column plus one column per variable.

    Returns:
    dict: Maps a column name to the mean gap as a pd.Timedelta (no unit conversion
    is applied), or to np.nan when the column has fewer than two non-null values.
    Object-dtype columns and the 'timestamp' column itself are left out.
    """
    average_intervals = {}
    for column in df.columns:
        # Ignoring non-numeric and timestamp columns for "average interval" context
        if df[column].dtype == 'object' or column == 'timestamp':
            continue
        # Sort first: diff() works on neighbouring rows, so unsorted timestamps
        # would yield negative or otherwise meaningless gaps
        measurement_times = df.loc[df[column].notnull(), 'timestamp'].sort_values()
        if measurement_times.shape[0] > 1:
            # The first diff is always NaT (no predecessor) and is dropped
            average_intervals[column] = measurement_times.diff().dropna().mean()
        else:
            average_intervals[column] = np.nan  # Indicates a single measurement or none
    return average_intervals


In [6]:
# Call the functions on our dataset
first_measurement_times = first_measurement_time(df)
average_intervals = average_interval_between_measurements(df)

# Convert the outputs to DataFrames for easy sharing
first_measurement_table = pd.DataFrame(list(first_measurement_times.items()), columns=['Variable', 'First Measurement Time'])
average_interval_table = pd.DataFrame(list(average_intervals.items()), columns=['Variable', 'Average Interval Between Measurements'])

# Display the tables
print("First Measurement Time for Each Variable:")
print(first_measurement_table)
print("\nAverage Interval Between Measurements for Each Variable:")
print(average_interval_table)


First Measurement Time for Each Variable:
                                             Variable First Measurement Time
0                                                 low    2015-07-22 02:00:00
1                                                high    2015-07-22 02:00:00
2                                                open    2015-07-22 02:00:00
3                                               close    2015-07-22 02:00:00
4                                              volume    2015-07-22 02:00:00
5                                              sma_30    2015-07-22 02:00:00
6                                                 rsi    2015-07-22 02:00:00
7                                                macd    2015-07-22 02:00:00
8                                            dia_open    2015-07-22 06:00:00
9                                            dia_high    2015-07-22 06:00:00
10                                            dia_low    2015-07-22 06:00:00
11                                

In [7]:
def are_timestamps_round_hours(df):
    """
    Checks if all timestamps in the 'timestamp' column of the dataframe are on a round hour.

    A timestamp counts as a round hour only when its minute, second, microsecond
    and nanosecond components are all zero, so sub-second offsets are rejected too.

    Parameters:
    df (pd.DataFrame): Dataframe with a datetime 'timestamp' column.

    Returns:
    bool: True if all timestamps are on a round hour, False otherwise.
    """
    components = df['timestamp'].dt

    # Every component below the hour has to be zero, including the sub-second ones
    all_round_hours = (
        (components.minute == 0)
        & (components.second == 0)
        & (components.microsecond == 0)
        & (components.nanosecond == 0)
    )

    # Convert the numpy boolean to the plain bool promised by the docstring
    return bool(all_round_hours.all())

# Run the round-hour check on the loaded dataset and report the outcome
result = are_timestamps_round_hours(df)
print("All timestamps are on a round hour:", result)


All timestamps are on a round hour: True


In [8]:
def forward_fill_from_first_measurement(df):
    """
    Carries each variable's last observed value forward over its missing rows.

    Rows that come before a column's first non-null value stay missing, since
    a forward fill has nothing to carry into them. The input is not modified.

    Parameters:
    df (pd.DataFrame): The DataFrame with variables across different intervals.

    Returns:
    pd.DataFrame: A copy of the input in which every column except 'timestamp'
    has been forward filled.
    """
    result = df.copy()
    # The 'timestamp' column is the time axis and is never filled
    value_columns = [name for name in result.columns if name != 'timestamp']
    if value_columns:
        result[value_columns] = result[value_columns].ffill()
    return result

# Forward fill the loaded dataset; df itself is left untouched because the helper works on a copy
filled_df = forward_fill_from_first_measurement(df)

# Count missing values for every column

In [9]:
# Per-column count of values that are still missing once the forward fill is done
missing_values_after_fill = filled_df.isna().sum()

# Heading first, then the counts
print("Missing values for each column after forward fill:", missing_values_after_fill, sep="\n")


Missing values for each column after forward fill:
timestamp                                                 0
low                                                       0
high                                                      0
open                                                      0
close                                                     0
volume                                                    0
sma_30                                                    0
rsi                                                       0
macd                                                      0
dia_open                                                  4
dia_high                                                  4
dia_low                                                   4
dia_close                                                 4
dia_volume                                                4
gld_open                                                  2
gld_high                                         

In [None]:
# To avoid having to backfill missing values, keep only events from 2016 onward

In [10]:
# Keep only rows whose 'timestamp' (the datetime column of filled_df) is on or
# after 2016-01-01; the comparison is inclusive, so midnight on that date is kept.
filled_df_filtered = filled_df[filled_df['timestamp'] >= '2016-01-01']

In [11]:
# Basic structure and data types.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly: wrapping it in print() appends a stray "None" line.
filled_df_filtered.info()

# Preliminary statistics
print(filled_df_filtered.describe())

# Quick assessment of missing values per column
print(filled_df_filtered.isnull().sum())

# Preview the dataset
print(filled_df_filtered.head())


<class 'pandas.core.frame.DataFrame'>
Index: 71868 entries, 3910 to 75777
Data columns (total 57 columns):
 #   Column                                              Non-Null Count  Dtype         
---  ------                                              --------------  -----         
 0   timestamp                                           71868 non-null  datetime64[ns]
 1   low                                                 71868 non-null  float64       
 2   high                                                71868 non-null  float64       
 3   open                                                71868 non-null  float64       
 4   close                                               71868 non-null  float64       
 5   volume                                              71868 non-null  float64       
 6   sma_30                                              71868 non-null  float64       
 7   rsi                                                 71868 non-null  float64       
 8   macd    

# Final validation checks

In [13]:
# Identifying and reporting duplicate rows
duplicates_all = filled_df_filtered.duplicated().sum()
timestamp_duplicates = filled_df_filtered.duplicated(subset=['timestamp']).sum()
print(f"Total duplicate rows: {duplicates_all}")
print(f"Duplicate timestamps: {timestamp_duplicates}")

# Identifying rows with negative values for 'volume' and 'close'
negative_volume_count = filled_df_filtered[filled_df_filtered['volume'] < 0].shape[0]
negative_close_count = filled_df_filtered[filled_df_filtered['close'] < 0].shape[0]
print(f"Rows with negative volume: {negative_volume_count}")
print(f"Rows with negative close: {negative_close_count}")


Total duplicate rows: 0
Duplicate timestamps: 905
Rows with negative volume: 0
Rows with negative close: 0


In [14]:
# Identifying duplicate timestamps and selecting the rows
duplicates_by_timestamp = filled_df_filtered[filled_df_filtered.duplicated(subset=['timestamp'], keep=False)]

# Sorting by 'timestamp' to ensure duplicates are grouped together for viewing
sorted_duplicates = duplicates_by_timestamp.sort_values(by='timestamp')

# Displaying the first 100 duplicates
print(sorted_duplicates.head(100))


                timestamp       low      high      open     close  \
64866 2022-12-16 14:00:00  16895.69  17085.53  16947.31  17034.07   
64867 2022-12-16 14:00:00  16895.69  17085.53  16947.31  17034.07   
64868 2022-12-16 14:00:00  16895.69  17085.53  16947.31  17034.07   
64869 2022-12-16 15:00:00  16912.63  17035.93  17033.38  16972.18   
64870 2022-12-16 15:00:00  16912.63  17035.93  17033.38  16972.18   
...                   ...       ...       ...       ...       ...   
64999 2022-12-19 14:00:00  16667.90  16765.23  16710.66  16736.58   
65002 2022-12-19 14:00:00  16667.90  16765.23  16710.66  16736.58   
65007 2022-12-19 15:00:00  16664.32  16742.31  16736.58  16666.54   
65008 2022-12-19 15:00:00  16664.32  16742.31  16736.58  16666.54   
65009 2022-12-19 15:00:00  16664.32  16742.31  16736.58  16666.54   

            volume        sma_30        rsi        macd  dia_open  ...  value  \
64866  3912.720814  17366.694667  33.864182 -148.420783   320.032  ...   29.0   
64867  39

In [15]:
# Assuming duplicates_by_timestamp contains all rows with duplicate timestamps
if not duplicates_by_timestamp.empty:
    first_duplicate_timestamp = duplicates_by_timestamp['timestamp'].min()
    last_duplicate_timestamp = duplicates_by_timestamp['timestamp'].max()
    print(f"First duplicate timestamp: {first_duplicate_timestamp}")
    print(f"Last duplicate timestamp: {last_duplicate_timestamp}")
else:
    print("No duplicates found.")


First duplicate timestamp: 2022-12-16 14:00:00
Last duplicate timestamp: 2023-12-31 23:00:00


In [None]:
# This suggests the duplicates were introduced by merging the BTC news dataset (which starts in 2022 and ends in 2023)

In [16]:
# Group the DataFrame by 'timestamp' and check if all values within each group are identical
duplicate_groups = duplicates_by_timestamp.groupby('timestamp')

# Initialize a dictionary to hold the result
duplicate_analysis = {'timestamp': [], 'identical_values': []}

for name, group in duplicate_groups:
    # Assuming at least one duplicate (i.e., group size > 1), compare all rows in the group
    if group.shape[0] > 1:
        # Drop the 'timestamp' column for comparison, as we already know it's identical within the group
        without_timestamp = group.drop(columns='timestamp')
        # Compare all rows in the group (excluding the timestamp) to the first row
        identical_rows = without_timestamp.eq(without_timestamp.iloc[0]).all().all()
        # Record the analysis
        duplicate_analysis['timestamp'].append(name)
        duplicate_analysis['identical_values'].append(identical_rows)

# Convert the analysis result to a DataFrame for easy viewing
analysis_df = pd.DataFrame(duplicate_analysis)

# Check the analysis DataFrame to see the result
print(analysis_df)


              timestamp  identical_values
0   2022-12-16 14:00:00             False
1   2022-12-16 15:00:00             False
2   2022-12-16 16:00:00             False
3   2022-12-16 18:00:00             False
4   2022-12-16 20:00:00             False
..                  ...               ...
347 2023-12-31 05:00:00             False
348 2023-12-31 11:00:00             False
349 2023-12-31 12:00:00             False
350 2023-12-31 20:00:00             False
351 2023-12-31 23:00:00             False

[352 rows x 2 columns]


### We keep 2 datasets, as news was grouped per hour. The non-identical values (`identical_values` is False for every duplicated timestamp) mean different articles were published in the same hour.

In [21]:
# Save the DataFrame with duplicates
filled_df_filtered.to_csv('data/filled_df_filtered_with_duplicates.csv', index=False)

# Remove duplicate timestamps, keeping the first occurrence
# This results in a dataset without duplicates where for each timestamp, only the first occurrence is kept
filled_df_filtered_no_duplicates = filled_df_filtered.drop_duplicates(subset=['timestamp'], keep='first')

# Save the DataFrame without duplicates
# filled_df_filtered_no_duplicates.csv for the dataset with duplicates removed,
# keeping only the first occurrence of each timestamp.
filled_df_filtered_no_duplicates.to_csv('data/filled_df_filtered_no_duplicates.csv', index=False)


# Verification

In [22]:
# Check for duplicate timestamps in the no duplicates DataFrame
duplicate_timestamps_check = filled_df_filtered_no_duplicates.duplicated(subset=['timestamp']).sum()

# Print the result to confirm if duplicates exist
print(f"Number of duplicate timestamps in the no duplicates DataFrame: {duplicate_timestamps_check}")


Number of duplicate timestamps in the no duplicates DataFrame: 0
