Forward Fill, Backward Fill, Linear Interpolation and Mean Imputation of Missing Values in a Dataset

Import the required libraries.

In [3]:
import pandas as pd
import numpy as np

Load the data and perform initial cleanup (a small mock dataset is used if 'sensor_log.csv' is missing)

In [5]:
try:
    df_original = pd.read_csv('sensor_log.csv')
except FileNotFoundError:
    print("Error: 'sensor_log.csv' not found. Please ensure the file is in the correct directory.")
    # No log on disk: fall back to a ten-row stand-in so the rest of the
    # notebook can still run. Each tuple is one reading, laid out as
    # (timestamp, temperature_c, humidity_pct, voltage_v); np.nan marks a gap.
    mock_rows = [
        ('2025-10-01 08:00:00', 24.5, 55.2, 3.7),
        ('2025-10-01 08:00:10', 24.7, 55.0, 3.69),
        ('2025-10-01 08:00:20', 24.6, 55.1, np.nan),
        ('2025-10-01 08:00:30', np.nan, 54.9, 3.68),
        ('2025-10-01 08:01:00', 24.9, 54.8, 3.68),
        ('2025-10-01 08:02:15', 25.1, np.nan, 3.67),
        ('2025-10-01 08:03:00', 25.3, 54.7, 3.67),
        ('2025-10-01 08:05:30', 25.5, 54.9, 3.65),
        ('2025-10-01 08:08:00', np.nan, 55.0, 3.64),
        ('2025-10-01 08:10:00', 26.0, 55.1, 3.63),
    ]
    # Transpose the rows back into the column -> values mapping that
    # pd.DataFrame expects, keeping the original column order.
    data = {
        column: list(values)
        for column, values in zip(
            ('timestamp', 'temperature_c', 'humidity_pct', 'voltage_v'),
            zip(*mock_rows),
        )
    }
    df_original = pd.DataFrame(data)

Convert the timestamp column to datetime and set it as the index

In [6]:
# Parse the timestamps and promote them to the index without mutating the
# loaded frame in place. The guard keeps this cell idempotent: once
# 'timestamp' has become the index it is no longer a column, so re-running
# the cell unguarded would raise KeyError.
if 'timestamp' in df_original.columns:
    df_original = (
        df_original
        .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
        .set_index('timestamp')
    )

# Quick sanity check: first rows plus the per-column count of gaps to fill.
print("--- Original DataFrame Head ---")
print(df_original.head())
print("\n--- Count of Missing Values (NaN) in Original Data ---")
print(df_original.isnull().sum())
print("-" * 50)

--- Original DataFrame Head ---
                     temperature_c  humidity_pct  voltage_v
timestamp                                                  
2025-10-01 08:00:00           24.5          55.2       3.70
2025-10-01 08:00:10           24.7          55.0       3.69
2025-10-01 08:00:20           24.6          55.1        NaN
2025-10-01 08:00:30            NaN          54.9       3.68
2025-10-01 08:01:00           24.9          54.8       3.68

--- Count of Missing Values (NaN) in Original Data ---
temperature_c    2
humidity_pct     1
voltage_v        1
dtype: int64
--------------------------------------------------


Forward Fill

In [7]:
# Forward fill: every gap takes the last valid observation that precedes it.
df_ffill = df_original.ffill()

# Rows 2-4 cover the reading that was missing voltage and the following
# reading that was missing temperature.
print("\n--- Forward Fill (ffill) Result ---", df_ffill.iloc[2:5], sep="\n")


--- Forward Fill (ffill) Result ---
                     temperature_c  humidity_pct  voltage_v
timestamp                                                  
2025-10-01 08:00:20           24.6          55.1       3.69
2025-10-01 08:00:30           24.6          54.9       3.68
2025-10-01 08:01:00           24.9          54.8       3.68


Backward Fill

In [8]:
# Backward fill: every gap takes the next valid observation that follows it.
df_bfill = df_original.bfill()

# Rows 2-4 cover the reading that was missing voltage and the following
# reading that was missing temperature.
print("\n--- Backward Fill (bfill) Result ---", df_bfill.iloc[2:5], sep="\n")


--- Backward Fill (bfill) Result ---
                     temperature_c  humidity_pct  voltage_v
timestamp                                                  
2025-10-01 08:00:20           24.6          55.1       3.68
2025-10-01 08:00:30           24.9          54.9       3.68
2025-10-01 08:01:00           24.9          54.8       3.68


Interpolation (Linear)

In [9]:
# Linear interpolation: each gap is placed on the straight line between its
# neighbouring valid values.
# NOTE: method='linear' ignores the index and treats rows as equally spaced,
# so any uneven spacing between timestamps does not weight the estimate
# (method='time' would interpolate by elapsed time instead).
df_interp = df_original.interpolate(method='linear')

# Rows 2-4 cover the reading that was missing voltage and the following
# reading that was missing temperature.
print("\n--- Linear Interpolation Result ---", df_interp.iloc[2:5], sep="\n")


--- Linear Interpolation Result ---
                     temperature_c  humidity_pct  voltage_v
timestamp                                                  
2025-10-01 08:00:20          24.60          55.1      3.685
2025-10-01 08:00:30          24.75          54.9      3.680
2025-10-01 08:01:00          24.90          54.8      3.680


Mean imputation (filling each missing value with its column's mean)

In [10]:
# Mean imputation: every gap in a column is replaced by that column's average.
# DataFrame.mean() skips NaN by default, so the averages come from the
# observed readings only.
column_means = df_original.mean()
df_mean = df_original.fillna(value=column_means)

print("\n--- Mean Imputation Result ---", df_mean.iloc[2:5], sep="\n")


--- Mean Imputation Result ---
                     temperature_c  humidity_pct  voltage_v
timestamp                                                  
2025-10-01 08:00:20         24.600          55.1   3.667778
2025-10-01 08:00:30         25.075          54.9   3.680000
2025-10-01 08:01:00         24.900          54.8   3.680000


Computing Summary Statistics and Comparison

In [11]:
def get_summary_stats(df, name):
    """Summarise each numeric column of ``df`` with min, max, mean and std.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. Only numeric and boolean columns are aggregated;
        any other column (e.g. strings) is skipped instead of making the
        mean/std aggregation fail.
    name : str
        Label written to the ``Method`` column of every returned row, so that
        results from several frames can be stacked and still told apart.

    Returns
    -------
    pandas.DataFrame
        One row per summarised column of ``df`` with the columns
        ``min``, ``max``, ``mean``, ``std`` (rounded to 3 decimals) and
        ``Method``. When ``df`` has no numeric columns the result is an empty
        frame with those same columns.
    """
    stat_names = ['min', 'max', 'mean', 'std']

    # Restrict to columns the aggregations are defined for.
    numeric = df.select_dtypes(include=['number', 'bool'])
    if numeric.shape[1] == 0:
        # Nothing to aggregate: return an empty result with the usual layout.
        return pd.DataFrame(columns=stat_names + ['Method'])

    # agg() yields one row per statistic; transpose to one row per column.
    stats = numeric.agg(stat_names).T
    # Round results for cleaner display
    stats = stats.round(3)
    # Add a column to identify the method
    stats['Method'] = name
    return stats

Computing stats for all dataframes

In [12]:
# Summarise the raw frame and each imputed variant with the same helper; the
# label identifies the imputation method in the combined comparison table.
stats_original, stats_ffill, stats_bfill, stats_interp, stats_mean = (
    get_summary_stats(frame, label)
    for frame, label in [
        (df_original, "Original (with NaN)"),
        (df_ffill, "Forward Fill"),
        (df_bfill, "Backward Fill"),
        (df_interp, "Interpolation"),
        (df_mean, "Mean Imputation"),
    ]
)

Combining all statistics into one comparison dataframe

In [17]:
# Stack the per-method summaries and put the 'Method' label first so each row
# reads as "<column> / <method> / statistics".
comparison_columns = ['Method', 'min', 'max', 'mean', 'std']
df_comparison = pd.concat(
    [stats_original, stats_ffill, stats_bfill, stats_interp, stats_mean]
)[comparison_columns]

banner = "=" * 50
print("\n" + banner)
print("COMPREHENSIVE SUMMARY STATISTICS COMPARISON")
print(banner)

# to_markdown() renders the frame as a pipe table (requires the optional
# 'tabulate' package).
print(df_comparison.to_markdown())
print("\nNote: The 'Original (with NaN)' mean and std are computed excluding NaN values.")
print("-" * 50)


COMPREHENSIVE SUMMARY STATISTICS COMPARISON
|               | Method              |   min |   max |   mean |   std |
|:--------------|:--------------------|------:|------:|-------:|------:|
| temperature_c | Original (with NaN) | 24.5  |  26   | 25.075 | 0.509 |
| humidity_pct  | Original (with NaN) | 54.7  |  55.2 | 54.967 | 0.158 |
| voltage_v     | Original (with NaN) |  3.63 |   3.7 |  3.668 | 0.023 |
| temperature_c | Forward Fill        | 24.5  |  26   | 25.07  | 0.497 |
| humidity_pct  | Forward Fill        | 54.7  |  55.2 | 54.95  | 0.158 |
| voltage_v     | Forward Fill        |  3.63 |   3.7 |  3.67  | 0.023 |
| temperature_c | Backward Fill       | 24.5  |  26   | 25.15  | 0.542 |
| humidity_pct  | Backward Fill       | 54.7  |  55.2 | 54.94  | 0.171 |
| voltage_v     | Backward Fill       |  3.63 |   3.7 |  3.669 | 0.022 |
| temperature_c | Interpolation       | 24.5  |  26   | 25.11  | 0.513 |
| humidity_pct  | Interpolation       | 54.7  |  55.2 | 54.945 | 0.164 |
| volt