In [11]:
import pandas as pd

# Combine the station files 1.csv ... 20.csv into one series: for every date
# on which ALL stations report a valid reading, average the readings.

STATION_NUMBERS = range(1, 21)          # files are named 1.csv ... 20.csv
INVALID_READING = -99.99                # value the files use to flag invalid data
DATE_PARTS = ['YEAR', 'MONTH', 'DAY']   # columns that identify one calendar day


def load_valid_precipitation(filename):
    """Read one station CSV and return only its valid readings.

    Column names are stripped of surrounding whitespace, rows whose
    'precipitation' equals INVALID_READING are dropped, and the result is
    reduced to the YEAR / MONTH / DAY / precipitation columns.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding the four columns listed above.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from the file.
    """
    df = pd.read_csv(filename)

    # Headers may carry stray spaces; normalise them before column lookups.
    df.columns = df.columns.str.strip()

    # Debugging aid: show the header so a misnamed column is easy to spot.
    print(f"Columns in {filename}: {df.columns}")

    valid_rows = df[df['precipitation'] != INVALID_READING]
    return valid_rows[DATE_PARTS + ['precipitation']].copy()


# `None` (not an empty container) marks "nothing loaded yet". Testing
# `len(...) == 0` would also be true for an inner join that legitimately
# came out empty, and the next file would then silently replace it.
precipitation_data = None

for i in STATION_NUMBERS:
    station_data = load_valid_precipitation(f'{i}.csv')

    if precipitation_data is None:
        precipitation_data = station_data
    else:
        # Inner join keeps only the dates present in every file so far; the
        # incoming reading is renamed to 'precipitation_from_<i>'.
        precipitation_data = pd.merge(precipitation_data, station_data,
                                      on=DATE_PARTS, how='inner', suffixes=('', f'_from_{i}'))

# Average the per-station readings for each date. The lower-case match
# selects 'precipitation' and every 'precipitation_from_<i>' column.
reading_columns = [col for col in precipitation_data.columns if 'precipitation' in col]
precipitation_data['PRECIPITATION'] = precipitation_data[reading_columns].mean(axis=1)

# Build 'DATE' as 'YYYY-MM-DD' text with vectorised string operations
# (month and day zero-padded to two digits) instead of a row-wise apply.
year_text = precipitation_data['YEAR'].astype(int).astype(str)
month_text = precipitation_data['MONTH'].astype(int).astype(str).str.zfill(2)
day_text = precipitation_data['DAY'].astype(int).astype(str).str.zfill(2)
precipitation_data['DATE'] = year_text + '-' + month_text + '-' + day_text

# Keep only the 'DATE' and 'PRECIPITATION' columns
final_data = precipitation_data[['DATE', 'PRECIPITATION']]

# Export the DataFrame to a new CSV file
final_data.to_csv('combined_precipitation_data.csv', index=False)

# Display the combined data preview
print(final_data.head())


Columns in 1.csv: Index(['COOPID', 'YEAR', 'MONTH', 'DAY', 'precipitation'], dtype='object')
Columns in 2.csv: Index(['COOPID', 'YEAR', 'MONTH', 'DAY', 'precipitation'], dtype='object')
Columns in 3.csv: Index(['COOPID', 'YEAR', 'MONTH', 'DAY', 'precipitation'], dtype='object')
Columns in 4.csv: Index(['COOPID', 'YEAR', 'MONTH', 'DAY', 'precipitation'], dtype='object')
Columns in 5.csv: Index(['COOPID', 'YEAR', 'MONTH', 'DAY', 'precipitation'], dtype='object')
Columns in 6.csv: Index(['COOPID', 'YEAR', 'MONTH', 'DAY', 'precipitation'], dtype='object')
Columns in 7.csv: Index(['COOPID', 'YEAR', 'MONTH', 'DAY', 'precipitation'], dtype='object')
Columns in 8.csv: Index(['COOPID', 'YEAR', 'MONTH', 'DAY', 'precipitation'], dtype='object')
Columns in 9.csv: Index(['COOPID', 'YEAR', 'MONTH', 'DAY', 'precipitation'], dtype='object')
Columns in 10.csv: Index(['COOPID', 'YEAR', 'MONTH', 'DAY', 'precipitation'], dtype='object')
Columns in 11.csv: Index(['COOPID', 'YEAR', 'MONTH', 'DAY', 'precipit

In [2]:
import pandas as pd

# Build a placeholder "loss area" table with one row per year-end
# (31 December) from FIRST_YEAR to LAST_YEAR inclusive.
FIRST_YEAR = 2002
LAST_YEAR = 2024
LOSS_AREA_STEP = 100000  # the loss area grows by this much every year

# The year-end dates are constructed explicitly instead of through
# pd.date_range(..., freq="A"): the "A" alias is deprecated in recent pandas
# while its replacement "YE" is rejected by older releases, so spelling the
# dates out keeps the cell working on either.
dates = pd.DatetimeIndex(
    [pd.Timestamp(year=year, month=12, day=31) for year in range(FIRST_YEAR, LAST_YEAR + 1)]
)

# Create the LOSS AREA values (100000, 200000, ..., 2300000)
loss_area_values = [LOSS_AREA_STEP * (i + 1) for i in range(len(dates))]

# Create a DataFrame
data = pd.DataFrame({
    'DATE': dates,
    'LOSS AREA': loss_area_values
})

# Save to CSV
data.to_csv('fake_loss_acreage.csv', index=False)

# Show the first few rows of the DataFrame to verify
print(data.head())


        DATE  LOSS AREA
0 2002-12-31     100000
1 2003-12-31     200000
2 2004-12-31     300000
3 2005-12-31     400000
4 2006-12-31     500000


  dates = pd.date_range(start="2002-01-01", end="2024-12-31", freq="A")


In [5]:
import pandas as pd

# One row per calendar day, 1 January 2002 through 31 December 2024.
dates = pd.date_range(start="2002-01-01", end="2024-12-31", freq="D")

# LOSS AREA is constant within a year and steps up by 100000 each year:
# 100000 for 2002, 200000 for 2003, ..., 2300000 for 2024.
# The arithmetic is done on the whole index at once, then converted back to a
# plain list of Python ints.
years_since_start = dates.year - 2002
loss_area_values = ((years_since_start + 1) * 100000).tolist()

# Assemble the two columns into a table.
data = pd.DataFrame({'DATE': dates, 'LOSS AREA': loss_area_values})

# Persist the table without the row index.
data.to_csv('fake_loss_acreage.csv', index=False)

# Preview the first rows as a sanity check.
print(data.head())


        DATE  LOSS AREA
0 2002-01-01     100000
1 2002-01-02     100000
2 2002-01-03     100000
3 2002-01-04     100000
4 2002-01-05     100000


In [7]:
import pandas as pd

def _read_with_parsed_dates(path):
    """Read a CSV and convert its 'DATE' column from 'YYYY-MM-DD' text to datetime."""
    table = pd.read_csv(path)
    table['DATE'] = pd.to_datetime(table['DATE'], format='%Y-%m-%d')
    return table


# Left-hand table: the cleaned data set.
cleaned_data = _read_with_parsed_dates('cleaned_data.csv')

# Right-hand table: the synthetic loss acreage values.
loss_acreage_data = _read_with_parsed_dates('fake_loss_acreage.csv')

# Left join on DATE keeps every row of the cleaned data and attaches the
# loss acreage wherever the dates match. Remaining gaps are then
# forward-filled from the previous row (dropping those rows with dropna()
# would be the alternative).
# NOTE(review): ffill() runs over every column, so a gap that already existed
# in the cleaned data is filled as well - confirm that this is intended.
merged_data = cleaned_data.merge(loss_acreage_data, on='DATE', how='left').ffill()

# Write the combined table to disk without the row index.
merged_data.to_csv('combined_data.csv', index=False)

# Preview the first rows as a sanity check.
print(merged_data.head())


        DATE  PRECIPITATION  CLOSE PRICE  LOSS AREA
0 2002-01-02         0.2945    92.949997     100000
1 2002-01-03         0.2010    91.349998     100000
2 2002-01-04         0.0000    94.250000     100000
3 2002-01-07         0.0340    93.500000     100000
4 2002-01-08         0.0010    93.500000     100000
