
1. **Data Loading:**
   Read the Biomass_History.csv and Distance_Matrix.csv files to load the historical biomass availability and travel distance data into Python.


In [92]:
import pandas as pd

# Load Biomass_History.csv
biomass_history = pd.read_csv("../data/Biomass_History.csv")

# Load Distance_Matrix.csv
distance_matrix = pd.read_csv("../data/Distance_Matrix.csv")


2. **Data Preprocessing:**
   Processing the data to handle missing values and organize it for further analysis.


In [93]:
import pandas as pd

# List of columns to be combined (2010 to 2017)
columns_to_combine = ['2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017']

# Melt the DataFrame to combine the columns into a single column while retaining the year information
biomass_data = pd.melt(biomass_history, id_vars=['Index', 'Longitude', 'Latitude'], value_vars=columns_to_combine, var_name='Year', value_name='Value')

# Display the updated DataFrame
print(biomass_data)



       Index  Longitude  Latitude  Year      Value
0          0   71.33144  24.66818  2010   8.475744
1          1   71.41106  24.66818  2010  24.029778
2          2   71.49069  24.66818  2010  44.831635
3          3   71.57031  24.66818  2010  59.974419
4          4   71.64994  24.66818  2010  14.653370
...      ...        ...       ...   ...        ...
19339   2413   72.84432  20.15456  2017   5.321604
19340   2414   72.92394  20.15456  2017   0.120626
19341   2415   73.00357  20.15456  2017   0.038879
19342   2416   73.08319  20.15456  2017   1.304297
19343   2417   73.16282  20.15456  2017   0.226953

[19344 rows x 5 columns]


In [94]:
distance_matrix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2418 entries, 0 to 2417
Columns: 2419 entries, Unnamed: 0 to 2417
dtypes: float64(2418), int64(1)
memory usage: 44.6 MB


3. **Biomass Forecasting: Using Linear Regression**
   Use time-series forecasting methods like ARIMA or Prophet to predict biomass availability for the years 2018 and 2019.


4. **Optimal Asset Locations:**
   Implement an optimization algorithm (e.g., Genetic Algorithm, Simulated Annealing, or Linear Programming) to find the optimal locations for preprocessing depots and biorefineries based on the forecasted biomass data and distance matrix.


In [95]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression


# Assuming 'biomass_data' contains columns: 'Year', 'Longitude', 'Latitude', and 'Value'
# Convert the 'Year' column to numeric
biomass_data['Year'] = pd.to_numeric(biomass_data['Year'])

# Limit the data to only the first 500 rows (index 0 to 499)
#biomass_data = biomass_data.iloc[0:50]
# Group the data by 'Longitude' and 'Latitude'
groups = biomass_data.groupby(['Longitude', 'Latitude'])

# Prepare the output DataFrame and save to submission.csv
header = ["year", "data_type", "source_index", "destination_index", "value"]
output_data = pd.DataFrame(columns=header)

# Iterate through each group, train the model, and make predictions
for idx, (group_name, group_data) in enumerate(groups):
    longitude, latitude = group_name
    train_data = group_data[group_data['Year'] <= 2017]
    test_data = group_data[group_data['Year'] >= 2018]

    # Create a linear regression model
    model = LinearRegression()

    # Fit the model to the training data
    model.fit(train_data[['Year']], train_data['Value'])

    # Predict biomass availability for the years 2018 and 2019
    years_to_predict = [2018, 2019]
    predictions = model.predict(np.array(years_to_predict).reshape(-1, 1))

    # Save the predictions for depot_location and refinery_location
    output_data = output_data.append({"year": "20182019", "data_type": "depot_location", "source_index": idx, "destination_index": '', "value": None}, ignore_index=True)
    output_data = output_data.append({"year": "20182019", "data_type": "refinery_location", "source_index": idx, "destination_index": '', "value": None}, ignore_index=True)

    # Save the predictions for biomass_forecast
    for year, prediction in zip(years_to_predict, predictions.flatten()):
        output_data = output_data.append({"year": year, "data_type": "biomass_forecast", "source_index": idx, "destination_index": '', "value": prediction}, ignore_index=True)

# Save the output to solution.csv with header
output_data.to_csv('../data/prediction.csv', index=False)


  output_data = output_data.append({"year": "20182019", "data_type": "depot_location", "source_index": idx, "destination_index": '', "value": None}, ignore_index=True)
  output_data = output_data.append({"year": "20182019", "data_type": "refinery_location", "source_index": idx, "destination_index": '', "value": None}, ignore_index=True)
  output_data = output_data.append({"year": year, "data_type": "biomass_forecast", "source_index": idx, "destination_index": '', "value": prediction}, ignore_index=True)
  output_data = output_data.append({"year": year, "data_type": "biomass_forecast", "source_index": idx, "destination_index": '', "value": prediction}, ignore_index=True)
  output_data = output_data.append({"year": "20182019", "data_type": "depot_location", "source_index": idx, "destination_index": '', "value": None}, ignore_index=True)
  output_data = output_data.append({"year": "20182019", "data_type": "refinery_location", "source_index": idx, "destination_index": '', "value": None}, i

5. **Submission:**
   Prepare the output in the desired format and save it to the sample_submission.csv file.


In [96]:
import pandas as pd

# Read the data from solution.csv
submission = pd.read_csv('../data/prediction.csv')

# Sort the data based on data_type column (depot_location, refinery_location, biomass_forecast)
# and sort the 'year' column in ascending order (2018 comes before 2019)
submission = submission.sort_values(by=['data_type', 'year'], key=lambda x: x.map({'depot_location': 1, 'refinery_location': 2, 'biomass_forecast': 3}))

# Separate depot_location data
depot_location_submission = submission[submission['data_type'] == 'depot_location']

# Separate refinery_location data
refinery_location_submission = submission[submission['data_type'] == 'refinery_location']

# Separate biomass_forecast data and sort by year in ascending order
biomass_forecast_submission = submission[submission['data_type'] == 'biomass_forecast']
biomass_forecast_submission = biomass_forecast_submission.sort_values(by='year')

# Concatenate the dataframes with depot_location, refinery_location, and biomass_forecast in desired order
submission = pd.concat([depot_location_submission, refinery_location_submission, biomass_forecast_submission])

# Save the sorted data to solution.csv with header
submission.to_csv('../data/submission.csv', index=False)
