<a href="https://colab.research.google.com/github/VittorioBartolomeoSecondin/DVIS-CAValli_Team/blob/main/Temperatures.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data exploration and preprocessing

## Importing libraries and connecting to Google Drive

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import glob
import re
import json
import csv
import requests
import os
import zipfile
from google.colab import files
from google.colab import drive
# Mount Google Drive under /content/drive: this turned out to be a good workaround
# to load a huge amount of data and keep it available to the notebook.
drive.mount('/content/drive')

Mounted at /content/drive


## Reading the data

In [96]:
# Collect every .txt file of the final dataset folder, in alphabetical order.
all_files = sorted(glob.glob("/content/drive/MyDrive/tree_dataset/final_dataset/*.txt"))
print(all_files)

['/content/drive/MyDrive/tree_dataset/final_dataset/climdiv-tmaxst-v1.0.0-20231106.txt', '/content/drive/MyDrive/tree_dataset/final_dataset/climdiv-tminst-v1.0.0-20231106.txt', '/content/drive/MyDrive/tree_dataset/final_dataset/climdiv-tmpcst-v1.0.0-20231106.txt']


In [97]:
# Layout of every input file: the 'Code' column followed by the twelve monthly values (in F).
monthly_columns = ['JanF', 'FebF', 'MarF', 'AprF', 'MayF', 'JunF', 'JulF', 'AugF', 'SepF', 'OctF', 'NovF', 'DecF']
column_names = ['Code'] + monthly_columns

# 'Code' is read as text, the twelve monthly columns are read as floats.
column_data_types = {'Code': str, **dict.fromkeys(monthly_columns, float)}

def read_temperature_file(path):
  '''
  path = whitespace-separated, header-less temperature file
  returns the file content as a DataFrame with the columns listed in column_names
  '''
  return pd.read_csv(path, delimiter = r'\s+', header = None, names = column_names, dtype = column_data_types)

# all_files is sorted, so the order is: tmaxst (max), tminst (min), tmpcst (avg).
max_dataset = read_temperature_file(all_files[0])
min_dataset = read_temperature_file(all_files[1])
avg_dataset = read_temperature_file(all_files[2])

## Preparing the datasets for the export

Replace the missing-value sentinel -99.9 with NaN

In [98]:
# -99.9 marks a missing value in the source files: turn it into NaN in all three datasets.
for temperature_dataset in (max_dataset, min_dataset, avg_dataset):
  temperature_dataset.replace(-99.9, np.nan, inplace=True)

Convert the temperatures from Fahrenheit (F) to Celsius (C), rounded to one decimal place

In [99]:
months_F = column_names[1:]
months_C = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# For every dataset, add one Celsius column per month next to the Fahrenheit one.
for temperature_dataset in (max_dataset, min_dataset, avg_dataset):
  for month_F, month_C in zip(months_F, months_C):
    fahrenheit = temperature_dataset[month_F]
    # C = (F - 32) * 5 / 9, kept with one decimal digit
    temperature_dataset[month_C] = ((fahrenheit - 32) * 5 / 9).round(1)

Extract *state_code* and *year* from *Code*

In [100]:
# 'Code' is a string: its first three characters are the state code,
# everything from the seventh character onwards is the year.
for temperature_dataset in (max_dataset, min_dataset, avg_dataset):
  code = temperature_dataset['Code']
  temperature_dataset['state_code'] = code.str.slice(stop=3)
  temperature_dataset['year'] = code.str.slice(start=6)

Create a dictionary with *state_code* as key and the corresponding *state* name as value

In [101]:
data = """001 Alabama
002 Arizona
003 Arkansas
004 California
005 Colorado
006 Connecticut
007 Delaware
008 Florida
009 Georgia
010 Idaho
011 Illinois
012 Indiana
013 Iowa
014 Kansas
015 Kentucky
016 Louisiana
017 Maine
018 Maryland
019 Massachusetts
020 Michigan
021 Minnesota
022 Mississippi
023 Missouri
024 Montana
025 Nebraska
026 Nevada
027 New Hampshire
028 New Jersey
029 New Mexico
030 New York
031 North Carolina
032 North Dakota
033 Ohio
034 Oklahoma
035 Oregon
036 Pennsylvania
037 Rhode Island
038 South Carolina
039 South Dakota
040 Tennessee
041 Texas
042 Utah
043 Vermont
044 Virginia
045 Washington
046 West Virginia
047 Wisconsin
048 Wyoming
050 Alaska
101 Northeast Region
102 East North Central Region
103 Central Region
104 Southeast Region
105 West North Central Region
106 South Region
107 Southwest Region
108 Northwest Region
109 West Region
110 National (contiguous 48 States)
111 Great Plains
115 Southern Plains and Gulf Coast
120 US Rockies and Westward
121 NWS Eastern Region
122 NWS Southern Region
123 NWS Central Region
124 NWS Western Region
201 Pacific Northwest Basin
202 California River Basin
203 Great Basin
204 Lower Colorado River Basin
205 Upper Colorado River Basin
206 Rio Grande River Basin
207 Texas Gulf Coast River Basin
208 Arkansas-White-Red Basin
209 Lower Mississippi River Basin
210 Missouri River Basin
211 Souris-Red-Rainy Basin
212 Upper Mississippi River Basin
213 Great Lakes Basin
214 Tennessee River Basin
215 Ohio River Basin
216 South Atlantic-Gulf Basin
217 Mid-Atlantic Basin
218 New England Basin
220 Mississippi River Basin & Tributaties (N. of Memphis, TN)
250 Spring Wheat Belt (area weighted)
255 Primary Hard Red Winter Wheat Belt (area weighted)
256 Winter Wheat Belt (area weighted)
260 Primary Corn and Soybean Belt (area weighted)
261 Corn Belt (area weighted)
262 Soybean Belt (area weighted)
265 Cotton Belt (area weighted)
350 Spring Wheat Belt (productivity weighted)
356 Winter Wheat Belt (productivity weighted)
361 Corn Belt (productivity weighted)
362 Soybean Belt (productivity weighted)
365 Cotton Belt (productivity weighted)
450 Spring Wheat Belt (% productivity in the Palmer Z Index)
456 Winter Wheat Belt (% productivity in the Palmer Z Index)
461 Corn Belt (% productivity in the Palmer Z Index)
462 Soybean Belt (% productivity in the Palmer Z Index)
465 Cotton Belt (% productivity in the Palmer Z Index)"""

# One entry per line: a three-character code, one separator character, then the name.
lines = data.split('\n')
code_to_state = dict((entry[:3], entry[4:].strip()) for entry in lines)
print(code_to_state)

{'001': 'Alabama', '002': 'Arizona', '003': 'Arkansas', '004': 'California', '005': 'Colorado', '006': 'Connecticut', '007': 'Delaware', '008': 'Florida', '009': 'Georgia', '010': 'Idaho', '011': 'Illinois', '012': 'Indiana', '013': 'Iowa', '014': 'Kansas', '015': 'Kentucky', '016': 'Louisiana', '017': 'Maine', '018': 'Maryland', '019': 'Massachusetts', '020': 'Michigan', '021': 'Minnesota', '022': 'Mississippi', '023': 'Missouri', '024': 'Montana', '025': 'Nebraska', '026': 'Nevada', '027': 'New Hampshire', '028': 'New Jersey', '029': 'New Mexico', '030': 'New York', '031': 'North Carolina', '032': 'North Dakota', '033': 'Ohio', '034': 'Oklahoma', '035': 'Oregon', '036': 'Pennsylvania', '037': 'Rhode Island', '038': 'South Carolina', '039': 'South Dakota', '040': 'Tennessee', '041': 'Texas', '042': 'Utah', '043': 'Vermont', '044': 'Virginia', '045': 'Washington', '046': 'West Virginia', '047': 'Wisconsin', '048': 'Wyoming', '050': 'Alaska', '101': 'Northeast Region', '102': 'East Nort

Add column *state* to the dataset using the dictionary

In [102]:
# Translate every state_code into its name with a vectorized dictionary lookup.
# Series.map leaves NaN wherever the code is not a key of code_to_state, which is
# what the previous row-by-row apply/lambda produced, without calling a Python
# function for each row (and it also works when a dataset has no rows).
for temperature_dataset in (max_dataset, min_dataset, avg_dataset):
  temperature_dataset['state'] = temperature_dataset['state_code'].map(code_to_state)

Drop the columns that are no longer needed (*Code* and *state_code*)

In [103]:
# 'Code' has already been split into state_code and year, and state_code has
# already been translated into state: neither is needed for the export.
for temperature_dataset in (max_dataset, min_dataset, avg_dataset):
  temperature_dataset.drop(columns = ['Code', 'state_code'], inplace = True)

## Export the .csv files

In [104]:
def create_csv(csv_name, dataset):

  '''
  Write a temperature dataset to a .csv file, one row per row of the dataset.

  csv_name = output filename
  dataset = temperature dataset from which data has to be read row by row; it must
            contain the columns 'state', 'year', 'Jan'..'Dec' (temperatures in C)
            and 'JanF'..'DecF' (temperatures in F), in any order

  Every output row holds: state, year (written as an integer), the 12 monthly
  temperatures in C, the 12 monthly temperatures in F.

  Raises KeyError if one of the expected columns is missing from the dataset.
  '''

  months_C = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
  months_F = [month + 'F' for month in months_C]
  header = ['state', 'year'] + months_C + months_F

  # Pick the columns by name (not by position) so that the output stays correct even
  # if the columns of the dataset are reordered or extra columns are present.
  # Done before opening the file so that a missing column does not leave a stub file behind.
  ordered = dataset[header]

  with open(csv_name, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    # itertuples yields plain tuples already laid out like the header
    for state, year, *temperatures in ordered.itertuples(index=False, name=None):
      writer.writerow([state, int(year)] + temperatures)

In [105]:
# The first 49 entries of the dictionary: codes 001-048 plus 050 (Alaska)
state_list = list(code_to_state.values())[:49]

# Create, for each state, the .csv files with the MAX temperatures first,
# then the ones with the MIN temperatures and finally the ones with the AVG temperatures
exports = [('MAX.csv', max_dataset), ('MIN.csv', min_dataset), ('AVG.csv', avg_dataset)]
for suffix, temperatures in exports:
  for state in state_list:
    # file name = state name without spaces + suffix, e.g. "New York" -> "NewYorkMAX.csv"
    output_name = state.replace(" ", "") + suffix
    state_rows = temperatures[temperatures['state'] == state]
    create_csv(output_name, state_rows)

# Load tree_dataset

Uncomment the following line of code to load the already pre-processed *tree_dataset* from Google Drive.

In [None]:
# tree_dataset = pd.read_csv("/content/drive/MyDrive/tree_dataset/final_dataset/tree_dataset.csv", low_memory=False)

# Download ALL .csv files

In [106]:
directory_path = '/content'
file_type = ".csv"
zip_filename = "datasets.zip"

# Every file of the given type located directly inside directory_path
files_to_download = glob.glob(directory_path + "/*" + file_type)

# Store the files in the archive under their bare file names (no folder prefix)
with zipfile.ZipFile(zip_filename, mode="w") as archive:
    for csv_path in files_to_download:
        archive.write(csv_path, arcname=os.path.basename(csv_path))

# Hand the archive over to the browser
files.download(zip_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>