<a href="https://colab.research.google.com/github/SMRayeed/Paleo-Data-Classification/blob/main/01_Data_Collection_PIPO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import pandas as pd
from io import StringIO

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
folder_path = '/content/drive/MyDrive/PIPO/raw/'
files = os.listdir(folder_path)

In [4]:
drive_path_txt = '/content/drive/MyDrive/PIPO/processed/Files(.txt)/'
drive_path_csv = '/content/drive/MyDrive/PIPO/processed/Files(.csv)/'
os.makedirs(drive_path_txt) if not os.path.exists(drive_path_txt) else None
os.makedirs(drive_path_csv) if not os.path.exists(drive_path_csv) else None

In [5]:
min_info, avg_info, max_info, len_info, early, late = [], [], [], [], [], []

In [6]:
ix = 1

for file in files :

  print(" ====== S T A R T I N G ====== : ", file)

  file_path = folder_path + file

  with open(file_path, 'r') as file : text_data = file.read()

  site_name = text_data.split('Site_Name: ')[1].split('\n')[0].strip()
  location = text_data.split('Location: ')[1].split('\n')[0].strip()
  northernmost_latitude = float(text_data.split('Northernmost_Latitude: ')[1].split('\n')[0].strip())
  southernmost_latitude = float(text_data.split('Southernmost_Latitude: ')[1].split('\n')[0].strip())
  easternmost_longitude = float(text_data.split('Easternmost_Longitude: ')[1].split('\n')[0].strip())
  westernmost_longitude = float(text_data.split('Westernmost_Longitude: ')[1].split('\n')[0].strip())

  collection_name = text_data.split('Collection_Name: ')[1].split('\n')[0].strip()
  first_year = int((text_data.split('First_Year: ')[1].split('\n')[0].strip()) or (text_data.split('Earliest_Year: ')[1].split('\n')[0].strip()))
  last_year = int((text_data.split('Last_Year: ')[1].split('\n')[0].strip()) or (text_data.split('Most_Recent_Year: ')[1].split('\n')[0].strip()))

  early.append(first_year)
  late.append(last_year)
  species_name = text_data.split('Species_Name: ')[1].split('\n')[0].strip()
  common_name = text_data.split('Common_Name: ')[1].split('\n')[0].strip()
  tree_species_code = text_data.split('Tree_Species_Code: ')[1].split('\n')[0].strip()

  output_txt_file_name = f"{collection_name}.txt"
  output_txt_file_path = drive_path_txt + output_txt_file_name

  with open(output_txt_file_path, 'w') as output_file:
    output_file.write("Site Information\n")
    output_file.write("================\n")
    output_file.write(f"Site Name: {site_name}\n")
    output_file.write(f"Location: {location}\n")
    output_file.write(f"Northernmost Latitude: {northernmost_latitude}\n")
    output_file.write(f"Southernmost Latitude: {southernmost_latitude}\n")
    output_file.write(f"Easternmost Longitude: {easternmost_longitude}\n")
    output_file.write(f"Westernmost Longitude: {westernmost_longitude}\n")

    output_file.write("Data Collection\n")
    output_file.write("===============\n")
    output_file.write(f"Collection Name: {collection_name}\n")
    output_file.write(f"First Year: {first_year}\n")
    output_file.write(f"Last Year: {last_year}\n\n")

    output_file.write("Species Information\n")
    output_file.write("===================\n")
    output_file.write(f"Species Name: {species_name}\n")
    output_file.write(f"Common Name: {common_name}\n")
    output_file.write(f"Tree Species Code: {tree_species_code}\n")

  data_start_line = next(i for i, line in enumerate(text_data.strip().split('\n')) if line.startswith('age_CE') and not line.startswith('##'))
  df = pd.read_csv(StringIO(text_data), delim_whitespace=True, skiprows=data_start_line, index_col='age_CE')
  df.index.name = 'trees'
  df = df.transpose()
  df['site'] = site_name
  df['loc'] = location
  df['N'] = northernmost_latitude
  df['S'] = southernmost_latitude
  df['E'] = easternmost_longitude
  df['W'] = easternmost_longitude
  df['species'] = tree_species_code
  df = df.reset_index()

  output_csv_file_name = f"{collection_name}.csv"
  output_csv_file_path = drive_path_csv + output_csv_file_name
  df.to_csv(output_csv_file_path, index=False)

  non_nan_counts = df.iloc[:, :-7].notna().sum(axis=1)
  df_min_tree = non_nan_counts.idxmin()
  df_min = non_nan_counts[df_min_tree]
  df_max_tree = non_nan_counts.idxmax()
  df_max = non_nan_counts[df_max_tree]
  df_avg = non_nan_counts.mean()

  print(f"Min : {df_min} \t---> Tree : {df_min_tree}")
  print(f"Max : {df_max} \t---> Tree : {df_max_tree}")
  print(f"Avg : {df_avg:.4f}")

  min_info.append(df_min)
  avg_info.append(df_avg)
  max_info.append(df_max)
  len_info.append(len(df))

  print(f"====== D O N E ====== : {ix} of {len(files)} Files\n\n")

  ix += 1

Min : 52 	---> Tree : 19
Max : 180 	---> Tree : 0
Avg : 113.5200


Min : 101 	---> Tree : 18
Max : 167 	---> Tree : 25
Avg : 140.7308


Min : 121 	---> Tree : 14
Max : 259 	---> Tree : 2
Avg : 159.3810


Min : 51 	---> Tree : 3
Max : 398 	---> Tree : 58
Avg : 240.1333


Min : 63 	---> Tree : 20
Max : 369 	---> Tree : 10
Avg : 229.3750


Min : 48 	---> Tree : 16
Max : 367 	---> Tree : 30
Avg : 253.1304


Min : 91 	---> Tree : 44
Max : 247 	---> Tree : 40
Avg : 147.2391


Min : 11 	---> Tree : 35
Max : 288 	---> Tree : 0
Avg : 41.9683


Min : 111 	---> Tree : 0
Max : 111 	---> Tree : 0
Avg : 111.0000


Min : 315 	---> Tree : 1
Max : 373 	---> Tree : 3
Avg : 352.8333


Min : 316 	---> Tree : 1
Max : 491 	---> Tree : 2
Avg : 372.1667


Min : 66 	---> Tree : 14
Max : 305 	---> Tree : 7
Avg : 122.9423


Min : 121 	---> Tree : 13
Max : 480 	---> Tree : 3
Avg : 269.9000


Min : 124 	---> Tree : 1
Max : 266 	---> Tree : 0
Avg : 203.9231


Min : 72 	---> Tree : 4
Max : 164 	---> Tree : 0
Avg : 1

In [7]:
min_val = min(min_info)
max_val = max(min_info)
avg_val = sum(min_info) / len(min_info)
print(f"Min information : {min_val}")
print(f"Max information : {max_val}")
print(f"Avg information : {avg_val:.0f}")

Min information : 1
Max information : 316
Avg information : 113


In [8]:
min_early = min(early)
max_late = max(late)

In [9]:
print("Earliest First Date : ", min_early)
print("Latest Last Date    : ", max_late)

Earliest First Date :  532
Latest Last Date    :  2020


In [10]:
print(f"Total Trees : {sum(len_info)}")

Total Trees : 10141
