# **Files processing**

In [None]:
"""
This file contains the code to organize, edit and process the original files. Both the raw data (obtained from the SOHO web page) and the processed data (from the ESA web page) come as .txt files and are converted to .parquet for better performance when analysing the dataset.
"""

## **Importing files and libraries**

### Libraries

In [2]:
# For Data
import os
import pandas as pd
import numpy as np
from datetime import datetime

### Files

In [3]:
# Working-directory setup

# Run this cell only once per kernel session: os.chdir('../..') is relative to
# the current directory, so every additional run climbs two more levels up.

print("Actual Directory:", os.getcwd())

# Move two levels up, to the main folder of the repository
os.chdir('../..')

print("New Directory:", os.getcwd())
# Expected directory: "\GitHub\Energy-and-linear-momentum-change-exerted-by-the-solar-wind-for-applications-in-electric-solar-sails"

Actual Directory: d:\lenovo\Documents\GitHub\Energy-and-linear-momentum-change-exerted-by-the-solar-wind-for-applications-in-electric-solar-sails\Codes\Past
New Directory: d:\lenovo\Documents\GitHub\Energy-and-linear-momentum-change-exerted-by-the-solar-wind-for-applications-in-electric-solar-sails


In [4]:
# Helper that simplifies building the path of a data file
def dir(r_or_p, year):
    """Return the relative path of the text file for a data kind and a year.

    Note: the name shadows the built-in ``dir``; it is kept because the
    loading cells call the helper under this name.

    Parameters
    ----------
    r_or_p : str
        ``"Raw"`` or ``"Calibrated"``. Surrounding whitespace and letter case
        are ignored.
    year : int or str
        Year interpolated into the file name.

    Returns
    -------
    str
        ``"Data/Raw/{year} Raw data.txt"`` for raw data, otherwise
        ``"Data/Processing/{year} final.txt"``.

    Raises
    ------
    NameError
        If ``r_or_p`` is neither "Raw" nor "Calibrated".
    """
    kinds = ("Raw", "Calibrated")
    kind = r_or_p.strip().capitalize()
    if kind not in kinds:  # Validate input
        # The message used to advertise "Processed", a value this check
        # never accepted; it now names the options that really work.
        raise NameError("Only Raw or Calibrated")

    if kind == kinds[0]:
        return f"Data/Raw/{year} Raw data.txt"
    return f"Data/Processing/{year} final.txt"

In [5]:
# Years covered by the analysis
i_year = 1998      # first year of the grid
f_year = 2023      # last year of the grid
step_year = 5      # spacing between analysed years
other_year = 2000  # extra year; only used if the commented tail below is enabled

# The stop value is pushed one step past f_year so that f_year is included
# when it falls on the grid.
years = sorted(range(i_year, f_year + step_year, step_year))  # + [other_year]

years

[1998, 2003, 2008, 2013, 2018, 2023]

#### Raw data

In [6]:
# Column names assigned to every yearly table (they replace the header that read_csv infers)
head_row = ["YY","MON", "DY","DOY:HH:MM:SS","SPEED","Np", "Vth","N/S","V_He", "GSE_X","GSE_Y","GSE_Z","RANGE","HGLAT","HGLONG","CRN(E)"]

#skip = [ i for i in range(0, 26, 1)]

# Maps each year to the DataFrame read from its raw text file
Raw_data = {}
for year in years:
  # Raw string: "\s" is not a valid escape in a normal Python string and
  # triggers an invalid-escape warning; r"\s+" passes the same regex to pandas.
  # NOTE(review): with the default header handling, the first line of each file
  # is consumed as a header before being renamed - confirm it is not a data row.
  Raw_data[year] = pd.read_csv(dir("Raw",year), sep=r"\s+")#, header=[1]), skiprows=skip
  Raw_data[year].columns = head_row


Raw_data

{1998:         YY  MON  DY  DOY:HH:MM:SS  SPEED     Np  Vth  N/S  V_He  GSE_X  GSE_Y  \
 0       98  Jan   1  001:00:00:30    361  10.08   32 -0.5   370  232.1  -93.4   
 1       98  Jan   1  001:00:01:00    358  10.21   31  0.0   366  232.1  -93.4   
 2       98  Jan   1  001:00:01:30    362  10.08   32 -0.0   371  232.1  -93.4   
 3       98  Jan   1  001:00:02:00    360  10.09   32 -0.3   369  232.1  -93.4   
 4       98  Jan   1  001:00:02:30    358  10.71   30 -0.7   366  232.1  -93.4   
 ...     ..  ...  ..           ...    ...    ...  ...  ...   ...    ...    ...   
 619716  98  Dec  21  355:17:27:16    318  13.51   23  5.3   320  233.8  -91.2   
 619717  98  Dec  21  355:17:27:46    317  13.47   23  5.1   320  233.8  -91.2   
 619718  98  Dec  21  355:17:28:16    319  13.08   23  5.1   322  233.8  -91.2   
 619719  98  Dec  21  355:17:28:46    319  11.60   23  5.0   322  233.8  -91.2   
 619720  98  Dec  21  355:17:29:16    324  10.40   23  4.5   327  233.8  -91.2   
 
        

#### Processed/ Calibrated data

In [7]:
# Column names assigned to every yearly table (they replace the header that read_csv infers)
head_row = ["YY","MON", "DY","DOY:HH:MM:SS","SPEED","Np", "Vth","N/S","V_He", "GSE_X","GSE_Y","GSE_Z","RANGE","HGLAT","HGLONG","CRN(E)"]

# Maps each year to the DataFrame read from its calibrated text file
Calibrated_data = {}
for year in years:
  # Raw string: "\s" is not a valid escape in a normal Python string and
  # triggers an invalid-escape warning; r"\s+" passes the same regex to pandas.
  # NOTE(review): with the default header handling, the first line of each file
  # is consumed as a header before being renamed - confirm it is not a data row.
  Calibrated_data[year] = pd.read_csv(dir("Calibrated",year), sep=r"\s+")
  Calibrated_data[year].columns = head_row


Calibrated_data

{1998:        YY  MON  DY  DOY:HH:MM:SS  SPEED     Np  Vth  N/S  V_He  GSE_X  GSE_Y  \
 0      98  Jan   1  001:00:05:02    371  10.08   35 -0.8   383  232.0  -93.4   
 1      98  Jan   1  001:00:10:03    371   9.80   35 -0.9   383  232.0  -93.4   
 2      98  Jan   1  001:00:15:05    370   9.00   36  0.3   383  232.0  -93.4   
 3      98  Jan   1  001:00:20:11    372   7.25   38  1.5   387  232.0  -93.4   
 4      98  Jan   1  001:00:25:10    370   8.36   37  0.3   384  232.0  -93.4   
 ...    ..  ...  ..           ...    ...    ...  ...  ...   ...    ...    ...   
 62155  98  Dec  21  355:17:07:38    336  15.01   24  5.6   338  233.8  -91.2   
 62156  98  Dec  21  355:17:12:39    332  14.50   23  3.8   335  233.8  -91.2   
 62157  98  Dec  21  355:17:17:41    334  13.97   24  3.9   337  233.8  -91.2   
 62158  98  Dec  21  355:17:22:43    334  14.58   23  4.7   337  233.8  -91.2   
 62159  98  Dec  21  355:17:27:46    333  13.76   23  4.9   335  233.8  -91.2   
 
        GSE_Z  RANGE

## **Data Analysis**

In [8]:
#### Time
def sec(yy, doy_hh_mm_ss, initial): #Function to obtain the total seconds
    """Return the seconds elapsed between 1998-01-01 00:00:00 and a timestamp.

    Parameters
    ----------
    yy : int or str
        Year without century, parsed with ``%y``. One-digit values are
        zero-padded to two digits before parsing.
    doy_hh_mm_ss : str
        Day of year and time of day in the form ``"DDD:HH:MM:SS"``.
    initial :
        Not used by the computation; kept so existing calls keep working.

    Returns
    -------
    float
        Seconds since 1998-01-01 00:00:00.

    Raises
    ------
    ValueError
        If the combined text does not match ``'%y:%j:%H:%M:%S'``.
    """
    # %y needs two digits. The previous version padded only the years "0", "3"
    # and "8", so any other one-digit year (1, 2, 4, ...) failed to parse.
    year_txt = str(yy).zfill(2)

    # Convert string to datetime format
    date = datetime.strptime(year_txt + ":" + str(doy_hh_mm_ss), '%y:%j:%H:%M:%S')

    # Difference from the 1900-01-01 reference datetime
    elapsed = date - datetime(1900, 1, 1)

    # 3092601600 s = 35794 days = 98 years + 24 leap days, i.e. the span from
    # 1900-01-01 to 1998-01-01; subtracting it moves the origin to 1998-01-01.
    seconds = elapsed.total_seconds() - 3092601600

    return seconds #Return float of the total seconds

def Seconds_data(data, initial):
  """Add a "Seconds" column, computed row by row with ``sec``, to ``data`` in place.

  Parameters
  ----------
  data : pandas.DataFrame
      Table with the columns "YY" and "DOY:HH:MM:SS"; it is modified in place.
  initial :
      Forwarded unchanged to ``sec`` for every row.

  Returns
  -------
  None
  """
  stamps = data['DOY:HH:MM:SS']
  yrs = data["YY"]
  # Pair the two columns by position. The previous `year[i]` / `time_sec[i]`
  # lookups went through index labels and only worked with a default 0..n-1 index.
  seconds_time = [sec(yy, stamp, initial) for yy, stamp in zip(yrs, stamps)]
  if "Seconds" in data.columns:
    # Re-running on an already processed table refreshes the column instead of
    # failing because insert() refuses to add a duplicate column.
    data["Seconds"] = seconds_time
  else:
    data.insert(4, "Seconds", seconds_time) # Insert total seconds as the fifth column

In [9]:
# Values passed to Seconds_data as its `initial` argument; both stay at 0
# because the accumulation lines at the end of the loop are commented out.
Secondszeroraw = 0
Secondszerocalibrated = 0
for year in years:
  # Each call adds the "Seconds" column to the given yearly table in place
  Seconds_data(Raw_data[year], Secondszeroraw)
  Seconds_data(Calibrated_data[year], Secondszerocalibrated)
  # Disabled: would add the last "Seconds" value of the year to the running offset
  #Secondszeroraw += Raw_data[year]["Seconds"][len(Raw_data[year])-1]
  #Secondszerocalibrated += Calibrated_data[year]["Seconds"][len(Calibrated_data[year])-1]

In [10]:
"""
# Concat all data (WARNING: High spend of RAM memory)

Raw = []
Raw = pd.concat(Raw_data, axis = 0)
Raw

# Concat all data (WARNING: High spend of RAM memory)

Calibrated = []
Calibrated = pd.concat(Calibrated_data, axis = 0)
Calibrated

"""



## **Create Files**

In [11]:
# Create folders
# The paths are built with os.path.join: the previous 'Data\Raw_converter'
# literals depended on "\R" and "\P" being tolerated as invalid escapes and
# only name a sub-folder of Data on Windows.
path_raw = os.path.join('Data', 'Raw_converter')
path_calibrated = os.path.join('Data', 'Processing_converter')

paths = [path_raw, path_calibrated]

for path in paths:
    if not os.path.exists(path):
        # makedirs also creates the parent folder if it is missing
        os.makedirs(path)
        print("Directorio %s creado!" % path)
    else:
        print("Directorio %s ya existe" % path)

Directorio Data\Raw_converter ya existe
Directorio Data\Processing_converter ya existe


In [12]:
# Create .parquet files
# One gzip-compressed file per year and data kind. The target paths are built
# with os.path.join because the previous f'Data\...\{year}.parquet' literals
# contained invalid escapes ("\P", "\R", "\{") and Windows-only separators.
for year in years:
  # CSV alternative, kept for reference:
  #Calibrated_data[year].to_csv(os.path.join('Data', 'Processing_converter', f'{year}.csv'))
  #Raw_data[year].to_csv(os.path.join('Data', 'Raw_converter', f'{year}.csv'))
  Calibrated_data[year].to_parquet(os.path.join('Data', 'Processing_converter', f'{year}.parquet'), compression='gzip')
  Raw_data[year].to_parquet(os.path.join('Data', 'Raw_converter', f'{year}.parquet'), compression='gzip')