# Notebook demonstrating how to load data from a zip file

In [1]:
import pandas as pd
import zipfile

In [2]:
# Zip archive containing the sample CSV files; the relative path is resolved
# against the kernel's current working directory.
datafile = '../data/raw/synthetic_basic.zip'

In [3]:
# Open the archive read-only. The handle is kept open on purpose: later cells
# read individual members through it.
zf = zipfile.ZipFile(datafile, mode='r')

In [4]:
# Collect the names of the CSV members in the archive. namelist() is the
# documented accessor for member names (instead of the `filelist` attribute).
file_list = [name for name in zf.namelist() if name.endswith('.csv')]
print(file_list[:7])  # show only the first few entries

['synthetic_basic/synthetic_basic_6.csv', 'synthetic_basic/synthetic_basic_44.csv', 'synthetic_basic/synthetic_basic_10.csv', 'synthetic_basic/synthetic_basic_29.csv', 'synthetic_basic/synthetic_basic_40.csv', 'synthetic_basic/synthetic_basic_43.csv', 'synthetic_basic/synthetic_basic_49.csv']


In [5]:
# Load the first two CSV members into a list of dataframes (widen the slice
# to load more samples).
# NOTE(review): the first CSV column is unnamed and holds timestamp strings
# (it shows up as "Unnamed: 0" with dtype object); consider
# index_col=0 / parse_dates if a datetime index is wanted -- confirm first.
dfs = []
for f in file_list[0:2]:
    print(f)
    # Close each member's file handle as soon as it has been parsed instead
    # of leaving it open for the lifetime of the kernel.
    with zf.open(f) as member:
        df = pd.read_csv(member)
    dfs.append(df)

synthetic_basic/synthetic_basic_6.csv
synthetic_basic/synthetic_basic_44.csv


In [6]:
# Preview the first rows of the first loaded sample.
dfs[0].head()

Unnamed: 0.1,Unnamed: 0,Power,POA,Tamb,Wind,Degradation_rate_per_yr,soiling
0,2015-01-01 00:00:00-05:00,-1.0,0.0,5.1,0.0,-0.00999,1
1,2015-01-01 00:01:00-05:00,-1.0,0.0,5.1,0.0,-0.00999,1
2,2015-01-01 00:02:00-05:00,-1.0,0.0,5.1,0.0,-0.00999,1
3,2015-01-01 00:03:00-05:00,-1.0,0.0,5.1,0.0,-0.00999,1
4,2015-01-01 00:04:00-05:00,-1.0,0.0,5.1,0.0,-0.00999,1


In [7]:
# Column dtypes, row count and memory footprint of the first sample.
dfs[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2629440 entries, 0 to 2629439
Data columns (total 7 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   Unnamed: 0               object 
 1   Power                    float64
 2   POA                      float64
 3   Tamb                     float64
 4   Wind                     float64
 5   Degradation_rate_per_yr  float64
 6   soiling                  int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 140.4+ MB


In [8]:
# Same summary for the second sample, to compare its structure with the first.
dfs[1].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2629440 entries, 0 to 2629439
Data columns (total 7 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   Unnamed: 0               object 
 1   Power                    float64
 2   POA                      float64
 3   Tamb                     float64
 4   Wind                     float64
 5   Degradation_rate_per_yr  float64
 6   soiling                  int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 140.4+ MB


## Merging several samples into one dataframe

In [9]:
# Tag every sample with an integer id equal to its position in `dfs`, so rows
# can still be attributed to their source sample after concatenation.
# Note: this adds the column to the existing dataframes in place.
for sample_id, sample in enumerate(dfs):
    sample["id"] = sample_id

dfs[1].head()

Unnamed: 0.1,Unnamed: 0,Power,POA,Tamb,Wind,Degradation_rate_per_yr,soiling,id
0,2015-01-01 00:00:00-05:00,-1.0,0.0,5.1,0.0,-0.01521,1,1
1,2015-01-01 00:01:00-05:00,-1.0,0.0,5.1,0.0,-0.01521,1,1
2,2015-01-01 00:02:00-05:00,-1.0,0.0,5.1,0.0,-0.01521,1,1
3,2015-01-01 00:03:00-05:00,-1.0,0.0,5.1,0.0,-0.01521,1,1
4,2015-01-01 00:04:00-05:00,-1.0,0.0,5.1,0.0,-0.01521,1,1


In [10]:
# Stack all samples row-wise into a single dataframe.
# NOTE(review): concat is called without ignore_index, so each sample keeps
# its own row labels and the labels repeat across samples; pass
# ignore_index=True if a unique index is needed.
df_all = pd.concat(dfs)
df_all.head()

Unnamed: 0.1,Unnamed: 0,Power,POA,Tamb,Wind,Degradation_rate_per_yr,soiling,id
0,2015-01-01 00:00:00-05:00,-1.0,0.0,5.1,0.0,-0.00999,1,0
1,2015-01-01 00:01:00-05:00,-1.0,0.0,5.1,0.0,-0.00999,1,0
2,2015-01-01 00:02:00-05:00,-1.0,0.0,5.1,0.0,-0.00999,1,0
3,2015-01-01 00:03:00-05:00,-1.0,0.0,5.1,0.0,-0.00999,1,0
4,2015-01-01 00:04:00-05:00,-1.0,0.0,5.1,0.0,-0.00999,1,0


In [11]:
# Summary of the merged dataframe (it now includes the `id` column).
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5258880 entries, 0 to 2629439
Data columns (total 8 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   Unnamed: 0               object 
 1   Power                    float64
 2   POA                      float64
 3   Tamb                     float64
 4   Wind                     float64
 5   Degradation_rate_per_yr  float64
 6   soiling                  int64  
 7   id                       int64  
dtypes: float64(5), int64(2), object(1)
memory usage: 361.1+ MB
