In [3]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
%matplotlib inline

# Preprocessing steps

- Read the 3 needed input files

- Find the columns of interest

- Keep only useful columns (Subset df)

- Convert Time --> to.datetime() for all 3

-  Set Time as Index

-  Convert all columns to Float (applymap)

-  Verify that everything looks good

-  Merge into one data frame (Merge by Time-Hour)

### Step 1: Read the 3 needed input files



In [163]:
data_dir = 'data/LNG_project'

In [164]:
_files = os.listdir(data_dir)

In [165]:
_files # Keep Only these 3 files in your data/LNG_project directory. Or adjust the path the read it.

['CC1_1_year_dataset_2017-2018.csv',
 'CC2_1_year_dataset_2017-2018.csv',
 'GT_1_year_updated.csv',
 'other']

In [166]:
csvfiles = [f for f in _files if f.endswith('csv')]
csvfiles

['CC1_1_year_dataset_2017-2018.csv',
 'CC2_1_year_dataset_2017-2018.csv',
 'GT_1_year_updated.csv']

In [167]:
assert(len(csvfiles) == 3) # Make sure you don't have extra files

In [168]:
dfs = [None] * 3 # empty list of data frames
for idx, f in enumerate(csvfiles):
    dfs[idx] = pd.read_csv(os.path.join(data_dir, f))

In [169]:
cc1 = dfs[0] #cc1 is a data frame. I am renaming dfs[0] to be cc1
cc2 = dfs[1]
gt = dfs[2] #data frame

In [170]:
for d in dfs:
    print(d.shape)

(9624, 30)
(9624, 26)
(9624, 25)


# Convert to_datetime

In [130]:
gt['Time'] = pd.to_datetime(gt['Time'])
cc1['Time'] = pd.to_datetime(cc1['Time'])
cc2['Time'] = pd.to_datetime(cc2['Time'])

# Set Time as the Index

In [None]:
for d in three_df_list:
    d.set_index("Time", inplace=True)

In [None]:
gt = gt.applymap(float)
cc1 = cc1.applymap(float)
cc2 = cc2.applymap(float)

In [175]:
verify_df_is_okay(cc2)

Index is  int64
Column dtypes are  [dtype('O') dtype('O') dtype('O') dtype('O') dtype('O') dtype('O')
 dtype('O') dtype('O') dtype('O') dtype('O') dtype('O') dtype('O')
 dtype('O') dtype('O') dtype('O') dtype('O') dtype('O') dtype('O')
 dtype('O') dtype('O') dtype('O') dtype('O') dtype('O') dtype('O')
 dtype('O')]


In [137]:
gt.columns = ["gt_"+x for x in gt.columns]
cc1.columns = ["cc1_"+x for x in cc1.columns]
cc2.columns = ["cc2_"+x for x in cc2.columns]

In [138]:
gt  =  gt.ffill()
cc1 = cc1.ffill()
cc2 = cc2.ffill()


# `pd.merge`

## How to Merge
In our case, we are going to merge by Index. In this case, the `Index` is `datetime`. So Pandas will automatically merge by the Dates (Hours).

In [None]:
merged_df = pd.merge(gt, cc1, left_index=True, right_index=True) #we are telling it to merge using the indices.

### We have already added 2 data frames. So we need to add the third Data Frame.
We take the partially merged and add the 3rd one to it.

In [None]:
merged_df = pd.merge(merged_df, cc2, left_index=True, right_index=True) # now add cc2 to it

In [141]:
merged_df.shape

(9318, 29)

In [142]:
merged_df

Unnamed: 0_level_0,gt_Power_by_torquemeter,gt_Flame_Detector_A_Sensor_Input,gt_Flame_Detector_B_Sensor_Input,gt_Gas_Generator_H_P_Shaft_Speed,gt_Power_Turbine_L_P_Shaft_Speed,gt_Compressor_Discharge_Pressure,gt_Gas_Generator_Inlet_Temperature,gt_Compressor_Discharge_Temp_Average_Temp,gt_Gas_Generator_Exhaust_Temperature_Average,gt_Power_Turbine_Exhaust_Temperature,...,cc1_Pressure_1st_side_stream,cc1_Compressor_Inlet_Temperature_from_Suction_Drum_In,cc1_Discharge_temperature_Out,cc2_Antisurge_Valve_Opening_feedback_(%_closed),cc2_Initial_Massflow,cc2_Phase_Mech_Total,cc2_Suction_Pressure_IN,cc2_Discharge_pressure_OUT,cc2_Suction_temperature_IN,cc2_Discharge_temperature_OUT
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-01 00:00:00,34785.2422,-1.0,-1.0,9709.0859,6235.9116,23.7011,9.2327,491.7296,859.9678,525.8873,...,5.9215,,62.3687,0.0733,147.6530,9477.1982,0.1801,2.6471,-35.0042,11.7631
2017-01-01 01:00:00,34626.6406,-1.0,-1.0,9717.2305,6138.4844,23.6070,9.7178,491.8691,860.2543,526.9515,...,6.1165,,61.7827,0.0590,153.9981,9419.2334,0.2565,2.8083,-32.5472,12.2711
2017-01-01 02:00:00,34333.5078,-1.0,-1.0,9706.3125,6257.2759,23.4806,9.4469,489.4511,856.4915,523.0018,...,5.8099,,62.6016,0.0488,143.4317,9065.8232,0.1433,2.5495,-34.1481,11.8413
2017-01-01 03:00:00,34576.5234,-1.0,-1.0,9720.9434,6205.9551,23.5823,9.8513,492.1646,860.3738,526.6929,...,5.9690,,62.2654,0.0488,149.5239,9319.6396,0.1954,2.6803,-33.6557,11.8901
2017-01-01 04:00:00,34696.4336,-1.0,-1.0,9715.7246,6195.8408,23.6370,9.4243,491.9442,860.9113,526.4370,...,5.9946,,62.2222,0.0488,150.0141,9451.0010,0.2055,2.7022,-34.0855,11.9194
2017-01-01 05:00:00,34808.6758,-1.0,-1.0,9710.3672,6216.1055,23.6633,9.0541,491.2419,859.9618,525.7847,...,5.9690,,62.4176,0.0488,148.8775,9288.8691,0.1951,2.6813,-33.4730,12.0855
2017-01-01 06:00:00,34747.4023,-1.0,-1.0,9710.2500,6183.6973,23.6830,9.2195,491.4060,860.7247,526.4595,...,6.0190,,62.0383,0.0488,151.6010,9423.4736,0.2170,2.7350,-33.5026,11.9780
2017-01-01 07:00:00,34620.2656,-1.0,-1.0,9705.8057,6191.8862,23.6310,9.2883,490.9595,859.1617,525.5623,...,5.9959,,62.4493,0.0102,149.5860,9335.0830,0.2020,2.7044,-33.4139,12.1734
2017-01-01 08:00:00,34952.3555,-1.0,-1.0,9711.5166,6213.1533,23.7649,8.8154,491.2138,860.1328,525.6540,...,6.0100,,62.4664,0.0000,149.6668,9348.4844,0.1962,2.7009,-33.5214,12.1245
2017-01-01 09:00:00,34846.9336,-1.0,-1.0,9702.2520,6208.0332,23.7274,8.7558,490.4459,859.0355,525.2363,...,6.0434,,62.7652,0.0000,146.8040,9234.8936,0.1858,2.6885,-34.0049,11.9683


In [158]:
merged_df.to_csv("all3.csv") # Enjoy!

## Now you can trying to building a Linear Model.

