## Cleaning the data

In [None]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [5]:
# Load the raw CSV; the relative path is resolved against the current working directory.
raw_csv_path = r'../data/raw/btcusdt_1d.csv'
df = pd.read_csv(raw_csv_path)
# Show the first rows as a quick check that the columns were parsed as expected.
df.head()

Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_asset_volume,num_trades,taker_base_volume,taker_quote_volume,ignore
0,1677196800000,23940.2,24132.35,22841.19,23185.29,343582.57453,1677283199999,8087524000.0,11531424,170263.13353,4008150000.0,0
1,1677283200000,23184.04,23219.13,22722.0,23157.07,191311.8101,1677369599999,4406286000.0,9124568,94440.13964,2175284000.0,0
2,1677369600000,23157.07,23689.99,23059.18,23554.85,202323.73623,1677455999999,4716158000.0,9506015,101003.39278,2354572000.0,0
3,1677456000000,23554.85,23897.99,23106.77,23492.09,283706.0859,1677542399999,6659786000.0,11754195,141360.04845,3318354000.0,0
4,1677542400000,23492.09,23600.0,23020.97,23141.57,264140.99894,1677628799999,6172931000.0,9568743,131200.82704,3066250000.0,0


In [6]:
# Print the column names, non-null counts, dtypes and memory usage of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   open_time           1000 non-null   int64  
 1   open                1000 non-null   float64
 2   high                1000 non-null   float64
 3   low                 1000 non-null   float64
 4   close               1000 non-null   float64
 5   volume              1000 non-null   float64
 6   close_time          1000 non-null   int64  
 7   quote_asset_volume  1000 non-null   float64
 8   num_trades          1000 non-null   int64  
 9   taker_base_volume   1000 non-null   float64
 10  taker_quote_volume  1000 non-null   float64
 11  ignore              1000 non-null   int64  
dtypes: float64(8), int64(4)
memory usage: 93.9 KB


#### Preprocessing
##### Converting millisecond timestamps to datetime

In [10]:
# Both timestamp columns are parsed with unit='ms' (integer milliseconds)
# and replaced by their datetime equivalents.
for ts_col in ('open_time', 'close_time'):
    df[ts_col] = pd.to_datetime(df[ts_col], unit='ms')
# Re-inspect the frame to confirm the two columns now carry a datetime dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   open_time           1000 non-null   datetime64[ns]
 1   open                1000 non-null   float64       
 2   high                1000 non-null   float64       
 3   low                 1000 non-null   float64       
 4   close               1000 non-null   float64       
 5   volume              1000 non-null   float64       
 6   close_time          1000 non-null   datetime64[ns]
 7   quote_asset_volume  1000 non-null   float64       
 8   num_trades          1000 non-null   int64         
 9   taker_base_volume   1000 non-null   float64       
 10  taker_quote_volume  1000 non-null   float64       
 11  ignore              1000 non-null   int64         
dtypes: datetime64[ns](2), float64(8), int64(2)
memory usage: 93.9 KB


##### Converting price and volume columns to numeric

In [12]:
# Columns that are cast to float.
numeric_cols = ["open", "high", "low", "close", "volume"]
# One astype call with a column -> dtype mapping casts every listed column.
df = df.astype(dict.fromkeys(numeric_cols, float))
# Re-inspect the dtypes after the cast.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   open_time           1000 non-null   datetime64[ns]
 1   open                1000 non-null   float64       
 2   high                1000 non-null   float64       
 3   low                 1000 non-null   float64       
 4   close               1000 non-null   float64       
 5   volume              1000 non-null   float64       
 6   close_time          1000 non-null   datetime64[ns]
 7   quote_asset_volume  1000 non-null   float64       
 8   num_trades          1000 non-null   int64         
 9   taker_base_volume   1000 non-null   float64       
 10  taker_quote_volume  1000 non-null   float64       
 11  ignore              1000 non-null   int64         
dtypes: datetime64[ns](2), float64(8), int64(2)
memory usage: 93.9 KB


##### Dropping the unused `ignore` column

In [13]:
# Remove the `ignore` column. `columns=` targets the column axis by itself,
# so no `axis` argument is needed.
df = df.drop(columns='ignore')
# Re-inspect the frame to confirm the column is gone.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   open_time           1000 non-null   datetime64[ns]
 1   open                1000 non-null   float64       
 2   high                1000 non-null   float64       
 3   low                 1000 non-null   float64       
 4   close               1000 non-null   float64       
 5   volume              1000 non-null   float64       
 6   close_time          1000 non-null   datetime64[ns]
 7   quote_asset_volume  1000 non-null   float64       
 8   num_trades          1000 non-null   int64         
 9   taker_base_volume   1000 non-null   float64       
 10  taker_quote_volume  1000 non-null   float64       
dtypes: datetime64[ns](2), float64(8), int64(1)
memory usage: 86.1 KB


### Saving the cleaned data to the processed folder

In [14]:
# Write the cleaned frame to the processed folder; index=False leaves the
# row index out of the file.
processed_csv_path = '../data/processed/btcusdt_1d.csv'
df.to_csv(processed_csv_path, index=False)

##### Sorting values by open time (currently commented out)

In [None]:
# NOTE(review): this sort is disabled. It also sits after the to_csv export,
# so enabling it here would not change the file already written — confirm
# whether it should run before the export.
# df = df.sort_values("open_time").reset_index(drop=True)

##### Removing duplicate rows (currently commented out)

In [None]:
# NOTE(review): disabled cell. It refers to `data`, while the visible cells
# name the frame `df` — rename before enabling (confirm `data` is not defined
# elsewhere).
# data = data.drop_duplicates(subset=["open_time"])
# data.info()