In [85]:
!pip install ccxt
import ccxt
from ccxt.base.errors import RequestTimeout

import pandas as pd 
import numpy as np 

import datetime
from datetime import timedelta
import time
import os
import glob



# 1. Download Collected P&Ds

In [87]:
binance_pds = pd.read_csv('мanually_collected_data/P&Ds.csv', sep = ';')

In [88]:
binance_pds

Unnamed: 0,Number_of_P&Ds,channel,coin,exchange,pair,timestamp,announced_growth
0,1,💎🚀 Hit Pump Angels 🚀💎,PIVX,binance,BTC,2021-03-28 17:00,"295,36%"
1,2,💎🚀 Hit Pump Angels 🚀💎,PPT,binance,BTC,2021-03-07 17:00,"457,85%"
2,3,💎🚀 Hit Pump Angels 🚀💎,MTH,binance,BTC,2021-03-01 17:00,"51,11%"
3,4,💎🚀 Hit Pump Angels 🚀💎,BRD,binance,BTC,2021-02-24 17:00,"86,36%"
4,5,💎🚀 Hit Pump Angels 🚀💎,RDN,binance,BTC,2021-02-22 17:00,"82,46%"
...,...,...,...,...,...,...,...
71,72,Big Pumps Binance,MDA,binance,BTC,2021-11-23 15:00,57%
72,73,Big Pumps Binance,SNM,binance,BTC,2022-01-27 15:02,17%
73,74,Big Pumps Binance,NAS,binance,BTC,2022-02-01 15:00,30%
74,75,Binance Crypto Pump Signals🚨🚨🚨,EVX,binance,BTC,2021-10-24 17:00,"83,68%"


Finally, there are 76 collected P&Ds. 

# 2. Download All Transaction For Each P&D

* To do so I will use ccxt library: https://github.com/ccxt/ccxt

In [95]:
binance = ccxt.binance()

def to_timestamp(dt):
    return binance.parse8601(dt.isoformat())

In [96]:
def download_binance(only_binance, days_before=2, days_after=2):
    '''
    Download all the transactions for all the pumps in binance in a given interval
    @param days_before: the number of days before the pump
    @param days_after: the number of days after the pump
    '''
    for i, pump in only_binance.iterrows():
        coin = str(pump['coin'])
        coin_pair = str(pump['coin']) + '/' + str(pump['pair'])
        date = pump['timestamp']
        pump_time = datetime.strptime(date, "%Y-%m-%d %H:%M")
        before = to_timestamp(pump_time - timedelta(days=days_before))
        after = to_timestamp(pump_time + timedelta(days=days_after))
        # to comment out
        import os
        if os.path.exists('data/{}_{}'.format(coin, str(date).replace(':', '.') + '.csv')):
            print(coin)
            continue
        #
        df = download(coin_pair, before, after)
        df.to_csv('data/{}_{}'.format(coin, str(date).replace(':', '.') + '.csv'), index=False)


In [97]:
def download(coin_pair, start, end):
    '''
    Download all the transaction for a given symbol from the start date to the end date
    @param symbol: the symbol of the coin for which download the transactions
    @param start: the start date from which download the transaction
    @param end: the end date from which download the transaction
    '''

    records = []
    since = start
    ten_minutes = 60000 * 10

    print('Downloading {} from {} to {}'.format(coin_pair, binance.iso8601(start), binance.iso8601(end)))

    while since < end:
        #print('since: ' + binance.iso8601(since)) #uncomment this line of code for verbose download
        try:
            orders = binance.fetch_trades(coin_pair, since)
        except RequestTimeout:
            time.sleep(5)
            orders = binance.fetch_trades(coin_pair, since)

        if len(orders) > 0:

            latest_ts = orders[-1]['timestamp']
            if since != latest_ts:
                since = latest_ts
            else:
                since += ten_minutes

            for l in orders:
                records.append({
                    'symbol': l['symbol'],
                    'timestamp': l['timestamp'],
                    'datetime': l['datetime'],
                    'side': l['side'],
                    'price': l['price'],
                    'amount': l['amount'],
                    'btc_volume': float(l['price']) * float(l['amount']),
                })
        else:
            since += ten_minutes

    return pd.DataFrame.from_records(records)

In [109]:
#from datetime import datetime   #--  use it to download [complete 2-nd part] and then delete, after that:
import datetime  #--  write it again to complete 3-rd part [that's because of the problems with datetime library]

In [108]:
download_binance(binance_pds, days_before=2, days_after=2)

PIVX
PPT
MTH
BRD
RDN
NXS
PNT
EVX
CTXC
MDA
NEBL
NXS
DLT
MTH
RCN
NAS
VIA
ONG
ARN
GRS
EDO
POA
NXS
RDN
NULS
SKY
FIO
CTXC
IDEX
RCN
ONG
IDEX
GVT
VIB
NEBL
GVT
ONG
VIB
OAX
QSP
QLC
PHB
APPC
NEBL
SNM
NEBL
PPT
VIA
DLT
OAX
WABI
MTH
NAS
BRD
FXS
GVT
BNT
APPC
MDA
NAS
DLT
IDEX
STEEM
WPR
FIO
NEBL
ATM
PIVX
WABI
BRD
NAS
MDA
SNM
NAS
EVX
DREP


# 3. Create DataFrame with Features and all Pumps Together

In [110]:
path = 'data/*.csv'

def std_rush_order_feature(df_buy, time_freq, rolling_freq):
    df_buy = df_buy.groupby(df_buy.index).count()
    df_buy[df_buy == 1] = 0
    df_buy[df_buy > 1] = 1
    buy_volume = df_buy.groupby(pd.Grouper(freq=time_freq))['btc_volume'].sum()
    buy_count = df_buy.groupby(pd.Grouper(freq=time_freq))['btc_volume'].count()
    buy_volume.drop(buy_volume[buy_count == 0].index, inplace=True)
    buy_volume.dropna(inplace=True)
    rolling_diff = buy_volume.rolling(window=rolling_freq).std()
    results = rolling_diff.pct_change()
    return results


def avg_rush_order_feature(df_buy, time_freq, rolling_freq):
    df_buy = df_buy.groupby(df_buy.index).count()
    df_buy[df_buy == 1] = 0
    df_buy[df_buy > 1] = 1
    buy_volume = df_buy.groupby(pd.Grouper(freq=time_freq))['btc_volume'].sum()
    buy_count = df_buy.groupby(pd.Grouper(freq=time_freq))['btc_volume'].count()
    buy_volume.drop(buy_volume[buy_count == 0].index, inplace=True)
    buy_volume.dropna(inplace=True)
    rolling_diff = buy_volume.rolling(window=rolling_freq).mean()
    results = rolling_diff.pct_change()
    return results


def std_trades_feature(df_buy_rolling, rolling_freq):
    buy_volume = df_buy_rolling['price'].count()
    buy_volume.drop(buy_volume[buy_volume == 0].index, inplace=True)
    buy_volume.dropna(inplace=True)
    rolling_diff = buy_volume.rolling(window=rolling_freq).std()
    results = rolling_diff.pct_change()
    return results


def std_volume_feature(df_buy_rolling, rolling_freq):
    buy_volume = df_buy_rolling['btc_volume'].sum()
    buy_volume.drop(buy_volume[buy_volume == 0].index, inplace=True)
    buy_volume.dropna(inplace=True)
    rolling_diff = buy_volume.rolling(window=rolling_freq).std()
    results = rolling_diff.pct_change()
    return results


def avg_volume_feature(df_buy_rolling, rolling_freq):
    buy_volume = df_buy_rolling['btc_volume'].sum()
    buy_volume.drop(buy_volume[buy_volume == 0].index, inplace=True)
    buy_volume.dropna(inplace=True)
    rolling_diff = buy_volume.rolling(window=rolling_freq).mean()
    results = rolling_diff.pct_change()
    return results


def std_price_feature(df_buy_rolling, rolling_freq):
    buy_volume = df_buy_rolling['price'].mean()
    buy_volume.dropna(inplace=True)
    rolling_diff = buy_volume.rolling(window=rolling_freq).std()
    results = rolling_diff.pct_change()
    return results


def avg_price_feature(df_buy_rolling):
    buy_volume = df_buy_rolling['price'].mean()
    buy_volume.dropna(inplace=True)
    rolling_diff = buy_volume.rolling(window=10).mean()
    results = rolling_diff.pct_change()
    return results


def avg_price_max(df_buy_rolling):
    buy_volume = df_buy_rolling['price'].max()
    buy_volume.dropna(inplace=True)
    rolling_diff = buy_volume.rolling(window=10).mean()
    results = rolling_diff.pct_change()
    return results


def chunks_time(df_buy_rolling):
    # compute any kind of aggregation
    buy_volume = df_buy_rolling['price'].max()
    buy_volume.dropna(inplace=True)
    #the index contains time info
    return buy_volume.index

def build_features(file, coin, time_freq, rolling_freq, index):
    df = pd.read_csv(file)
    df["time"] = pd.to_datetime(df['timestamp'].astype(np.int64), unit='ms')
    df = df.reset_index().set_index('time')
    #focus mainly on buy orders
    df_buy = df[df['side'] == 'buy']

    df_buy_grouped = df_buy.groupby(pd.Grouper(freq=time_freq))

    date = chunks_time(df_buy_grouped)

    results_df = pd.DataFrame(
        {'date': date,
         'pump_index': index,
         'std_rush_order': std_rush_order_feature(df_buy, time_freq, rolling_freq).values,
         'avg_rush_order': avg_rush_order_feature(df_buy, time_freq, rolling_freq).values,
         'std_trades': std_trades_feature(df_buy_grouped, rolling_freq).values,
         'std_volume': std_volume_feature(df_buy_grouped, rolling_freq).values,
         'avg_volume': avg_volume_feature(df_buy_grouped, rolling_freq).values,
         'std_price': std_price_feature(df_buy_grouped, rolling_freq).values,
         'avg_price': avg_price_feature(df_buy_grouped),
         'avg_price_max': avg_price_max(df_buy_grouped).values,
         'hour_sin': np.sin(2 * np.pi * date.hour/23),
         'hour_cos': np.cos(2 * np.pi * date.hour/23),
         'minute_sin': np.sin(2 * np.pi * date.minute / 59),
         'minute_cos': np.cos(2 * np.pi * date.minute / 59),
         })

    results_df['symbol'] = coin
    results_df['Was_Pump_or_no'] = 0
    return results_df.dropna()

In [111]:
files = glob.glob(path)
files

['data/WABI_2021-06-20 17.00.csv',
 'data/SNM_2022-01-27 15.02.csv',
 'data/SNM_2021-11-20 17.01.csv',
 'data/NEBL_2022-01-02 17.01.csv',
 'data/CTXC_2021-03-18 17.00.csv',
 'data/MDA_2021-01-03 17.00.csv',
 'data/IDEX_2021-01-17 17.00.csv',
 'data/DLT_2021-01-28 19.00.csv',
 'data/MTH_2021-03-01 17.00.csv',
 'data/ONG_2020-05-26 16.00.csv',
 'data/RDN_2021-02-22 17.00.csv',
 'data/ARN_2020-04-07 18.00.csv',
 'data/EVX_2021-01-09 21.00.csv',
 'data/NXS_2020-03-03 16.00.csv',
 'data/BRD_2021-10-31 15.00.csv',
 'data/IDEX_2020-12-31 17.00.csv',
 'data/VIB_2021-02-05 21.00.csv',
 'data/NULS_2020-01-30 17.00.csv',
 'data/NXS_2021-02-21 17.00.csv',
 'data/NAS_2020-05-30 16.00.csv',
 'data/NAS_2021-11-14 15.00.csv',
 'data/APPC_2020-11-18 18.00.csv',
 'data/FXS_2021-09-19 17.00.csv',
 'data/ONG_2021-01-11 16.00.csv',
 'data/BRD_2021-08-29 17.00.csv',
 'data/NEBL_2021-04-02 21.00.csv',
 'data/NXS_2021-03-03 16.00.csv',
 'data/SKY_2021-02-03 21.00.csv',
 'data/FIO_2021-06-13 17.01.csv',
 'data

In [112]:
def build_features_multi(time_freq, rolling_freq):

    files = glob.glob(path)

    all_results_df = pd.DataFrame()
    count = 0
    pumps = binance_pds

    for f in files:
        print(f)
        coin_date, time = os.path.basename(f[:f.rfind('.')]).split(' ')
        coin, date = coin_date.split('_')

        skip_pump = len(pumps[(pumps['coin'] == coin)]) == 0
        if skip_pump:
            continue

        results_df = build_features(f, coin, time_freq, rolling_freq, count)

        date_datetime = datetime.datetime.strptime(date + ' ' + time, '%Y-%m-%d %H.%M')

        # We consider 48 hours before and 48 hours after the pump, or 2 days in other words
        results_df = results_df[(results_df['date'] >= date_datetime - datetime.timedelta(hours=48)) & (results_df['date'] <= date_datetime + datetime.timedelta(hours=48))]

        all_results_df = pd.concat([all_results_df, results_df])
        count += 1

    all_results_df.to_csv('features/features_nolabaled_{}.csv'.format(time_freq), index=False, float_format='%.3f')


def compute_features():
    build_features_multi(time_freq='15S', rolling_freq=900)

In [113]:
compute_features()

data/WABI_2021-06-20 17.00.csv
data/SNM_2022-01-27 15.02.csv
data/SNM_2021-11-20 17.01.csv
data/NEBL_2022-01-02 17.01.csv
data/CTXC_2021-03-18 17.00.csv
data/MDA_2021-01-03 17.00.csv
data/IDEX_2021-01-17 17.00.csv
data/DLT_2021-01-28 19.00.csv
data/MTH_2021-03-01 17.00.csv
data/ONG_2020-05-26 16.00.csv
data/RDN_2021-02-22 17.00.csv
data/ARN_2020-04-07 18.00.csv
data/EVX_2021-01-09 21.00.csv
data/NXS_2020-03-03 16.00.csv
data/BRD_2021-10-31 15.00.csv
data/IDEX_2020-12-31 17.00.csv
data/VIB_2021-02-05 21.00.csv
data/NULS_2020-01-30 17.00.csv
data/NXS_2021-02-21 17.00.csv
data/NAS_2020-05-30 16.00.csv
data/NAS_2021-11-14 15.00.csv
data/APPC_2020-11-18 18.00.csv
data/FXS_2021-09-19 17.00.csv
data/ONG_2021-01-11 16.00.csv
data/BRD_2021-08-29 17.00.csv
data/NEBL_2021-04-02 21.00.csv
data/NXS_2021-03-03 16.00.csv
data/SKY_2021-02-03 21.00.csv
data/FIO_2021-06-13 17.01.csv
data/APPC_2021-12-19 15.00.csv
data/NEBL_2021-08-24 15.00.csv
data/PIVX_2021-09-18 15.00.csv
data/FIO_2021-02-04 17.00.csv

## Thus, I created dataset with 15S chunks and . However, it is not laballed yet with target variable

* During the analysis, I found that in some cases, pump and dumps started before or after the organizer shared the signal. To account for this discrepancy, I manually flag the chunk when the pump and dump actually started. Finally, the labelled data that will be used in further analysis can be found in "feature"-folder.

In [114]:
created_df = pd.read_csv('features/features_nolabaled_15S.csv')
created_df

Unnamed: 0,date,pump_index,std_rush_order,avg_rush_order,std_trades,std_volume,avg_volume,std_price,avg_price,avg_price_max,hour_sin,hour_cos,minute_sin,minute_cos,symbol,Was_Pump_or_no
0,2021-06-19 21:20:45,0,0.000,0.000,0.000,0.000,-0.000,-0.001,0.000,0.000,-0.520,0.854,0.848,-0.530,WABI,0
1,2021-06-19 21:21:15,0,0.000,0.000,0.000,0.000,-0.000,-0.001,-0.000,-0.000,-0.520,0.854,0.787,-0.618,WABI,0
2,2021-06-19 21:30:00,0,0.000,0.000,0.000,0.000,-0.000,-0.001,-0.000,-0.000,-0.520,0.854,-0.053,-0.999,WABI,0
3,2021-06-19 21:30:30,0,-0.000,-0.003,0.000,0.000,-0.000,-0.001,-0.000,-0.000,-0.520,0.854,-0.053,-0.999,WABI,0
4,2021-06-19 21:30:45,0,0.000,0.000,0.000,0.000,-0.000,-0.001,-0.000,-0.000,-0.520,0.854,-0.053,-0.999,WABI,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283248,2022-02-03 14:40:30,75,0.000,0.000,0.000,-0.000,0.000,0.000,-0.002,-0.002,-0.631,-0.776,-0.899,-0.437,NAS,0
283249,2022-02-03 14:43:45,75,0.000,0.000,0.000,-0.000,0.000,0.000,-0.002,-0.002,-0.631,-0.776,-0.991,-0.133,NAS,0
283250,2022-02-03 14:46:45,75,0.002,0.022,-0.001,0.000,-0.002,0.000,-0.002,-0.002,-0.631,-0.776,-0.983,0.185,NAS,0
283251,2022-02-03 14:53:45,75,0.000,0.000,-0.001,0.000,-0.001,0.000,-0.002,-0.002,-0.631,-0.776,-0.596,0.803,NAS,0


# Cleaning unlabelled datasets, Final stage

* Analyzing the created_df dataset it was found that not all pumps were collected properly. That's just because of the ccxt library problems that couldn't be solved. Thus, I cleaned it. So, only 43 pumps left. 

In [119]:
only_available_binance_pds = pd.read_csv('мanually_collected_data/P&Ds_available.csv', sep = ';')

In [120]:
only_available_binance_pds

Unnamed: 0,Number_of_P&Ds,channel,coin,exchange,pair,timestamp,announced_growth
0,1,💎🚀 Hit Pump Angels 🚀💎,PIVX,binance,BTC,2021-03-28 17:00,"295,36%"
1,2,💎🚀 Hit Pump Angels 🚀💎,PPT,binance,BTC,2021-03-07 17:00,"457,85%"
2,3,💎🚀 Hit Pump Angels 🚀💎,MTH,binance,BTC,2021-03-01 17:00,"51,11%"
3,4,💎🚀 Hit Pump Angels 🚀💎,BRD,binance,BTC,2021-02-24 17:00,"86,36%"
4,5,💎🚀 Hit Pump Angels 🚀💎,RDN,binance,BTC,2021-02-22 17:00,"82,46%"
5,6,💎🚀 Hit Pump Angels 🚀💎,NXS,binance,BTC,2021-02-21 17:00,"399,75%"
6,7,💎🚀 Hit Pump Angels 🚀💎,PNT,binance,BTC,2021-01-31 21:00,"1255,45%"
7,8,💎🚀 Hit Pump Angels 🚀💎,MDA,binance,BTC,2021-01-03 17:00,"56,64%"
8,9,💎🚀 Hit Pump Angels 🚀💎,RCN,binance,BTC,2021-01-07 21:00,"197,54%"
9,10,💎🚀 Hit Pump Angels 🚀💎,NAS,binance,BTC,2020-05-30 16:00,"93,90%"


In [138]:
binance_pds = only_available_binance_pds

In [168]:
def build_features_multi(time_freq, rolling_freq):

    files = glob.glob(path)

    all_results_df = pd.DataFrame()
    count = 0
    pumps = binance_pds

    for i, pump in pumps.iterrows():
        print(pump['coin'] + ' ' + pump['timestamp'])
        coin = pump['coin']
        
        date, time = pump['timestamp'].split(' ')
        hours, sec = time.split(':')
 
        skip_pump = len(pumps[(pumps['coin'] == coin)]) == 0
        if skip_pump:
            continue

        results_df = build_features('data/' + pump['coin'] + '_' + date + ' ' + hours + '.' + sec + '.csv', coin, time_freq, rolling_freq, count)

        date_datetime = datetime.datetime.strptime(pump['timestamp'], '%Y-%m-%d %H:%M')

        # We consider 48 hours before and 48 hours after the pump, or 2 days in other words
        results_df = results_df[(results_df['date'] >= date_datetime - datetime.timedelta(hours=48)) & (results_df['date'] <= date_datetime + datetime.timedelta(hours=48))]

        all_results_df = pd.concat([all_results_df, results_df])
        count += 1

    all_results_df.to_csv('features/features_nolabaled_and_available_{}.csv'.format(time_freq), index=False, float_format='%.3f')


def compute_features():
    build_features_multi(time_freq='15S', rolling_freq=900)

In [169]:
compute_features()

PIVX 2021-03-28 17:00
PPT 2021-03-07 17:00
MTH 2021-03-01 17:00
BRD 2021-02-24 17:00
RDN 2021-02-22 17:00
NXS 2021-02-21 17:00
PNT 2021-01-31 21:00
MDA 2021-01-03 17:00
RCN 2021-01-07 21:00
NAS 2020-05-30 16:00
VIA 2020-05-29 15:59
ONG 2020-05-26 16:00
ARN 2020-04-07 18:00
GRS 2020-04-02 16:00
EDO 2020-03-26 15:59
NULS 2020-01-30 17:00
SKY 2021-02-03 21:00
FIO 2021-02-04 17:00
CTXC 2021-03-18 17:00
RCN 2021-01-10 17:00
ONG 2021-01-11 16:00
GVT 2021-01-23 21:00
VIB 2021-02-05 21:00
NEBL 2021-02-13 21:00
GVT 2021-02-20 17:00
ONG 2021-04-12 17:00
VIB 2021-09-05 17:00
QSP 2020-09-10 18:00
QLC 2020-09-06 16:00
PPT 2020-06-08 16:06
OAX 2021-05-30 17:00
WABI 2021-06-20 17:00
NAS 2021-08-22 17:00
NAS 2021-01-14 21:00
IDEX 2021-04-25 17:00
STEEM 2021-01-18 17:00
WPR 2021-05-09 17:00
FIO 2021-06-13 17:01
NEBL 2021-08-24 15:00
ATM 2021-09-10 16:00
WABI 2021-10-14 15:00
MDA 2021-11-23 15:00
DREP 2021-07-25 17:00


## Finally we have unlabelled dataset with Pumps 

In [171]:
final_df = pd.read_csv('features/features_nolabaled_and_available_15S.csv')

In [172]:
final_df

Unnamed: 0,date,pump_index,std_rush_order,avg_rush_order,std_trades,std_volume,avg_volume,std_price,avg_price,avg_price_max,hour_sin,hour_cos,minute_sin,minute_cos,symbol,Was_Pump_or_no
0,2021-03-27 02:24:00,0,0.000,0.000,-0.000,-0.0,-0.002,-0.001,0.001,0.001,0.520,0.854,0.553,-0.833,PIVX,0
1,2021-03-27 02:24:15,0,0.003,0.003,0.001,-0.0,0.001,-0.000,0.001,0.001,0.520,0.854,0.553,-0.833,PIVX,0
2,2021-03-27 02:24:30,0,0.000,0.003,-0.000,-0.0,0.001,-0.000,0.001,0.001,0.520,0.854,0.553,-0.833,PIVX,0
3,2021-03-27 02:24:45,0,0.000,0.003,-0.000,-0.0,0.000,-0.000,0.001,0.001,0.520,0.854,0.553,-0.833,PIVX,0
4,2021-03-27 02:25:00,0,0.000,0.000,-0.000,-0.0,0.000,0.000,0.001,0.001,0.520,0.854,0.461,-0.887,PIVX,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203647,2021-07-27 16:54:00,42,0.000,0.000,0.000,0.0,0.000,0.002,-0.000,-0.000,-0.942,-0.335,-0.508,0.862,DREP,0
203648,2021-07-27 16:54:15,42,0.000,0.000,0.000,0.0,-0.000,0.002,-0.000,-0.000,-0.942,-0.335,-0.508,0.862,DREP,0
203649,2021-07-27 16:54:30,42,0.000,0.000,0.000,0.0,-0.000,0.002,-0.000,-0.000,-0.942,-0.335,-0.508,0.862,DREP,0
203650,2021-07-27 16:56:00,42,0.000,0.000,0.000,-0.0,0.000,0.002,-0.000,-0.000,-0.942,-0.335,-0.314,0.949,DREP,0


## So, I manually labelled this dataset, you can find it in the folder - 'feature' 
* It is called: features_labelled_and_available_final.csv

Here is the dataset that I will use in empirical part of research:

In [176]:
final_df = pd.read_csv('features/features_labelled_and_available_final.csv', sep = ';')

In [177]:
final_df

Unnamed: 0,date,pump_index,std_rush_order,avg_rush_order,std_trades,std_volume,avg_volume,std_price,avg_price,avg_price_max,hour_sin,hour_cos,minute_sin,minute_cos,symbol,Was_Pump_or_no
0,2021-03-27 02:24:00,0,0.000,0.000,-0.000,-0.0,-0.002,-0.001,0.001,0.001,0.520,0.854,0.553,-0.833,PIVX,0
1,2021-03-27 02:24:15,0,0.003,0.003,0.001,-0.0,0.001,-0.000,0.001,0.001,0.520,0.854,0.553,-0.833,PIVX,0
2,2021-03-27 02:24:30,0,0.000,0.003,-0.000,-0.0,0.001,-0.000,0.001,0.001,0.520,0.854,0.553,-0.833,PIVX,0
3,2021-03-27 02:24:45,0,0.000,0.003,-0.000,-0.0,0.000,-0.000,0.001,0.001,0.520,0.854,0.553,-0.833,PIVX,0
4,2021-03-27 02:25:00,0,0.000,0.000,-0.000,-0.0,0.000,0.000,0.001,0.001,0.520,0.854,0.461,-0.887,PIVX,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203647,2021-07-27 16:54:00,42,0.000,0.000,0.000,0.0,0.000,0.002,-0.000,-0.000,-0.942,-0.335,-0.508,0.862,DREP,0
203648,2021-07-27 16:54:15,42,0.000,0.000,0.000,0.0,-0.000,0.002,-0.000,-0.000,-0.942,-0.335,-0.508,0.862,DREP,0
203649,2021-07-27 16:54:30,42,0.000,0.000,0.000,0.0,-0.000,0.002,-0.000,-0.000,-0.942,-0.335,-0.508,0.862,DREP,0
203650,2021-07-27 16:56:00,42,0.000,0.000,0.000,-0.0,0.000,0.002,-0.000,-0.000,-0.942,-0.335,-0.314,0.949,DREP,0


Labelled Pumps

In [178]:
final_df[final_df['Was_Pump_or_no'] == 1]

Unnamed: 0,date,pump_index,std_rush_order,avg_rush_order,std_trades,std_volume,avg_volume,std_price,avg_price,avg_price_max,hour_sin,hour_cos,minute_sin,minute_cos,symbol,Was_Pump_or_no
2888,2021-03-28 17:00:00,0,8.095,0.741,7.876,23.256,2.728,0.13,0.086,0.128,-0.998,-0.068,0.0,1.0,PIVX,1
13751,2021-03-07 17:00:00,1,11.716,0.386,10.476,12.552,1.046,0.061,0.056,0.094,-0.998,-0.068,0.0,1.0,PPT,1
20567,2021-03-01 17:00:00,2,3.076,0.688,9.519,2.276,0.539,0.02,0.019,0.026,-0.998,-0.068,0.0,1.0,MTH,1
23158,2021-02-24 17:00:00,3,3.914,0.397,5.774,8.911,0.894,0.004,0.017,0.025,-0.998,-0.068,0.0,1.0,BRD,1
26485,2021-02-22 17:00:00,4,2.458,0.192,2.998,2.084,0.307,0.005,0.018,0.027,-0.998,-0.068,0.0,1.0,RDN,1
34492,2021-02-21 17:00:00,5,15.233,0.841,22.558,24.532,2.164,0.039,0.049,0.069,-0.998,-0.068,0.0,1.0,NXS,1
43657,2021-01-31 21:00:00,6,22.245,1.696,28.426,46.705,4.982,1.173,0.139,0.204,-0.52,0.854,0.0,1.0,PNT,1
53623,2021-01-03 17:00:15,7,0.693,0.092,1.257,1.346,0.226,0.008,0.009,0.011,-0.998,-0.068,0.0,1.0,MDA,1
56168,2021-01-07 21:00:00,8,11.258,1.231,33.089,26.108,2.44,0.508,0.111,0.16,-0.52,0.854,0.0,1.0,RCN,1
59840,2020-05-30 16:00:00,9,1.166,0.417,1.718,2.08,0.227,0.198,0.014,0.03,-0.942,-0.335,0.0,1.0,NAS,1
