# Test interaction with ClickHouse and HDFS

In [1]:
from clickhouse_driver import Client
import pandas as pd
import os
from dotenv import load_dotenv

## ClickHouse Cloud

In [2]:
# Load connection settings from the local .env file into the process environment.
load_dotenv()

cloud = os.getenv("HOST")
# NOTE(review): USERNAME is frequently pre-set by the OS/shell, and load_dotenv()
# does not override existing variables by default — confirm the .env value is used.
user = os.getenv("USERNAME")
password = os.getenv("PASS")
port = 9440  # clickhouse-driver's default port for secured (TLS) native connections

# Fail early with a clear message instead of an opaque connection error later.
missing = [
    name
    for name, value in (("HOST", cloud), ("USERNAME", user), ("PASS", password))
    if not value
]
if missing:
    raise RuntimeError(f"Missing required environment variables: {', '.join(missing)}")

# Connect to database over TLS. Certificate verification stays enabled so the
# credentials are never sent to an unauthenticated endpoint.
conn = Client(host=cloud, port=port, user=user, password=password, secure=True, verify=True)
conn.execute("show databases")

[('INFORMATION_SCHEMA',),
 ('default',),
 ('information_schema',),
 ('mydb',),
 ('system',)]

In [15]:
# Small sample frame used to exercise inserts: one (year, first_name) pair per row.
df = pd.DataFrame(
    [
        (1994, 'Vova'),
        (1995, 'Anja'),
        (1996, 'Vasja'),
        (1997, 'Petja'),
        (2000, 'Petja'),
        (2131, 'Petja'),
    ],
    columns=['year', 'first_name'],
)

In [11]:
# Set up the target schema: the database first, then the table inside it.
# Exceptions are printed rather than re-raised, so the cell itself does not fail.
try:
    print("Create database mydb")
    conn.execute("create database if not exists mydb")
    print("done...")
    # Two-column table matching the sample frame (year, first_name).
    conn.execute('''
    create table if not exists mydb.test(
        year Int,
        first_name String
    ) engine = Memory
    ''')
    print("Created table mydb.test")
except Exception as err:
    print(err)

Create database mydb
done...
Created table mydb.test


In [12]:
# Bulk-insert the sample frame. The rows are passed as a list of dicts keyed by
# column name; the explicit column list keeps the insert valid if the table later
# gains columns. execute() returns the number of inserted rows for an INSERT.
try:
    inserted_rows = conn.execute(
        "insert into mydb.test (year, first_name) values",
        df.to_dict('records'),
    )
    print(f'done pushing {inserted_rows} rows into test table')
except Exception as e:
    print(e)

done pussing 6 into test table


In [16]:
# Read back every row of mydb.test; the result list is displayed as the cell output.
conn.execute('select * from mydb.test')

[(1994, 'Vova'),
 (1995, 'Anja'),
 (1996, 'Vasja'),
 (1997, 'Petja'),
 (2000, 'Petja'),
 (2131, 'Petja')]

## Local

In [2]:
# Client for a server on localhost with the driver's default settings.
# This rebinds `conn`, so later cells no longer talk to the cloud instance.
conn = Client(host='localhost')

In [3]:
# List the databases visible through the current `conn`.
conn.execute("show databases")

[('INFORMATION_SCHEMA',),
 ('default',),
 ('information_schema',),
 ('system',),
 ('testdb',)]

In [6]:
# Create the yellow_taxi_data table through the current `conn`.
# Native ClickHouse type names are spelled out on purpose: in ClickHouse FLOAT is an
# alias for Float32 and a precision argument such as FLOAT(53) is ignored, so the
# double-precision columns must be declared as Float64 explicitly.
# Because of IF NOT EXISTS, a table created with the earlier definition is left
# unchanged — drop it first to pick up the new column types.
try:
    print("create table yellow_taxi_data")
    conn.execute("""
    CREATE TABLE IF NOT EXISTS yellow_taxi_data (
        "VendorID" Int64,
        tpep_pickup_datetime DateTime,
        tpep_dropoff_datetime DateTime,
        passenger_count Float64,
        trip_distance Float64,
        "RatecodeID" Float64,
        store_and_fwd_flag String,
        "PULocationID" Int64,
        "DOLocationID" Int64,
        payment_type Int64,
        fare_amount Float64,
        extra Float64,
        mta_tax Float64,
        tip_amount Float64,
        tolls_amount Float64,
        improvement_surcharge Float64,
        total_amount Float64,
        congestion_surcharge Float64,
        airport_fee Float64
    ) ENGINE = Memory
    """)
    print('Done...')
except Exception as e:
    # Errors are printed rather than re-raised, so the cell itself does not fail.
    print(e)

create table ny_taxi_data
Done...


## HDFS

In [9]:
# Open an hdfs3 connection to localhost:8020 and list the root directory.
# Note that this rebinds the module-level `port` used by the cloud cell.
from hdfs3 import HDFileSystem
port = 8020
hdfs = HDFileSystem(host='localhost', port=port)
hdfs.ls('/')

[]

In [3]:
import dask.dataframe as dd
from pyarrow import fs
# Same endpoint through pyarrow's HDFS binding; rebinds `hdfs` when the call succeeds.
# The recorded run failed here with "Prior attempt to load libhdfs failed".
hdfs = fs.HadoopFileSystem(host='localhost', port=8020)

OSError: Prior attempt to load libhdfs failed

In [6]:
# Filesystem usage summary; the recorded output reports capacity, used and percent-free.
hdfs.df()

{'capacity': 1081101176832, 'used': 45056, 'percent-free': 99.99999583239747}

In [2]:
# Create a directory named 'hello'.
# NOTE(review): the path is relative — confirm where it resolves on the cluster.
hdfs.mkdir('hello')

In [6]:
# Upload the local requirements.txt to the HDFS root.
# The recorded run failed with "Permission denied": "/" is owned by root:supergroup
# with mode drwxr-xr-x, so the notebook user has no WRITE access there.
hdfs.put('requirements.txt', '/requirements.txt')

OSError: Could not open file: /requirements.txt, mode: wb Permission denied: user=phonghuynh, access=WRITE, inode="/":root:supergroup:drwxr-xr-x

In [3]:
# Upload test.txt to the HDFS root; exceptions are printed instead of re-raised.
try:
    hdfs.put("test.txt", "/test.txt")
except Exception as err:
    print(err)

2023-09-28 15:40:03.883973, p680, th139982298342464, ERROR Failed to setup the pipeline for new block [block pool ID: BP-435783800-172.19.0.4-1695890191831 block ID 1073741825_1001] file /test.txt.
Pipeline.cpp: 700: HdfsIOException: Cannot create block output stream for block [block pool ID: BP-435783800-172.19.0.4-1695890191831 block ID 1073741825_1001], recovery flag: false, with last generate stamp 0.
	@	Unknown
	@	Unknown
	@	Unknown
	@	Unknown
	@	Unknown
	@	Unknown
	@	Unknown
	@	Unknown
	@	ffi_closure_alloc
	@	dlfree
	@	dlmmap.constprop.0
	@	u_set
	@	Unknown
	@	_PyObject_MakeTpCall
	@	_PyEval_EvalFrameDefault
	@	_PyFunction_Vectorcall
	@	_PyEval_EvalFrameDefault
	@	_PyFunction_Vectorcall
	@	_PyEval_EvalFrameDefault
	@	_PyEval_EvalCodeWithName
	@	_PyFunction_Vectorcall
	@	method_vectorcall
	@	_PyEval_EvalFrameDefault
	@	_PyEval_EvalCodeWithName
	@	method_vectorcall
	@	_PyEval_EvalFrameDefault
	@	_PyEval_EvalCodeWithName
	@	PyEval_EvalCodeEx
	@	PyEval_EvalCode
	@	builtin_exec
	@	cfu

In [6]:
# Read back up to the first 1000 bytes of /test.txt in binary mode.
# The handle is closed when the with-block exits.
with hdfs.open("/test.txt", "rb") as file:
    data = file.read(1000)

# Bare expression so the bytes read are shown as the cell output.
data

2023-09-28 15:47:14.037917, p680, th139982298342464, ERROR cannot setup block reader for Block: [block pool ID: BP-435783800-172.19.0.4-1695890191831 block ID 1073741826_1002] file /test.txt on Datanode: 40d68dbd7a95(172.19.0.3).
RemoteBlockReader.cpp: 122: HdfsIOException: RemoteBlockReader: Failed to connect to 40d68dbd7a95(172.19.0.3)
	@	Unknown
	@	Unknown
	@	Unknown
	@	Unknown
	@	Unknown
	@	Unknown
	@	Unknown
	@	Unknown
	@	ffi_closure_alloc
	@	dlfree
	@	dlmmap.constprop.0
	@	u_set
	@	unpickle
	@	_PyObject_MakeTpCall
	@	_PyEval_EvalFrameDefault
	@	_PyEval_EvalCodeWithName
	@	method_vectorcall
	@	_PyEval_EvalFrameDefault
	@	_PyEval_EvalCodeWithName
	@	_PyFunction_Vectorcall
	@	_PyEval_EvalFrameDefault
	@	_PyEval_EvalCodeWithName
	@	PyEval_EvalCodeEx
	@	PyEval_EvalCode
	@	builtin_exec
	@	cfunction_vectorcall_FASTCALL
	@	_PyEval_EvalFrameDefault
	@	gen_send_ex
	@	_PyEval_EvalFrameDefault
	@	gen_send_ex
	@	_PyEval_EvalFrameDefault
	@	gen_send_ex
	@	method_vectorcall_O
	@	_PyEval_EvalFra