-
Notifications
You must be signed in to change notification settings - Fork 0
/
Queries.txt
96 lines (78 loc) · 3.27 KB
/
Queries.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
######### SQL Server Table ############
CREATE TABLE [dbo].[delays](
[origin_city_name] [nvarchar](50) NOT NULL,
[weather_delay] float,
CONSTRAINT [PK_delays] PRIMARY KEY CLUSTERED
([origin_city_name] ASC))
GO
SELECT * FROM information_schema.tables
GO
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
######### Hive Tables and Query ############
DROP TABLE flightdelay_raw;
CREATE EXTERNAL TABLE flightdelay_raw(
YEAR string,
FL_DATE string,
UNIQUE_CARRIER string,
CARRIER string,
FL_NUM string,
ORIGIN_AIRPORT_ID string,
ORIGIN string,
ORIGIN_CITY_NAME string,
ORIGIN_CITY_NAME_TEMP string,
ORIGIN_STATE_ABR string,
DEST_AIRPORT_ID string,
DEST string,
DEST_CITY_NAME string,
DEST_CITY_NAME_TEMP string,
DEST_STATE_ABR string,
DEP_DELAY_NEW float,
ARR_DELAY_NEW float,
CARRIER_DELAY float,
WEATHER_DELAY float,
NAS_DELAY float,
SECURITY_DELAY float,
LATE_AIRCRAFT_DELAY float)
-- The following lines describe the format and location of the file
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/demo/FlightDelayData/InputData';
DROP TABLE flightdelay;
CREATE TABLE flightdelay AS
SELECT YEAR AS year,
FL_DATE AS flight_date,
substring(UNIQUE_CARRIER, 2, length(UNIQUE_CARRIER) -1) AS unique_carrier,
substring(CARRIER, 2, length(CARRIER) -1) AS carrier,
substring(FL_NUM, 2, length(FL_NUM) -1) AS flight_num,
ORIGIN_AIRPORT_ID AS origin_airport_id,
substring(ORIGIN, 2, length(ORIGIN) -1) AS origin_airport_code,
substring(ORIGIN_CITY_NAME, 2) AS origin_city_name,
substring(ORIGIN_STATE_ABR, 2, length(ORIGIN_STATE_ABR) -1) AS origin_state_abr,
DEST_AIRPORT_ID AS dest_airport_id,
substring(DEST, 2, length(DEST) -1) AS dest_airport_code,
substring(DEST_CITY_NAME,2) AS dest_city_name,
substring(DEST_STATE_ABR, 2, length(DEST_STATE_ABR) -1) AS dest_state_abr,
DEP_DELAY_NEW AS dep_delay_new,
ARR_DELAY_NEW AS arr_delay_new,
CARRIER_DELAY AS carrier_delay,
WEATHER_DELAY AS weather_delay,
NAS_DELAY AS nas_delay,
SECURITY_DELAY AS security_delay,
LATE_AIRCRAFT_DELAY AS late_aircraft_delay
FROM flightdelay_raw;
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
######### Hive Query to save data back to Gen 2 storage############
INSERT OVERWRITE DIRECTORY '/tutorials/flightdelays/output'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
SELECT regexp_replace(origin_city_name, '''', ''),
avg(weather_delay)
FROM delays
WHERE weather_delay IS NOT NULL
GROUP BY origin_city_name;
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
######### Sqoop commands ############
// SSH and Sqoop export
ssh eshantssh@clustername108-ssh.azurehdinsight.net
sqoop list-databases --connect jdbc:sqlserver://dbs108.database.windows.net:1433 --username eshant -P
sqoop export --connect 'jdbc:sqlserver://dbs108.database.windows.net:1433;database=db108' --username eshant -P --table 'delays' --export-dir '/demo/FlightDelayData/OutputData' --fields-terminated-by ',' -m 1