Skip to content

Commit

Permalink
Solidify and test loading XML files using the processing queue
Browse files Browse the repository at this point in the history
  • Loading branch information
Phrancis committed Sep 6, 2016
1 parent c2d17ea commit 93f2ad6
Show file tree
Hide file tree
Showing 6 changed files with 93 additions and 35 deletions.
Expand Up @@ -10,7 +10,8 @@ CREATE TABLE RawDataXml.XmlProcessingQueue (
DataType NVARCHAR(256),
SiteDirectory NVARCHAR(256),
FilePath NVARCHAR(512),
Processed BIT DEFAULT 0
Processed BIT DEFAULT 0,
CONSTRAINT fk_XmlProcessingQueue_SiteId FOREIGN KEY (SiteId) REFERENCES CleanData.Sites(Id)
);
DECLARE
@RootDirectory NVARCHAR(256),
Expand Down Expand Up @@ -86,4 +87,17 @@ UPDATE RawDataXml.XmlProcessingQueue
SET SiteId = '00000000-0000-0000-0000-000000000001', ApiSiteParameter = 'meta.arabic'
WHERE FilePath LIKE '%stackexchange\meta.arabic.stackexchange.com%'

SELECT * FROM RawDataXml.XmlProcessingQueue ORDER BY ApiSiteParameter
SELECT * FROM RawDataXml.XmlProcessingQueue ORDER BY ApiSiteParameter

IF OBJECT_ID('RawDataXml.XmlProcessingLog') IS NOT NULL
DROP TABLE RawDataXml.XmlProcessingLog;
GO
CREATE TABLE RawDataXml.XmlProcessingLog (
SiteId UNIQUEIDENTIFIER,
ApiSiteParameter NVARCHAR(256),
SiteDirectory NVARCHAR(256),
FilePath NVARCHAR(512),
ProcessingTimeMs BIGINT NULL,
Processed DATETIME2 NOT NULL DEFAULT GETDATE(),
CONSTRAINT fk_XmlProcessingLog_SiteId FOREIGN KEY (SiteId) REFERENCES CleanData.Sites(Id)
);
@@ -0,0 +1,15 @@
-- Delete table if it exists already
IF OBJECT_ID('RawDataXml.XmlFiles') IS NOT NULL
DROP TABLE RawDataXml.XmlFiles;
GO
-- Create the table
CREATE TABLE RawDataXml.XmlFiles (
SiteId UNIQUEIDENTIFIER,
ApiSiteParameter NVARCHAR(256) NOT NULL,
DataType NVARCHAR(256) NOT NULL,
RawDataXml XML NULL,
SourceFilePath NVARCHAR(512) NOT NULL,
Inserted DATETIME2 DEFAULT GETDATE(),
CONSTRAINT fk_XmlFiles_SiteId FOREIGN KEY (SiteId) REFERENCES CleanData.Sites(Id)
);
GO
Expand Up @@ -10,20 +10,6 @@
USE SE
GO

-- Delete table if it exists already
IF OBJECT_ID('RawDataXml.Badges') IS NOT NULL
DROP TABLE RawDataXml.Badges;
GO
-- Create the table
CREATE TABLE RawDataXml.Badges (
SiteId UNIQUEIDENTIFIER,
ApiSiteParameter NVARCHAR(256) NOT NULL,
RawDataXml XML NULL,
Inserted DATETIME2 DEFAULT GETDATE(),
CONSTRAINT fk_Badges_SiteId FOREIGN KEY (SiteId) REFERENCES CleanData.Sites(Id)
);
GO

-- Delete table if it exists already
IF OBJECT_ID('CleanData.Badges') IS NOT NULL
DROP TABLE CleanData.Badges;
Expand Down
Expand Up @@ -12,6 +12,7 @@ GO

CREATE PROCEDURE RawDataXml.usp_LoadBadgesXml
@SiteDirectory NVARCHAR(256),
@FullFilePath NVARCHAR(512),
-- Delete the loaded XML file after processing if True/1 (default True):
@DeleteXmlRawDataAfterProcessing BIT = 1,
-- Display/Return results to caller if @ReturnRows is set to True (default False)
Expand All @@ -21,10 +22,6 @@ BEGIN
SET NOCOUNT ON;
-- Fetch global source path parameter:
DECLARE @SourcePath NVARCHAR(256);
DECLARE @bslash CHAR = CHAR(92);
SET @SourcePath = (SELECT Value FROM RawDataXml.Globals WHERE Parameter = 'SourcePath');
-- Make sure path ends with backslash (ASCII char 92)
IF(SELECT RIGHT(@SourcePath, 1)) <> @bslash SET @SourcePath += @bslash;

-- Fetch site identifiers based on @SiteDirectory parameter:
DECLARE @SiteId UNIQUEIDENTIFIER;
Expand All @@ -42,29 +39,26 @@ BEGIN
RAISERROR(@ErrMsg, 11, 1);
END

-- Delete any previous XML data that may be present for the site:
DELETE FROM RawDataXml.Badges
WHERE SiteId = @SiteId;

/** XML FILE HANDLING **
This section loads the XML file from the file system into a table.
If @DeleteXmlRawDataAfterProcessing is set to 1 (default)
this XML data will be deleted from the database (but not from the file system)
after the data is parsed into a relational table (below).
*****/

DECLARE @FilePath NVARCHAR(512) = @SourcePath + @SiteDirectory + @bslash + 'Badges.xml';
DECLARE @SQL_OPENROWSET_QUERY NVARCHAR(1024);

-- Dynamic SQL is used here because OPENROWSET will only accept a string literal as argument for the file path.
SET @SQL_OPENROWSET_QUERY =
'INSERT INTO RawDataXml.Badges (SiteId, ApiSiteParameter, RawDataXml)' + CHAR(10)
'INSERT INTO RawDataXml.XmlFiles (SiteId, ApiSiteParameter, DataType, RawDataXml, SourceFilePath)' + CHAR(10)
+ 'SELECT ' + QUOTENAME(@SiteId, '''') + ', ' + CHAR(10)
+ QUOTENAME(@ApiSiteParameter, '''') + ', ' + CHAR(10)
+ 'CONVERT(XML, BulkColumn) AS BulkColumn' + CHAR(10)
+ 'FROM OPENROWSET(BULK ' + QUOTENAME(@FilePath, '''') + ', SINGLE_BLOB) AS x;'
+ '''Badges'', ' + CHAR(10)
+ 'CONVERT(XML, BulkColumn) AS BulkColumn, ' + CHAR(10)
+ QUOTENAME(@FullFilePath, '''') + CHAR(10)
+ 'FROM OPENROWSET(BULK ' + QUOTENAME(@FullFilePath, '''') + ', SINGLE_BLOB) AS x;'

PRINT CONVERT(NVARCHAR(256), GETDATE(), 21) + ' Processing ' + @FilePath;
PRINT CONVERT(NVARCHAR(256), GETDATE(), 21) + ' Processing: ' + @FullFilePath;

-- Execute the dynamic query to load XML into the table:
EXECUTE sp_executesql @SQL_OPENROWSET_QUERY;
Expand All @@ -75,15 +69,11 @@ BEGIN
and ensure a "fresh" set of data.
*****/

-- Clear any existing data:
DELETE FROM CleanData.Badges
WHERE SiteId = @SiteId;

-- Prepare XML document for parsing:
DECLARE @XML AS XML;
DECLARE @Doc AS INT;
SELECT @XML = RawDataXml
FROM RawDataXml.Badges
FROM RawDataXml.XmlFiles
WHERE SiteId = @SiteId;
EXEC sp_xml_preparedocument @Doc OUTPUT, @XML;

Expand Down Expand Up @@ -125,7 +115,7 @@ BEGIN
-- Delete the loaded XML file after processing if True/1 (default True):
IF @DeleteXmlRawDataAfterProcessing = 1
BEGIN
DELETE FROM RawDataXml.Badges
DELETE FROM RawDataXml.XmlFiles
WHERE SiteId = @SiteId;
END

Expand Down
@@ -0,0 +1,49 @@
DECLARE @StartTime DATETIME2 = GETDATE()

DECLARE
@RowNum INT,
@SiteDirectory NVARCHAR(256),
@FullFilePath NVARCHAR(512),
@Now DATETIME2;

DECLARE _BadgesXmlProcessing CURSOR FOR
SELECT TOP 100
RowNum,
SiteDirectory,
FilePath
FROM RawDataXml.XmlProcessingQueue
WHERE DataType = 'Badges'
AND Processed = 0
ORDER BY RowNum ASC;

OPEN _BadgesXmlProcessing;

FETCH NEXT FROM _BadgesXmlProcessing INTO @RowNum, @SiteDirectory, @FullFilePath;

WHILE @@FETCH_STATUS = 0
BEGIN
SET @Now = GETDATE();
EXECUTE RawDataXml.usp_LoadBadgesXml @SiteDirectory, @FullFilePath;

INSERT INTO RawDataXml.XmlProcessingLog
SELECT SiteId, ApiSiteParameter, SiteDirectory, FilePath,
DATEDIFF(MILLISECOND, @Now, GETDATE()),
GETDATE()
FROM RawDataXml.XmlProcessingQueue
WHERE RowNum = @RowNum;
UPDATE RawDataXml.XmlProcessingQueue
SET Processed = 1
WHERE RowNum = @RowNum;

FETCH NEXT FROM _BadgesXmlProcessing INTO @RowNum, @SiteDirectory, @FullFilePath;
END

CLOSE _BadgesXmlProcessing;
DEALLOCATE _BadgesXmlProcessing;

--verify
SELECT DATEDIFF(SECOND, @StartTime, GETDATE()) AS [ProcessingTimeSeconds]
SELECT * FROM RawDataXml.XmlProcessingLog
SELECT * FROM CleanData.Badges ORDER BY ApiSiteParameter ASC, CreationDate ASC
SELECT COUNT(*) AS [BadgesXmlLeftToProcess]
FROM RawDataXml.XmlProcessingQueue WHERE DataType = 'Badges' AND Processed = 0;
4 changes: 4 additions & 0 deletions SQLScripts/2-Loading-and-parsing-XML-data/README.md
Expand Up @@ -52,6 +52,10 @@ Instructions:
2. Verify the displayed results.
3. If incorrect values are present, return to 0.2 and make corrections there first; then, run this again and the data should be corrected.

###0.4-Create_RawDataXml.XmlFiles_Table.sql

This will create the table that will temporarily hold XML files until they are parsed into relational table. It will be used by all the procedures/routines for loading and parsing XML files.

##1. Badges

Badges are awarded to users when they complete a certain activity or reach a certain milestone. The conditions vary for each Stack Exchange site. Click the __Badges__ link at the top of any Stack Exchange site for specific details.
Expand Down

0 comments on commit 93f2ad6

Please sign in to comment.