From d0bf21662d1d1344398d2b411cd87e7fd6f5d560 Mon Sep 17 00:00:00 2001
From: SakuraSound <CinnabarCourage@gmail.com>
Date: Thu, 23 Feb 2012 15:50:01 -0500
Subject: [PATCH] More tweaks to data and examples section...

---
 .gitignore                                    |  12 +-
 .gitignore~                                   |  10 ++
 .../content/new_systemdemonstration.tex       | 104 ++++++++----------
 paper/vldb12/madden.tex                       |   2 -
 4 files changed, 64 insertions(+), 64 deletions(-)
 create mode 100644 .gitignore~

diff --git a/.gitignore b/.gitignore
index 715e771..be5cf06 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,9 +2,9 @@
 docs/.DS_Store
 docs/Proposal/.DS_Store
 Crawlers/.pydevproject
-paper/sigmod12/madden.aux
-paper/sigmod12/madden.log
-paper/sigmod12/madden.pdf
-paper/sigmod12/madden.synctex.gz
-paper/sigmod12/madden.bbl
-paper/sigmod12/madden.blg
+paper/*/madden.aux
+paper/*/madden.log
+paper/*/madden.pdf
+paper/*/madden.synctex.gz
+paper/*/madden.bbl
+paper/*/madden.blg
diff --git a/.gitignore~ b/.gitignore~
new file mode 100644
index 0000000..715e771
--- /dev/null
+++ b/.gitignore~
@@ -0,0 +1,10 @@
+.project
+docs/.DS_Store
+docs/Proposal/.DS_Store
+Crawlers/.pydevproject
+paper/sigmod12/madden.aux
+paper/sigmod12/madden.log
+paper/sigmod12/madden.pdf
+paper/sigmod12/madden.synctex.gz
+paper/sigmod12/madden.bbl
+paper/sigmod12/madden.blg
diff --git a/paper/vldb12/content/new_systemdemonstration.tex b/paper/vldb12/content/new_systemdemonstration.tex
index 6e7db50..f043fd1 100644
--- a/paper/vldb12/content/new_systemdemonstration.tex
+++ b/paper/vldb12/content/new_systemdemonstration.tex
@@ -12,40 +12,29 @@ \subsection{Dataset for Example}
 Our sample demonstration for MADden involves a variety of NFL based data sources. 
 The data is represented in Table \ref{tab:madschema} as an abbreviated
 schema\footnote{These tables may be extracted to an RDBMS, or defined over an
-API using a foreign data wrapper (fdw).}.The {\tt NFLCorpus} table holds blogs
+API using a foreign data wrapper (fdw).}.
+
+The {\tt NFLCorpus} table holds blogs
 and news articles crawled from the web, as well as tweets extracted using the
 Twitter Streaming API with a series of NFL related keywords. These documents
 vary in size and quality, with some potential out-of domain documents (mainly
-tweets). The other tables are more structured, including {\tt
-$PlayerStats2011_*$}, with * indicating passing ({\tt Pass}), Receiving ({\tt
-Rec}), Rushing ({\tt Rush}), Special Teams ({\tt ST}), or Defense ({\tt Def}).
-
-
-
-
-
-. This table contains
-millions of documents, varying in size and quality. The type of document 
-can be specified via the type attribute. The types include `news', `blogs' or
-`tweets'. The \textit{entry} attribute of {\tt NFLCorpus} contains the text data.
-The other tables are structured but the data may contain duplicates or
-misspelled terms. It is also possible for these documents to be out of domain.
-The {\tt Players} table is a list of individuals
-who play in the NFL. The player data is obtained from NFL.com
-The {\tt PlayerStats} table has a \textit{position} field that specifies
-the type of statistic that is inside of the \textit{stats} column.
-The {\tt PlayerAlias} table list alternative names for players.
-While this list may be obtained using topic modeling or word
-co-occurrence, we hand coded a list of alias names for a select
-group of well-known players. This helps in resolving players who are
-referred to by their nickname, `Sticky Fingers', in the case of
-`Larry Fitzgerald'.
-The {\tt Teams} table holds all the team
-names in the NFL and the {\tt TeamAlias} table has the alternate names for these
-teams. ``ARI'' as an abbreviation,  and ``Football Cardinals'' as a nickname, are 
-example entries in this table. {\tt GameStats} contains the amount of points scored by a 
-home and away team for each NFL game played.\\
+tweets). Each document holds a \textit{doc\_id}, \textit{doc\_type},
+\textit{text}, associated tags, and document metadata.
 
+The other tables are more structured,
+including {\tt PlayerStats2011\_*}, with * indicating Passing ({\tt Pass}), Receiving ({\tt
+Rec}), Rushing ({\tt Rush}), Special Teams ({\tt ST}), or Defense ({\tt Def}).
+This data was extracted from the NFL.com player database. Each table contains
+the player's \textit{name}, \textit{position}, \textit{number}, and a series of
+stats corresponding to the stat type (some players show up in multiple tables, others in only one). The
+{\tt Player} table holds information about a player in the NFL, including
+\textit{college}, \textit{birthday}, \textit{height}, \textit{weight}, as well
+as \textit{years\_in\_NFL}. The {\tt Team} table holds some basic information
+about the 32 NFL teams, including \textit{location}, \textit{conference}, \textit{division}, and \textit{stadium}.
+{\tt TeamStats\_2011} holds the team rankings and stats in a variety of categories (Offense, Defense, Special Teams,
+Points, etc.).
+{\tt Extracted\_Entities} can be either a view or a table, and it stores the
+entities extracted from the {\tt NFLCorpus} documents.
 
 \begin{table}
 \begin{center}
@@ -53,15 +42,17 @@ \subsection{Dataset for Example}
 \hline
 \multicolumn{2}{|c|}{Example Schema}\\
 \hline
-$match(target, against)$ & Entity Resolution\\
+$NFLCorpus$ & Documents table\\
 \hline
-$sentiment(text)$ & Sentiment Analysis\\
+$PlayerStats2011\_*$ & Player Stats tables for 2011\\
+\hline
+$Player$ & Basic Player information\\
 \hline
-$entity_find(text, boolean)$ & Detects Named Entities\\
+$TeamStats\_2011$ & Total Team stats and rankings\\
 \hline
-$pos_tag(text)$ & POS tagging\\
+$Team$ & Basic Team information \\
 \hline
-$pos_extract(text, type)$ & POS term extraction \\
+$Extracted\_Entities$ & View/Table of entities and documents \\
 \hline
 \end{tabular}
 \end{center}
@@ -92,11 +83,11 @@ \subsection{Text Analytics Queries}
 \begin{small}
 \begin{alltt}
 \textit{Q1: Entity Resolution}
-SELECT DISTINCT docid
+SELECT DISTINCT doc_id
 FROM extracted_entities
-WHERE match('Jaguars', entity) > .7
-   OR match('Dolphins', entity) > .7
-   OR match('Buccaneers', entity) > .7;
+WHERE match('Jaguars', entity) > match\_thresh
+   OR match('Dolphins', entity) > match\_thresh
+   OR match('Buccaneers', entity) > match\_thresh;
 \end{alltt}
 \end{small}
 %\end{lstlisting}
@@ -107,7 +98,7 @@ \subsection{Text Analytics Queries}
 using an Entity Recognition function on textual documents as they
 are added to the database (In this case, likely news articles and blogs). A
 table of all current text analysis functions can be seen in
-\ref{tab:madfunct}.\\
+Table~\ref{tab:madfunct}. \\
 
 \begin{table}
 \begin{center}
@@ -119,11 +110,11 @@ \subsection{Text Analytics Queries}
 \hline
 $sentiment(text)$ & Sentiment Analysis\\
 \hline
-$entity_find(text, boolean)$ & Detects Named Entities\\
+$entity\_find(text, boolean)$ & Detects Named Entities\\
 \hline
-$pos_tag(text)$ & POS tagging\\
+$pos\_tag(text)$ & POS tagging\\
 \hline
-$pos_extract(text, type)$ & POS term extraction \\
+$pos\_extract(text, type)$ & POS term extraction \\
 \hline
 \end{tabular}
 \end{center}
@@ -145,9 +136,9 @@ \subsection{Text Analytics Queries}
 \textit{Q2: Entity Resolution and Sentiment Analysis}
 SELECT DISTINCT E.docid, E.entity, sentiment(S.document)
 FROM extracted_entities as E, NFLCorpus as S
-WHERE E.docid = S.docid
+WHERE E.doc_id = S.doc_id
   AND sentiment(S.document) in ('+', '-')
-  AND match('Jaguars', entity) > .7
+  AND match('Jaguars', E.entity) > match\_thresh
   AND S.type = 'tweet';
 \end{alltt}
 \end{small}
@@ -172,17 +163,19 @@ \subsection{Text Analytics Queries}
 %\begin{lstlisting}{language=SQL}
 \begin{small}
 \begin{alltt}
-\textit{Q3: Entity Resolution and Sentiment Analysis}
+\textit{Q3: Structured & Unstructured}
 SELECT BestWR.name, sentiment(A.txt)
-FROM NFLCorpus A, (SELECT P.fname || ' ' || P.lname as name
-                   FROM PlayerStats2011_Rec P
-                   WHERE (P.team = 'Jaguars' 
-                      OR P.team = 'Dolphins' 
-                      OR P.team ='Buccaneers') 
-                   ORDER BY P.rec_yds DESC, P.recs ASC
-                   LIMIT 1) as BestWR
-WHERE match(BestWR.name, A.txt) > .7
-  AND (A.type = 'blog' OR A.type = 'News');
+FROM NFLCorpus A, extracted_entities E,
+         (SELECT P.fname || ' ' || P.lname as name
+          FROM PlayerStats2011_Rec P
+          WHERE P.team = 'Jaguars' 
+             OR P.team = 'Dolphins' 
+             OR P.team = 'Buccaneers'
+          ORDER BY P.rec_yds DESC, P.recs ASC
+          LIMIT 1) as BestWR
+WHERE E.doc_id = A.doc_id 
+  AND (A.type = 'blog' OR A.type = 'news')
+  AND match(BestWR.name, E.entity) > match\_thresh;
 \end{alltt}
 \end{small}
 %\end{lstlisting}
@@ -198,7 +191,6 @@ \subsection{User Interface}
 for campaign management queries.
 
 
-
 For the demonstration, a web interface will be provided to interact with our
 system. It will utilize both a raw SQL UI, as well as a Mad
 Lib\footnote{http://en.wikipedia.org/wiki/Mad\_Libs} style interface. The raw
diff --git a/paper/vldb12/madden.tex b/paper/vldb12/madden.tex
index ff22feb..e2f19ef 100644
--- a/paper/vldb12/madden.tex
+++ b/paper/vldb12/madden.tex
@@ -39,8 +39,6 @@
 
 \input{content/systemdescription.tex}
 
-\input{content/data.tex}
-
 %\input{content/comparison.tex}
 
 \input{content/new_systemdemonstration.tex}