Skip to content

Commit

Permalink
Initial GitHub import
Browse files Browse the repository at this point in the history
  • Loading branch information
kylemarkwilliams committed Jul 11, 2013
0 parents commit e7e44b4
Show file tree
Hide file tree
Showing 2,004 changed files with 4,541,882 additions and 0 deletions.
50 changes: 50 additions & 0 deletions .gitignore
@@ -0,0 +1,50 @@
# Java
*.class
*~

# OS X
.DS_Store

# Sensitive configuration files that should never be committed
conf/applicationContext-mail.xml
conf/csx.config.properties
crawler/cdi/black_list.dat
crawler/cdi/runconfig.py
crawler/cdi/settings.py
web/citeseerx_oaiwebapp/WEB-INF/csx-oai.config.properties
web/citeseerx_webapp/WEB-INF/csx.config.properties
src/perl/BatchExtractor/bin/batchExtract.pl
src/perl/FileConversionService/lib/FileConverter/Config.pm

#LaTeX
*.acn
*.acr
*.alg
*.aux
*.bbl
*.blg
*.dvi
*.fdb_latexmk
*.glg
*.glo
*.gls
*.idx
*.ilg
*.ind
*.ist
*.lof
*.log
*.lot
*.maf
*.mtc
*.mtc0
*.nav
*.nlo
*.out
*.pdfsync
*.ps
*.snm
*.synctex.gz
*.vrb
*.xdy
*.tdo
8 changes: 8 additions & 0 deletions README.md
@@ -0,0 +1,8 @@
# CiteSeerX

This is the source code for the CiteSeerX academic digital library.

Most of the commits were made to an old SVN repository and have since been ported to git and imported into GitHub.

The code in the master branch should always be in a production-ready state so that we can deploy it at any time, while any other experimental code or code still in development should be in separate branches and merged into the master branch via pull requests.

16 changes: 16 additions & 0 deletions bin/batchImport
@@ -0,0 +1,16 @@
#!/bin/bash
#
# Imports document data into the repository. This
# utility expects all metadata to be pre-extracted in XML
# files alongside document files.
#
# Expects one or more arguments specifying the directories
# from which to import content.
#
# This wrapper only selects the loader; the shared harness "common"
# (sourced below) starts the JVM and forwards the arguments.
#
# IGC
#

# Java class the harness runs.
LOADER="edu.psu.citeseerx.loaders.BatchIngesterLoader"
# Bootstrap file name; the harness resolves it under $CSX_HOME/bootstrap.
BOOT_FILE="updates.txt"

# "common" is referenced without a path, so it must be resolvable from
# where this script is run (normally the bin directory).
source common
16 changes: 16 additions & 0 deletions bin/batchIngestTables
@@ -0,0 +1,16 @@
#!/bin/bash
#
# Imports document data into the repository. This
# utility expects all metadata to be pre-extracted in XML
# files alongside document files.
#
# Expects one or more arguments specifying the directories
# from which to import content.
#
# NOTE(review): the description above is identical to the one in
# batchImport, but this script runs TableIngesterLoader - presumably it
# ingests extracted table data rather than whole documents. Confirm against
# the loader class and reword.
#
# IGC
#

# Java class the shared harness ("common") runs.
LOADER="edu.psu.citeseerx.loaders.TableIngesterLoader"
# Bootstrap file name; the harness resolves it under $CSX_HOME/bootstrap.
BOOT_FILE="updates.txt"

# "common" is referenced without a path, so it must be resolvable from
# where this script is run (normally the bin directory).
source common
15 changes: 15 additions & 0 deletions bin/buildChartData
@@ -0,0 +1,15 @@
#!/bin/bash
#
# Collects the data needed to build citation year histograms for documents
# and stores the collected data in the database in JSON format.
#
# By default this script collects data only for documents whose citation
# count has changed since the last run. To collect data for all documents
# in the corpus, pass "all" as a parameter.
#
# Juan Pablo Fernandez Ramirez
#
# Java class the shared harness ("common") runs.
LOADER="edu.psu.citeseerx.loaders.ChartDataBuilderLoader"
# Bootstrap file name; the harness resolves it under $CSX_HOME/bootstrap.
BOOT_FILE="updates.txt"

# "common" is referenced without a path, so it must be resolvable from
# where this script is run (normally the bin directory).
source common
12 changes: 12 additions & 0 deletions bin/buildCharts
@@ -0,0 +1,12 @@
#!/bin/bash
#
# Builds citation year histograms for documents, storing
# the output alongside documents in the repository.
#
# IGC
#

# Java class the shared harness ("common") runs.
LOADER="edu.psu.citeseerx.loaders.CiteChartLoader"
# Bootstrap file name; the harness resolves it under $CSX_HOME/bootstrap.
BOOT_FILE="updates.txt"

# "common" is referenced without a path, so it must be resolvable from
# where this script is run (normally the bin directory).
source common
14 changes: 14 additions & 0 deletions bin/clusterAll
@@ -0,0 +1,14 @@
#!/bin/bash
#
# Clusters all documents and citations in the database.
# NOTE: This is only required if you want to re-cluster
# the data - do not run this without first destroying
# and rebuilding the csx_citegraph database!
#
# IGC
#

# Java class the shared harness ("common") runs.
LOADER="edu.psu.citeseerx.loaders.ClusterLoader"
# Bootstrap file name; the harness resolves it under $CSX_HOME/bootstrap.
BOOT_FILE="updates.txt"

# "common" is referenced without a path, so it must be resolvable from
# where this script is run (normally the bin directory).
source common
57 changes: 57 additions & 0 deletions bin/common
@@ -0,0 +1,57 @@
#
# CSX Utility Harness
#
# Shared launcher for the utilities in this directory. It is meant to be
# sourced (not executed) by a wrapper script that has already set:
#
#   LOADER     fully qualified name of the Java class to run
#   BOOT_FILE  file name, resolved below under $CSX_HOME/bootstrap
#
# The wrapper's command-line arguments are forwarded to the Java class.
#
# Environment variables honoured if already set:
#   JAVA_HOME  JDK to use (falls back to DEFAULT_JAVA_HOME)
#   CSX_HOME   installation root (falls back to the parent of the cwd)
#   JAVA_OPTS  JVM options (falls back to "-Xms1g -Xmx1g")
#   CLASSPATH  appended to the generated classpath
#
# IGC
#

DEFAULT_JAVA_HOME="/usr/java/jdk1.6.0_03_x64"

# Allow JAVA_HOME default - should be set by user.
if [ -z "${JAVA_HOME:-}" ]; then
    JAVA_HOME="$DEFAULT_JAVA_HOME"
fi

if [ -z "${CSX_HOME:-}" ]; then
    # If the CSX_HOME variable is not set, let's assume
    # this script is being called from the bin directory
    # of the utility distribution, i.e. the installation
    # root is the parent of the current directory.
    CSX_HOME="$(dirname "$(pwd)")"
    export CSX_HOME
fi

export CSX_LIB="$CSX_HOME/lib"
export CSX_BOOT="$CSX_HOME/bootstrap/$BOOT_FILE"
export CSX_CONF="$CSX_HOME/conf"
export CSX_DIST="$CSX_HOME/dist"

# Only supply default JVM options when the caller has not chosen any.
# Wrappers that export JAVA_OPTS before sourcing this file would otherwise
# have their settings silently replaced.
if [ -z "${JAVA_OPTS:-}" ]; then
    export JAVA_OPTS="-Xms1g -Xmx1g"
fi

# Set the Java classpath: the conf directory first, then every entry of
# $CSX_DIST/lib, then every jar in $CSX_LIB. Globs are used instead of
# parsing `ls` so that names containing whitespace stay intact.
CSX_CLASSPATH="$CSX_CONF"
for file in "$CSX_DIST/lib"/*; do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "$file" ] || continue
    CSX_CLASSPATH="$CSX_CLASSPATH:$file"
done
for file in "$CSX_LIB"/*.jar; do
    [ -e "$file" ] || continue
    CSX_CLASSPATH="$CSX_CLASSPATH:$file"
done

export CSX_CLASSPATH="$CSX_CLASSPATH:$CLASSPATH"

echo "$CSX_CONF"

# JAVA_OPTS is deliberately left unquoted so that it splits into separate
# JVM options; "$@" is quoted so each wrapper argument is passed unchanged.
"$JAVA_HOME/bin/java" -cp "$CSX_CLASSPATH" \
    $JAVA_OPTS \
    "-Dcsx.conf=$CSX_CONF" "-Dcsx.boot=$CSX_BOOT" \
    "$LOADER" "$@"
16 changes: 16 additions & 0 deletions bin/correctAuthors
@@ -0,0 +1,16 @@
#!/bin/bash
#
# Runs the CorrectAuthorsLoader through the shared harness ("common").
#
# NOTE(review): the header previously found here was a verbatim copy of
# the genHomePageStats description (home page statistics written to a
# "stats" output directory) and does not match the loader configured
# below. Presumably this utility corrects author metadata; confirm against
# edu.psu.citeseerx.loaders.CorrectAuthorsLoader and document its arguments.
#
# Juan Pablo Fernandez Ramirez
#

# Java class the harness runs.
LOADER="edu.psu.citeseerx.loaders.CorrectAuthorsLoader"
# Bootstrap file name; the harness resolves it under $CSX_HOME/bootstrap.
BOOT_FILE="updates.txt"

# "common" is referenced without a path, so it must be resolvable from
# where this script is run (normally the bin directory).
source common
85 changes: 85 additions & 0 deletions bin/createXML.pl
@@ -0,0 +1,85 @@
#!/usr/bin/perl -CSD
use strict;
use Encode;
use File::Spec;


# Repository name written into the <repository> element of every
# generated XML file.
my $repository = "rep1";

# The single required argument is the directory that holds the extracted
# files (*.txt, *.file, *.header, *.parscit, ...).
my ($importDir) = @ARGV;
if (!defined $importDir || $importDir eq '') {
    # A missing argument is a usage error: report it on STDERR and exit
    # with a non-zero status so calling scripts can detect the failure.
    print STDERR "Usage: $0 importDir\n";
    exit 1;
}

# Returns the file-name component of a path with everything from the
# FIRST dot onwards removed ("dir/a.b.txt" yields "a"). A name without a
# dot is returned unchanged.
sub fileWithoutExtension($) {
    my ($path) = @_;

    # splitpath returns (volume, directories, file); only the file is used.
    my $base = (File::Spec->splitpath($path))[2];

    $base =~ s/(.*?)\.(.*)$/$1/;
    return $base;
}

# Build one "<docID>.xml" file for every extracted document in $importDir.
#
# A document is processed when "<docID>.txt" has a sibling original file
# (.pdf, .PDF, .PS or .ps - the first one found wins) and the three
# metadata files "<docID>.file", "<docID>.header" and "<docID>.parscit"
# can all be opened. The generated XML wraps, in this order:
#   <fileInfo>  repository, file paths and the content of "<docID>.file"
#               (XML declaration lines dropped, "checksum" -> "checkSum")
#   the content of "<docID>.header"
#   the content of "<docID>.parscit"
#
# Documents without an original file are skipped silently; documents with
# an unreadable metadata file are skipped with a warning on STDERR.
# Failing to write the XML file is fatal.
#
# NOTE(review): paths are inserted into the XML without escaping, so a
# path containing "&" or "<" yields malformed XML - confirm whether the
# consumer needs them escaped.
DOCUMENT:
foreach my $textFile (<$importDir/*.txt>) {
    # Guard kept from the original: glob may hand back names that do not
    # exist on disk.
    if (! -e $textFile) {
        print "$textFile does not exist !\n";
        next DOCUMENT;
    }
    my $docID = fileWithoutExtension($textFile);

    # Locate the original document next to the text file. The order of
    # the extensions is the order of preference.
    my $filePath;
    foreach my $ext (qw(pdf PDF PS ps)) {
        (my $candidate = $textFile) =~ s/\.txt$/.$ext/;
        if (-e $candidate) {
            $filePath = $candidate;
            last;
        }
    }
    next DOCUMENT unless defined $filePath;    # no pdf or ps for this text

    my $xml = "<document id=\"unset\">\n";

    # <fileInfo>: fixed fields followed by the content of "<docID>.file".
    my $fileInfoPath = "$importDir/$docID.file";
    my $fileInfoFh;
    if (!open($fileInfoFh, "<:utf8", $fileInfoPath)) {
        warn "$docID: skipped, cannot read $fileInfoPath: $!\n";
        next DOCUMENT;
    }
    $xml .= "<fileInfo>\n";
    $xml .= "<repository>$repository</repository>\n";
    $xml .= "<filePath>$filePath</filePath>\n";
    $xml .= "<bodyFile>$importDir/$docID.body</bodyFile>\n";
    $xml .= "<citeFile>$importDir/$docID.cite</citeFile>\n";
    while (my $line = <$fileInfoFh>) {
        # The fragment is embedded in a larger document, so its own XML
        # declaration must not be copied.
        next if $line =~ m/xml version/;
        $line =~ s/checksum/checkSum/g;
        $xml .= $line;
    }
    close $fileInfoFh;
    $xml .= "</fileInfo>\n";

    # Header and citation metadata are appended verbatim, in this order.
    foreach my $suffix (qw(header parscit)) {
        my $partPath = "$importDir/$docID.$suffix";
        my $partFh;
        if (!open($partFh, "<:utf8", $partPath)) {
            warn "$docID: skipped, cannot read $partPath: $!\n";
            next DOCUMENT;
        }
        while (my $line = <$partFh>) {
            $xml .= $line;
        }
        close $partFh;
    }

    $xml .= "</document>\n";

    # Buffered write errors only surface at print/close time, so both are
    # checked.
    my $xmlPath = "$importDir/$docID.xml";
    open(my $xmlFh, ">:utf8", $xmlPath)
        or die "$docID: could not open xml file for writing: $!";
    print {$xmlFh} $xml
        or die "$docID: could not write $xmlPath: $!";
    close $xmlFh
        or die "$docID: could not close $xmlPath: $!";
}
16 changes: 16 additions & 0 deletions bin/genHomePageStats
@@ -0,0 +1,16 @@
#!/bin/bash
#
# Generates home page statistics.
#
# A path may be specified to use as the output directory
# for the generated stats data. Otherwise, a default
# of "stats" will be used or a custom default specified
# in configuration.
#
# Juan Pablo Fernandez Ramirez
#

# Java class the shared harness ("common") runs.
LOADER="edu.psu.citeseerx.loaders.HomePageStatisticsGeneratorLoader"
# Bootstrap file name; the harness resolves it under $CSX_HOME/bootstrap.
BOOT_FILE="updates.txt"

# "common" is referenced without a path, so it must be resolvable from
# where this script is run (normally the bin directory).
source common
16 changes: 16 additions & 0 deletions bin/genStats
@@ -0,0 +1,16 @@
#!/bin/bash
#
# Generates citation, document, and author statistics files.
#
# A path may be specified to use as the output directory
# for the generated stats data. Otherwise, a default
# of "stats" will be used or a custom default specified
# in configuration.
#
# IGC
#

# Java class the shared harness ("common") runs.
LOADER="edu.psu.citeseerx.loaders.StatisticsGeneratorLoader"
# Bootstrap file name; the harness resolves it under $CSX_HOME/bootstrap.
BOOT_FILE="updates.txt"

# "common" is referenced without a path, so it must be resolvable from
# where this script is run (normally the bin directory).
source common
10 changes: 10 additions & 0 deletions bin/legacyImport
@@ -0,0 +1,10 @@
#!/bin/bash
#
# Runs the LegacyIngester class through the shared harness ("common").
# NOTE(review): the expected arguments are not documented here; the old
# invocation kept at the bottom of this file passed up to three.

# Java class the harness runs.
LOADER="edu.psu.citeseerx.ingestion.LegacyIngester"
# Bootstrap file name; the harness resolves it under $CSX_HOME/bootstrap.
BOOT_FILE="updates.txt"

# "common" is referenced without a path, so it must be resolvable from
# where this script is run (normally the bin directory).
source common


# Former direct invocation, kept commented out for reference:
#/usr/java/jdk1.6.0_03_x64/bin/java -DCSX_HOME=$CSX_HOME -DCSX_CONF=$CSX_CONF \
# edu.psu.citeseerx.ingestion.LegacyIngester $1 $2 $3
9 changes: 9 additions & 0 deletions bin/legacyMetadataFixer
@@ -0,0 +1,9 @@
#!/bin/bash
#
# Runs the FixMetaDataLoader through the shared harness ("common").

# Java class the harness runs.
LOADER="edu.psu.citeseerx.loaders.FixMetaDataLoader"
# Bootstrap file name; the harness resolves it under $CSX_HOME/bootstrap.
BOOT_FILE="updates.txt"

# JVM memory settings requested for this job. Whether they take effect
# depends on how "common" treats a JAVA_OPTS value that is already set.
export JAVA_OPTS=" -Xmn100M -Xms700M -Xmx2000M"

# "common" is referenced without a path, so it must be resolvable from
# where this script is run (normally the bin directory).
source common

11 changes: 11 additions & 0 deletions bin/updateExtMetadata
@@ -0,0 +1,11 @@
#!/bin/bash
# Loads metadata from external sources to custom storage.
#
# Juan Pablo Fernandez Ramirez
#
# Java class the shared harness ("common") runs.
LOADER="edu.psu.citeseerx.loaders.ExternalMetadataLoader"
# Bootstrap file name; the harness resolves it under $CSX_HOME/bootstrap.
# Note that this utility uses its own bootstrap file, not "updates.txt".
BOOT_FILE="external-updates.txt"

# JVM memory settings requested for this job. Whether they take effect
# depends on how "common" treats a JAVA_OPTS value that is already set.
export JAVA_OPTS=" -Xmn100M -Xms700M -Xmx2000M"

# "common" is referenced without a path, so it must be resolvable from
# where this script is run (normally the bin directory).
source common
9 changes: 9 additions & 0 deletions bin/updateExternalLinks
@@ -0,0 +1,9 @@
#!/bin/bash
# Loads metadata from external sources to custom storage.
#
# NOTE(review): the line above is identical to the description in
# updateExtMetadata, but this script runs ExternalLinkerLoader -
# presumably it updates links to external resources. Confirm against the
# loader class and reword.
#
# Juan Pablo Fernandez Ramirez
#
# Java class the shared harness ("common") runs.
LOADER="edu.psu.citeseerx.loaders.ExternalLinkerLoader"
# Bootstrap file name; the harness resolves it under $CSX_HOME/bootstrap.
BOOT_FILE="updates.txt"

# "common" is referenced without a path, so it must be resolvable from
# where this script is run (normally the bin directory).
source common
6 changes: 6 additions & 0 deletions bin/updateIndex
@@ -0,0 +1,6 @@
#!/bin/bash
#
# Runs the IndexUpdateLoader through the shared harness ("common").

# Java class the harness runs.
LOADER="edu.psu.citeseerx.loaders.IndexUpdateLoader"
# Bootstrap file name; the harness resolves it under $CSX_HOME/bootstrap.
BOOT_FILE="updates.txt"

# "common" is referenced without a path, so it must be resolvable from
# where this script is run (normally the bin directory).
source common
6 changes: 6 additions & 0 deletions bin/updateInference
@@ -0,0 +1,6 @@
#!/bin/bash
#
# Runs the InferenceUpdaterLoader through the shared harness ("common").

# Java class the harness runs.
LOADER="edu.psu.citeseerx.loaders.InferenceUpdaterLoader"
# Bootstrap file name; the harness resolves it under $CSX_HOME/bootstrap.
BOOT_FILE="updates.txt"

# "common" is referenced without a path, so it must be resolvable from
# where this script is run (normally the bin directory).
source common
6 changes: 6 additions & 0 deletions bin/updateMCSUserIndex
@@ -0,0 +1,6 @@
#!/bin/bash
#
# Runs the myciteseer UserIndexUpdateLoader through the shared harness
# ("common").

# Java class the harness runs.
LOADER="edu.psu.citeseerx.myciteseer.loaders.UserIndexUpdateLoader"
# Bootstrap file name; the harness resolves it under $CSX_HOME/bootstrap.
BOOT_FILE="updates.txt"

# "common" is referenced without a path, so it must be resolvable from
# where this script is run (normally the bin directory).
source common

0 comments on commit e7e44b4

Please sign in to comment.