From 2cd1d7dc1520f2809f845c7d0c8014092b06c7cc Mon Sep 17 00:00:00 2001 From: mtkachenko Date: Tue, 4 Dec 2018 14:27:49 +0800 Subject: [PATCH] Migrated --- data/amazon.csv | 61 ++++++ data/amazon_extended.csv | 61 ++++++ data/camera.csv | 13 ++ data/icecream.csv | 31 +++ data/icecream_raw.csv | 31 +++ pom.xml | 124 +++++++++++ .../preferred/regression/ApplyRegression.java | 66 ++++++ .../java/ai/preferred/regression/Command.java | 58 +++++ .../regression/EvaluateRegression.java | 68 ++++++ .../ai/preferred/regression/PlotData.java | 44 ++++ .../regression/PlotLinearRegression.java | 67 ++++++ .../preferred/regression/PrintRegression.java | 110 ++++++++++ .../java/ai/preferred/regression/Shell.java | 146 ++++++++++++ .../regression/TrainLinearRegression.java | 54 +++++ .../regression/TrainLogisticRegression.java | 62 ++++++ .../ai/preferred/regression/WekaUtils.java | 25 +++ .../regression/exercise/E00_IceCream.java | 23 ++ .../E01_MyFirstRegressionWithIceCream.java | 24 ++ .../exercise/E02_ReadingRegression.java | 24 ++ .../regression/exercise/E03_RawIceCream.java | 25 +++ .../E04_RegressionForTemperature.java | 20 ++ .../regression/exercise/E05_TryX2Only.java | 23 ++ .../exercise/E06_TryX1AndX2AndX3.java | 22 ++ .../regression/exercise/E07_AmazonText.java | 23 ++ .../regression/exercise/E08_AmazonCheap.java | 22 ++ .../exercise/E09_AmazonExpensive.java | 19 ++ .../exercise/E10_AmazonYourOwnWord.java | 20 ++ .../exercise/E11_CameraCategories.java | 23 ++ .../exercise/E12_CameraWithAutoFocus.java | 23 ++ .../E13_ShuffleAndPartitionIceCream.java | 26 +++ .../regression/exercise/E14_TrainTest.java | 23 ++ .../regression/exercise/E15_TestX123.java | 21 ++ .../exercise/E16_AmazonTrainTest.java | 21 ++ .../exercise/E17_AmazonLogistic.java | 22 ++ .../exercise/E18_LogisticRidgeRegression.java | 23 ++ .../exercise/E19_LinearRidgeRegression.java | 22 ++ .../regression/exercise/E20_GrandFinale.java | 19 ++ .../regression/io/ARFFDataReader.java | 113 ++++++++++ 
.../preferred/regression/io/CSVInputData.java | 132 +++++++++++ .../ai/preferred/regression/io/CSVUtils.java | 35 +++ .../ai/preferred/regression/pe/AddX2.java | 40 ++++ .../ai/preferred/regression/pe/AddX3.java | 41 ++++ .../ai/preferred/regression/pe/Dummy.java | 42 ++++ .../regression/pe/EncodeTextAsFrequency.java | 121 ++++++++++ .../regression/pe/EncodeValueAsOneHot.java | 61 ++++++ .../ai/preferred/regression/pe/Partition.java | 42 ++++ .../regression/pe/ProcessingElement.java | 88 ++++++++ .../regression/pe/ProjectColumns.java | 61 ++++++ .../preferred/regression/pe/RemoveColumn.java | 37 ++++ .../preferred/regression/pe/SelectEquals.java | 39 ++++ .../ai/preferred/regression/pe/Shuffle.java | 36 +++ .../preferred/regression/pe/SwapColumns.java | 41 ++++ .../regression/pe/data/Vocabulary.java | 50 +++++ .../ai/preferred/regression/plot/XYChart.java | 41 ++++ .../preferred/regression/reset/DataFiles.java | 207 ++++++++++++++++++ src/main/resources/log4j.properties | 22 ++ src/test/java/.gitkeep | 0 src/test/resources/.gitkeep | 0 temp/.gitkeep | 0 utils/assemble.py | 38 ++++ 60 files changed, 2776 insertions(+) create mode 100644 data/amazon.csv create mode 100644 data/amazon_extended.csv create mode 100644 data/camera.csv create mode 100644 data/icecream.csv create mode 100644 data/icecream_raw.csv create mode 100644 pom.xml create mode 100644 src/main/java/ai/preferred/regression/ApplyRegression.java create mode 100644 src/main/java/ai/preferred/regression/Command.java create mode 100644 src/main/java/ai/preferred/regression/EvaluateRegression.java create mode 100644 src/main/java/ai/preferred/regression/PlotData.java create mode 100644 src/main/java/ai/preferred/regression/PlotLinearRegression.java create mode 100644 src/main/java/ai/preferred/regression/PrintRegression.java create mode 100644 src/main/java/ai/preferred/regression/Shell.java create mode 100644 src/main/java/ai/preferred/regression/TrainLinearRegression.java create mode 100644 
src/main/java/ai/preferred/regression/TrainLogisticRegression.java create mode 100644 src/main/java/ai/preferred/regression/WekaUtils.java create mode 100644 src/main/java/ai/preferred/regression/exercise/E00_IceCream.java create mode 100644 src/main/java/ai/preferred/regression/exercise/E01_MyFirstRegressionWithIceCream.java create mode 100644 src/main/java/ai/preferred/regression/exercise/E02_ReadingRegression.java create mode 100644 src/main/java/ai/preferred/regression/exercise/E03_RawIceCream.java create mode 100644 src/main/java/ai/preferred/regression/exercise/E04_RegressionForTemperature.java create mode 100644 src/main/java/ai/preferred/regression/exercise/E05_TryX2Only.java create mode 100644 src/main/java/ai/preferred/regression/exercise/E06_TryX1AndX2AndX3.java create mode 100644 src/main/java/ai/preferred/regression/exercise/E07_AmazonText.java create mode 100644 src/main/java/ai/preferred/regression/exercise/E08_AmazonCheap.java create mode 100644 src/main/java/ai/preferred/regression/exercise/E09_AmazonExpensive.java create mode 100644 src/main/java/ai/preferred/regression/exercise/E10_AmazonYourOwnWord.java create mode 100644 src/main/java/ai/preferred/regression/exercise/E11_CameraCategories.java create mode 100644 src/main/java/ai/preferred/regression/exercise/E12_CameraWithAutoFocus.java create mode 100644 src/main/java/ai/preferred/regression/exercise/E13_ShuffleAndPartitionIceCream.java create mode 100644 src/main/java/ai/preferred/regression/exercise/E14_TrainTest.java create mode 100644 src/main/java/ai/preferred/regression/exercise/E15_TestX123.java create mode 100644 src/main/java/ai/preferred/regression/exercise/E16_AmazonTrainTest.java create mode 100644 src/main/java/ai/preferred/regression/exercise/E17_AmazonLogistic.java create mode 100644 src/main/java/ai/preferred/regression/exercise/E18_LogisticRidgeRegression.java create mode 100644 src/main/java/ai/preferred/regression/exercise/E19_LinearRidgeRegression.java create mode 100644 
src/main/java/ai/preferred/regression/exercise/E20_GrandFinale.java create mode 100644 src/main/java/ai/preferred/regression/io/ARFFDataReader.java create mode 100644 src/main/java/ai/preferred/regression/io/CSVInputData.java create mode 100644 src/main/java/ai/preferred/regression/io/CSVUtils.java create mode 100644 src/main/java/ai/preferred/regression/pe/AddX2.java create mode 100644 src/main/java/ai/preferred/regression/pe/AddX3.java create mode 100644 src/main/java/ai/preferred/regression/pe/Dummy.java create mode 100644 src/main/java/ai/preferred/regression/pe/EncodeTextAsFrequency.java create mode 100644 src/main/java/ai/preferred/regression/pe/EncodeValueAsOneHot.java create mode 100644 src/main/java/ai/preferred/regression/pe/Partition.java create mode 100644 src/main/java/ai/preferred/regression/pe/ProcessingElement.java create mode 100644 src/main/java/ai/preferred/regression/pe/ProjectColumns.java create mode 100644 src/main/java/ai/preferred/regression/pe/RemoveColumn.java create mode 100644 src/main/java/ai/preferred/regression/pe/SelectEquals.java create mode 100644 src/main/java/ai/preferred/regression/pe/Shuffle.java create mode 100644 src/main/java/ai/preferred/regression/pe/SwapColumns.java create mode 100644 src/main/java/ai/preferred/regression/pe/data/Vocabulary.java create mode 100644 src/main/java/ai/preferred/regression/plot/XYChart.java create mode 100644 src/main/java/ai/preferred/regression/reset/DataFiles.java create mode 100644 src/main/resources/log4j.properties create mode 100644 src/test/java/.gitkeep create mode 100644 src/test/resources/.gitkeep create mode 100644 temp/.gitkeep create mode 100644 utils/assemble.py diff --git a/data/amazon.csv b/data/amazon.csv new file mode 100644 index 0000000..b6caca8 --- /dev/null +++ b/data/amazon.csv @@ -0,0 +1,61 @@ +Id,Rating,Text +1,5,I only spent less than ten on these so they're good for what I paid for +2,5,I'm in love with these glasses. +3,5,Stylish. 
My kid loved them +4,5,They came in great condition. +5,5,These are really wonderful! +6,5,these are GREAT quality +7,5,She LOVES them! +8,5,Love these. +9,5,The quality is pretty good also. +10,5,EXCELLENT PRODUCT +11,5,I love them. Exactly what i wanted. +12,5,Son love them +13,5,He says they give him that style. +14,5,Great value!!! +15,5,Very complimentary! +16,5,"Cute, great quality, good fit." +17,5,I love these glasses!! +18,5,they fit perfectly. +19,5,They look expensive and the fit is perfect +20,5,Sturdy and good looking for a great price +21,5,Very stylish! Great accessory to compliment an outfit +22,5,Thanks so much my grandson enjoy them. +23,5,Daughter loves them. +24,5,Makes me look smarter in my tinder profile ! +25,1,the side arms keep breaking +26,1,just look soooo cheap! +27,1,Not my style. +28,1,Mine arrived broken!! Not worth sending back. +29,1,Dollar store quality. +30,1,Not like picture. +31,1,We're cheap and broke right away.  +32,1,"These are so cheap looking, they are unwearable." +33,1,Very Very VERY Round ! Not at all vintage . +34,1,It's a peace of garbage. Feels so cheap and plastic. +35,1,feel flimsy like it would break i returned it the next day +36,1,lens have too much glare +37,1,they look cheaply made and plastic +38,1,Very cheap looking +39,1,make me headache +40,1,Feel apart after a week of getting them prescribed. +41,1,Really cheap looking. +42,1,Glasses are crooked and not made correctly. +43,1,poorly made... broke after three days +44,1,Sunglasses were very small. +45,1,Look fake and cheap +46,1,lens fell out on first day. +47,1,Not really like it! +48,1,Delivered broken. +49,1,"Overall, trash." +50,1,"Were broken when I opened the box, very disappointed" +51,1,Horrible lens fell out 2nd day! +52,1,I don't like them. +53,1,These hoes broke too I want my money +54,1,Broke within the 3 days +55,1,Little small but still good +56,1,Super small +57,1,Horrible desing +58,1,It's broke +59,1,Crooked and cheaply made. 
+60,1,Poor quality diff --git a/data/amazon_extended.csv b/data/amazon_extended.csv new file mode 100644 index 0000000..86f9029 --- /dev/null +++ b/data/amazon_extended.csv @@ -0,0 +1,61 @@ +Id,Rating,Text,Verified Purchase,Helpful +1,5,I only spent less than ten on these so they're good for what I paid for,YES,11 +2,5,I'm in love with these glasses.,YES,2 +3,5,Stylish. My kid loved them,YES,2 +4,5,They came in great condition.,NO,0 +5,5,These are really wonderful!,YES,0 +6,5,these are GREAT quality,YES,0 +7,5,She LOVES them!,YES,0 +8,5,Love these.,YES,0 +9,5,The quality is pretty good also.,YES,3 +10,5,EXCELLENT PRODUCT,YES,0 +11,5,I love them. Exactly what i wanted.,NO,0 +12,5,Son love them,YES,0 +13,5,He says they give him that style.,YES,0 +14,5,Great value!!!,YES,0 +15,5,Very complimentary!,YES,0 +16,5,"Cute, great quality, good fit.",YES,1 +17,5,I love these glasses!!,YES,0 +18,5,they fit perfectly.,YES,0 +19,5,They look expensive and the fit is perfect,NO,0 +20,5,Sturdy and good looking for a great price,YES,0 +21,5,Very stylish! Great accessory to compliment an outfit,YES,0 +22,5,Thanks so much my grandson enjoy them.,YES,0 +23,5,Daughter loves them.,YES,0 +24,5,Makes me look smarter in my tinder profile !,YES,0 +25,1,the side arms keep breaking,YES,0 +26,1,just look soooo cheap!,NO,0 +27,1,Not my style.,NO,0 +28,1,Mine arrived broken!! Not worth sending back.,YES,0 +29,1,Dollar store quality.,YES,5 +30,1,Not like picture.,NO,0 +31,1,We're cheap and broke right away. ,YES,0 +32,1,"These are so cheap looking, they are unwearable.",YES,0 +33,1,Very Very VERY Round ! Not at all vintage .,YES,0 +34,1,It's a peace of garbage. 
Feels so cheap and plastic.,YES,0 +35,1,feel flimsy like it would break i returned it the next day,NO,0 +36,1,lens have too much glare,YES,0 +37,1,they look cheaply made and plastic,YES,0 +38,1,Very cheap looking,NO,0 +39,1,make me headache,YES,0 +40,1,Feel apart after a week of getting them prescribed.,NO,0 +41,1,Really cheap looking.,YES,0 +42,1,Glasses are crooked and not made correctly.,YES,7 +43,1,poorly made... broke after three days,YES,2 +44,1,Sunglasses were very small.,NO,0 +45,1,Look fake and cheap,YES,0 +46,1,lens fell out on first day.,NO,0 +47,1,Not really like it!,YES,2 +48,1,Delivered broken.,YES,0 +49,1,"Overall, trash.",YES,0 +50,1,"Were broken when I opened the box, very disappointed",NO,0 +51,1,Horrible lens fell out 2nd day!,YES,5 +52,1,I don't like them.,NO,0 +53,1,These hoes broke too I want my money,NO,0 +54,1,Broke within the 3 days,NO,0 +55,1,Little small but still good,NO,4 +56,1,Super small,NO,0 +57,1,Horrible desing,NO,0 +58,1,It's broke,NO,0 +59,1,Crooked and cheaply made.,NO,0 +60,1,Poor quality,YES,0 diff --git a/data/camera.csv b/data/camera.csv new file mode 100644 index 0000000..f34f56b --- /dev/null +++ b/data/camera.csv @@ -0,0 +1,13 @@ +Id,Price (USD),Type,Focus +1,949,MIRRORLESS,MANUAL +2,99,DSLR,BOTH +3,90,DSLR,BOTH +4,80,DSLR,AUTO +5,20,COMPACT,MANUAL +6,50,COMPACT,AUTO +7,49,COMPACT,AUTO +8,30,COMPACT,AUTO +9,800,MIRRORLESS,AUTO +10,789,MIRRORLESS,MANUAL +11,35,COMPACT,AUTO +12,789,MIRRORLESS,BOTH diff --git a/data/icecream.csv b/data/icecream.csv new file mode 100644 index 0000000..2832a4d --- /dev/null +++ b/data/icecream.csv @@ -0,0 +1,31 @@ +Consumption,Temperature +0.386,5.00 +0.374,13.33 +0.393,17.22 +0.425,20.00 +0.406,20.56 +0.344,18.33 +0.327,16.11 +0.288,8.33 +0.269,0.00 +0.256,-4.44 +0.286,-2.22 +0.298,-3.33 +0.329,0.00 +0.318,4.44 +0.381,12.78 +0.381,17.22 +0.47,22.22 +0.443,22.22 +0.386,19.44 +0.342,15.56 +0.319,6.67 +0.307,4.44 +0.284,0.00 +0.326,-2.78 +0.309,-2.22 +0.359,0.56 +0.376,5.00 +0.416,11.11 
+0.437,17.78 +0.548,21.67 diff --git a/data/icecream_raw.csv b/data/icecream_raw.csv new file mode 100644 index 0000000..04e3f08 --- /dev/null +++ b/data/icecream_raw.csv @@ -0,0 +1,31 @@ +Id,Temperature,Consumption +1,5.00,0.386 +2,13.33,0.374 +3,17.22,0.393 +4,20.00,0.425 +5,20.56,0.406 +6,18.33,0.344 +7,16.11,0.327 +8,8.33,0.288 +9,0.00,0.269 +10,-4.44,0.256 +11,-2.22,0.286 +12,-3.33,0.298 +13,0.00,0.329 +14,4.44,0.318 +15,12.78,0.381 +16,17.22,0.381 +17,22.22,0.47 +18,22.22,0.443 +19,19.44,0.386 +20,15.56,0.342 +21,6.67,0.319 +22,4.44,0.307 +23,0.00,0.284 +24,-2.78,0.326 +25,-2.22,0.309 +26,0.56,0.359 +27,5.00,0.376 +28,11.11,0.416 +29,17.78,0.437 +30,21.67,0.548 diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..5c0ec4c --- /dev/null +++ b/pom.xml @@ -0,0 +1,124 @@ + + + 4.0.0 + + ai.preferred + regression-analysis + 1.0-SNAPSHOT + + + UTF-8 + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.6.0 + + 1.8 + 1.8 + + + + maven-surefire-plugin + 2.22.0 + + + org.junit.platform + junit-platform-surefire-provider + 1.2.0 + + + org.junit.jupiter + junit-jupiter-engine + 5.2.0 + + + + + org.codehaus.mojo + exec-maven-plugin + 1.6.0 + + + + java + + + + + + org.apache.maven.plugins + maven-shade-plugin + 3.1.0 + + + package + + shade + + + false + ${project.artifactId}-${project.version}-jar-with-dependencies + + + + + + + + + src/main/resources + + + + + + src/test/resources + + + + + + + org.slf4j + slf4j-log4j12 + [1.7,1.8) + + + nz.ac.waikato.cms.weka + weka-stable + 3.8.3 + + + org.apache.commons + commons-csv + 1.6 + + + args4j + args4j + 2.33 + + + org.apache.commons + commons-lang3 + 3.8.1 + + + com.google.guava + guava + 19.0 + + + org.jfree + jfreechart + 1.5.0 + + + + \ No newline at end of file diff --git a/src/main/java/ai/preferred/regression/ApplyRegression.java b/src/main/java/ai/preferred/regression/ApplyRegression.java new file mode 100644 index 0000000..2041ed6 --- /dev/null +++ 
b/src/main/java/ai/preferred/regression/ApplyRegression.java @@ -0,0 +1,66 @@ +package ai.preferred.regression; + +import ai.preferred.regression.io.ARFFDataReader; +import ai.preferred.regression.io.CSVInputData; +import ai.preferred.regression.io.CSVUtils; +import org.apache.commons.csv.CSVPrinter; +import org.kohsuke.args4j.Option; +import weka.classifiers.Classifier; +import weka.core.Instances; +import weka.core.SerializationHelper; + +import java.io.File; +import java.io.FileInputStream; +import java.util.ArrayList; + +public class ApplyRegression extends Command { + + @Option(name = "-s", aliases = {"--train"}, usage = "the path to the training data", required = true) + private File train; + + @Option(name = "-i", aliases = {"--test"}, usage = "the path to the testing data", required = true) + private File test; + + @Option(name = "-o", aliases = {"--output"}, usage = "the path to the output CSV file", required = true) + private File output; + + @Option(name = "-m", aliases = {"--model"}, usage = "the path to the model file", required = true) + private File model; + + @Option(name = "-h", aliases = {"--header"}, usage = "specifies if the input CSV files have headers") + private boolean header = true; + + @Override + protected void exec() throws Exception { + try (final FileInputStream stream = new FileInputStream(model)) { + final Classifier classifier = (Classifier) SerializationHelper.read(stream); + final boolean nominal = WekaUtils.isLogisticClassifier(classifier); + + final ARFFDataReader reader = new ARFFDataReader(train, nominal, header); + final Instances data = reader.read(test); + + try (final CSVPrinter printer = CSVUtils.printer(output); + final CSVInputData csvData = CSVUtils.reader(test, header)) { + if (csvData.hasHeader()) { + printer.printRecord(csvData.getHeader()); + } + + int index = 0; /* cursor into data; advanced once per CSV record so each row is scored with its own instance (assumes reader.read(test) keeps the CSV row order - confirm) */ + for (final ArrayList record : csvData) { + final double prediction = classifier.classifyInstance(data.get(index++)); + if (nominal) { + record.set(0, 
data.classAttribute().value((int) prediction)); + } else { + record.set(0, String.valueOf(prediction)); + } + printer.printRecord(record); + } + } + } + } + + public static void main(String[] args) { + parseArgsAndRun(ApplyRegression.class, args); + } + +} diff --git a/src/main/java/ai/preferred/regression/Command.java b/src/main/java/ai/preferred/regression/Command.java new file mode 100644 index 0000000..ca84173 --- /dev/null +++ b/src/main/java/ai/preferred/regression/Command.java @@ -0,0 +1,58 @@ +package ai.preferred.regression; + +import org.kohsuke.args4j.CmdLineException; +import org.kohsuke.args4j.CmdLineParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public abstract class Command { + + private static final Logger LOGGER = LoggerFactory.getLogger(Command.class); + + protected abstract void exec() throws Exception; + + protected static void parseArgsAndRun(Class clazz, String[] args) { + Command command = null; + try { + command = clazz.newInstance(); + } catch (IllegalAccessException | InstantiationException e) { + System.err.println("Please check if there is the public default constructor for the class: " + clazz.getCanonicalName()); + System.exit(1); + } + + if (args == null) { + System.out.println("=========== HELP ==========="); + System.out.println(); + System.out.println("Processing Element: " + clazz.getSimpleName() + ".class"); + System.out.println(); + System.out.println("Shell.run(" + clazz.getSimpleName() + ".class, \"\");"); + final CmdLineParser parser = new CmdLineParser(command); + System.out.println(); + parser.printUsage(System.out); + System.out.println(); + System.out.println("============================"); + System.out.println(); + System.out.println(); + return; + } + + final CmdLineParser parser = new CmdLineParser(command); + try { + parser.parseArgument(args); + } catch (CmdLineException e) { + System.err.println("Command: " + clazz.getCanonicalName()); + System.err.println(e.getMessage()); + 
System.err.println(); + parser.printUsage(System.err); + System.exit(1); + } + + try { + command.exec(); + } catch (Exception e) { + LOGGER.error("Unable to execute command (" + clazz.getCanonicalName() + "): ", e); + System.exit(1); + } + } + +} diff --git a/src/main/java/ai/preferred/regression/EvaluateRegression.java b/src/main/java/ai/preferred/regression/EvaluateRegression.java new file mode 100644 index 0000000..3bd327a --- /dev/null +++ b/src/main/java/ai/preferred/regression/EvaluateRegression.java @@ -0,0 +1,68 @@ +package ai.preferred.regression; + +import ai.preferred.regression.io.ARFFDataReader; +import org.kohsuke.args4j.Option; +import weka.classifiers.Classifier; +import weka.classifiers.evaluation.Evaluation; +import weka.core.Instances; +import weka.core.SerializationHelper; + +import java.io.File; +import java.io.FileInputStream; + +public class EvaluateRegression extends Command { + + @Option(name = "-s", aliases = {"--train"}, usage = "the path to the training data", required = true) + private File train; + + @Option(name = "-i", aliases = {"--test"}, usage = "the path to the testing data", required = true) + private File test; + + @Option(name = "-m", aliases = {"--model"}, usage = "the path to the model file", required = true) + private File model; + + @Option(name = "-v", aliases = {"--verbose"}, usage = "verbosity level (0 - short, 1 - default, 2 - detailed)") + private int verbose = 1; + + @Option(name = "-h", aliases = {"--header"}, usage = "specifies if the input CSV files have headers") + private boolean header = true; + + @Override + protected void exec() throws Exception { + try (final FileInputStream stream = new FileInputStream(model)) { + final Classifier classifier = (Classifier) SerializationHelper.read(stream); + final boolean nominal = WekaUtils.isLogisticClassifier(classifier); + + final ARFFDataReader reader = new ARFFDataReader(train, nominal, header); + final Instances data = reader.read(test); + + final Evaluation eval = 
new Evaluation(data); + eval.evaluateModel(classifier, data); + if (nominal) { + if (verbose <= 0) { + System.out.println(eval.pctCorrect()); + } else if (verbose == 1) { + System.out.println("ACCURACY = " + eval.pctCorrect()); + } else { + System.out.println(); + System.out.println("CLASS\tPRECISION\tRECALL\tF-MEASURE"); + for (int i = 0; i < data.classAttribute().numValues(); i++) { + System.out.printf("%s\t%f\t%f\t%f", data.classAttribute().value(i), eval.precision(i), eval.recall(i), eval.fMeasure(i)); + System.out.println(); + } + } + } else { + if (verbose <= 0) { + System.out.println(eval.rootMeanSquaredError()); + } else { + System.out.println("RMSE = " + eval.rootMeanSquaredError()); + } + } + } + } + + public static void main(String[] args) { + parseArgsAndRun(EvaluateRegression.class, args); + } + +} diff --git a/src/main/java/ai/preferred/regression/PlotData.java b/src/main/java/ai/preferred/regression/PlotData.java new file mode 100644 index 0000000..d3a0b71 --- /dev/null +++ b/src/main/java/ai/preferred/regression/PlotData.java @@ -0,0 +1,44 @@ +package ai.preferred.regression; + +import ai.preferred.regression.io.ARFFDataReader; +import ai.preferred.regression.plot.XYChart; +import org.jfree.data.xy.XYSeries; +import org.kohsuke.args4j.Option; +import weka.core.Instance; +import weka.core.Instances; + +import javax.swing.*; +import java.io.File; + +public class PlotData extends Command { + + @Option(name = "-i", aliases = {"--input"}, usage = "the path to the input CSV file", required = true) + private File input; + + @Option(name = "-n", aliases = {"--name"}, usage = "the name of the plot") + private String name = "DATA"; + + @Option(name = "-h", aliases = {"--header"}, usage = "specifies if the input CSV files have headers") + private boolean header = true; + + @Override + protected void exec() throws Exception { + final ARFFDataReader reader = new ARFFDataReader(input, false, header); + final Instances data = reader.read(input); + + final XYSeries 
dataSeries = new XYSeries("DATA"); + for (final Instance datum : data) { + dataSeries.add(datum.value(1), datum.value(0)); + } + + final XYChart chart = new XYChart(name, dataSeries, new XYSeries("REGRESSION")); + chart.pack(); + chart.setDefaultCloseOperation(WindowConstants.EXIT_ON_CLOSE); + chart.setVisible(true); + } + + public static void main(String[] args) { + parseArgsAndRun(PlotData.class, args); + } + +} diff --git a/src/main/java/ai/preferred/regression/PlotLinearRegression.java b/src/main/java/ai/preferred/regression/PlotLinearRegression.java new file mode 100644 index 0000000..a121352 --- /dev/null +++ b/src/main/java/ai/preferred/regression/PlotLinearRegression.java @@ -0,0 +1,67 @@ +package ai.preferred.regression; + +import ai.preferred.regression.io.ARFFDataReader; +import ai.preferred.regression.plot.XYChart; +import org.jfree.data.xy.XYSeries; +import org.kohsuke.args4j.Option; +import weka.classifiers.Classifier; +import weka.classifiers.functions.LinearRegression; +import weka.core.Instance; +import weka.core.Instances; +import weka.core.SerializationHelper; + +import javax.swing.*; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; + +public class PlotLinearRegression extends Command { + + @Option(name = "-i", aliases = {"--input"}, usage = "the path to the input CSV file", required = true) + private File input; + + @Option(name = "-m", aliases = {"--model"}, usage = "the path to the model file", required = true) + private File model; + + @Option(name = "-n", aliases = {"--name"}, usage = "the name of the plot") + private String name = "Y = alpha * X + beta"; + + @Option(name = "-h", aliases = {"--header"}, usage = "specifies if the input CSV files have headers") + private boolean header = true; + + @Override + protected void exec() throws Exception { + try (final FileInputStream stream = new FileInputStream(model)) { + final Classifier classifier = (Classifier) SerializationHelper.read(stream); + if 
(!(classifier instanceof LinearRegression)) { + throw new IOException("The model is neither LogisticRegression nor LinearRegression!"); + } + final double[] w = ((LinearRegression) classifier).coefficients(); + + if (w.length != 3) { + throw new IOException("We can plot only linear functions!"); + } + + final ARFFDataReader reader = new ARFFDataReader(input, false, header); + final Instances data = reader.read(input); + + final XYSeries dataSeries = new XYSeries("DATA"); + for (final Instance datum : data) { + dataSeries.add(datum.value(1), datum.value(0)); + } + final XYSeries regressionSeries = new XYSeries("REGRESSION"); + regressionSeries.add(dataSeries.getMinX(), w[1] * dataSeries.getMinX() + w[2]); + regressionSeries.add(dataSeries.getMaxX(), w[1] * dataSeries.getMaxX() + w[2]); + + final XYChart chart = new XYChart(name, dataSeries, regressionSeries); + chart.pack(); + chart.setDefaultCloseOperation(WindowConstants.EXIT_ON_CLOSE); + chart.setVisible(true); + } + } + + public static void main(String[] args) { + parseArgsAndRun(PlotLinearRegression.class, args); + } + +} diff --git a/src/main/java/ai/preferred/regression/PrintRegression.java b/src/main/java/ai/preferred/regression/PrintRegression.java new file mode 100644 index 0000000..1e417cc --- /dev/null +++ b/src/main/java/ai/preferred/regression/PrintRegression.java @@ -0,0 +1,110 @@ +package ai.preferred.regression; + +import ai.preferred.regression.io.ARFFDataReader; +import org.kohsuke.args4j.Option; +import weka.classifiers.Classifier; +import weka.classifiers.functions.LinearRegression; +import weka.classifiers.functions.Logistic; +import weka.core.Attribute; +import weka.core.Instances; +import weka.core.SerializationHelper; +import weka.filters.Filter; +import weka.filters.unsupervised.attribute.RemoveUseless; +import weka.filters.unsupervised.attribute.ReplaceMissingValues; + +import java.io.File; +import java.io.FileInputStream; +import java.util.ArrayList; +import java.util.BitSet; + +public 
class PrintRegression extends Command { + + @Option(name = "-i", aliases = {"--input"}, usage = "the path to the input CSV file", required = true) + private File input; + + @Option(name = "-m", aliases = {"--model"}, usage = "the path to the model file", required = true) + private File model; + + @Option(name = "-h", aliases = {"--header"}, usage = "specifies if the input CSV files have headers") + private boolean header = true; + + private void printSignature(ArrayList signature) { + for (int i = 1; i < signature.size(); i++) { + System.out.print(signature.get(i).name() + "\t"); + } + System.out.println("Bias"); + } + + @Override + protected void exec() throws Exception { + System.out.println(); + try (final FileInputStream stream = new FileInputStream(model)) { + final Classifier classifier = (Classifier) SerializationHelper.read(stream); + + if (classifier instanceof LinearRegression) { + final ARFFDataReader reader = new ARFFDataReader(input, false, header); + final Instances instances = preprocess(reader.read(input)); + + final BitSet ignore = new BitSet(instances.numAttributes()); + for (int i = 0; i < instances.numAttributes(); i++) { + if (i != instances.classIndex()) { + if (Math.sqrt(instances.variance(i)) == 0) { + ignore.set(i); + } + } + } + + final double[] w = ((LinearRegression) classifier).coefficients(); + System.out.printf("%-20s W", "FEATURE"); + System.out.println(); + for (int i = 1; i < instances.numAttributes(); i++) { + if (ignore.get(i)) { + continue; + } + System.out.printf("%-20s %.6f", instances.attribute(i).name(), w[i]); + System.out.println(); + } + System.out.printf("%-20s %.6f", "Bias", w[instances.numAttributes()]); + System.out.println(); + } else if (classifier instanceof Logistic) { + final ARFFDataReader reader = new ARFFDataReader(input, true, header); + final Instances instances = preprocess(reader.read(input)); + + final double[][] w = ((Logistic) classifier).coefficients(); + for (int i = 0; i < 
instances.classAttribute().numValues(); i++) { + System.out.printf("%s %s", "CLASS[" + i + "] =", instances.classAttribute().value(i)); + System.out.println(); + } + System.out.println(); + + System.out.printf("%-20s W", "FEATURE"); + System.out.println(); + for (int i = 1; i < instances.numAttributes(); i++) { + System.out.printf("%-20s %.6f", instances.attribute(i).name(), w[i][0]); + System.out.println(); + } + System.out.printf("%-20s %.6f", "Bias", w[0][0]); + System.out.println(); + } else { + throw new RuntimeException("We can process only regression models!"); + } + } + } + + private Instances preprocess(Instances instances) throws Exception { + final ReplaceMissingValues replaceMissingValues = new ReplaceMissingValues(); + replaceMissingValues.setInputFormat(instances); + instances = Filter.useFilter(instances, replaceMissingValues); + + final RemoveUseless removeUseless = new RemoveUseless(); + removeUseless.setInputFormat(instances); + instances = Filter.useFilter(instances, removeUseless); + + return instances; + } + + public static void main(String[] args) { + parseArgsAndRun(PrintRegression.class, args); + } + +} diff --git a/src/main/java/ai/preferred/regression/Shell.java b/src/main/java/ai/preferred/regression/Shell.java new file mode 100644 index 0000000..436d4cc --- /dev/null +++ b/src/main/java/ai/preferred/regression/Shell.java @@ -0,0 +1,146 @@ +package ai.preferred.regression; + +import ai.preferred.regression.reset.DataFiles; +import com.google.common.io.Files; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.*; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.nio.charset.StandardCharsets; +import java.util.Objects; + +public class Shell { + + private static final Logger LOGGER = LoggerFactory.getLogger(Shell.class); + + public static void reset() { + final File tempDir = new File("temp"); + mkdir(tempDir); + for (final File file : 
Objects.requireNonNull(tempDir.listFiles())) { + if (!file.getName().startsWith(".") && !file.delete()) { + LOGGER.error("Unable to delete: {}", file); + } + } + mkdir(tempDir); + + File dataDir = new File("data"); + mkdir(dataDir); + write(new File(dataDir, "icecream.csv"), DataFiles.ICECREAM_CSV); + write(new File(dataDir, "icecream_raw.csv"), DataFiles.ICECREAM_RAW_CSV); + write(new File(dataDir, "amazon.csv"), DataFiles.AMAZON_CSV); + write(new File(dataDir, "camera.csv"), DataFiles.CAMERA_CSV); + write(new File(dataDir, "amazon_extended.csv"), DataFiles.AMAZON_EXTENDED); + } + + private static void write(File file, String content) { + try (final PrintWriter writer = new PrintWriter(file, "UTF-8")) { + writer.write(content); + } catch (FileNotFoundException | UnsupportedEncodingException e) { + LOGGER.error("Unable to reset file {}: {}", file, e); + } + } + + private static void mkdir(File tempDir) { + if (!tempDir.exists() && !tempDir.mkdirs()) { + LOGGER.error("Unable to mkdir: {}"); + } + } + + public static void help(Class clazz) { + try { + final Method method = clazz.getMethod("main", String[].class); + method.invoke(null, (Object) null); + } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException e) { + LOGGER.error("Unable to execute {}: {}", clazz, e); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + public static void copyFile(String src, String dst) { + try { + Files.copy(new File(src), new File(dst)); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public static void run(Class clazz, String... 
args) { + try { + final Method method = clazz.getMethod("main", String[].class); + method.invoke(null, (Object) args); + } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException e) { + LOGGER.error("Unable to execute {}: {}", clazz, e); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + public static void run(Class clazz, String args) { + run(clazz, args.trim().split("\\s+")); + } + + private static Class pe(String name) { + final String className = "ai.preferred.regression.pe." + name; + try { + return Class.forName(className); + } catch (ClassNotFoundException e) { + LOGGER.info("Could not find PE: {}", className); + return null; + } + } + + private static Class command(String name) { + final String className = "ai.preferred.regression." + name; + try { + return Class.forName(className); + } catch (ClassNotFoundException e) { + LOGGER.info("Could not find command: {}", className); + return null; + } + } + + public static void exec(String filename) { + try (final BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(filename), StandardCharsets.UTF_8))) { + String line; + while (null != (line = reader.readLine())) { + line = line.trim(); + if (line.isEmpty()) { + continue; + } + final String[] command = line.split("\\s+", 2); + final String name = command[0]; + final String args = command[1]; + + Class clazz = pe(name); + if (clazz == null) { + clazz = command(name); + } + + if (clazz == null) { + LOGGER.error("Unable to execute command: {}", name); + return; + } + + run(clazz, args); + } + } catch (FileNotFoundException e) { + LOGGER.error("Unable to find input file: {}", filename); + } catch (IOException e) { + LOGGER.error("Execution error: ", e); + } + } + + public static void main(String[] args) { + for (final String arg : args) { + exec(arg); + } + } + + private Shell() { + throw new AssertionError(); + } + +} diff --git 
a/src/main/java/ai/preferred/regression/TrainLinearRegression.java b/src/main/java/ai/preferred/regression/TrainLinearRegression.java new file mode 100644 index 0000000..bfd5bf1 --- /dev/null +++ b/src/main/java/ai/preferred/regression/TrainLinearRegression.java @@ -0,0 +1,54 @@ +package ai.preferred.regression; + +import ai.preferred.regression.io.ARFFDataReader; +import org.kohsuke.args4j.Option; +import weka.classifiers.evaluation.Evaluation; +import weka.core.Instances; +import weka.core.SerializationHelper; + +import java.io.File; +import java.io.FileOutputStream; + +public class TrainLinearRegression extends Command { + + @Option(name = "-i", aliases = {"--train"}, usage = "the path to the training data in CSV format", required = true) + private File input; + + @Option(name = "-m", aliases = {"--model"}, usage = "the output path to the model file", required = true) + private File model; + + @Option(name = "-h", aliases = {"--header"}, usage = "specifies if the input CSV files have headers") + private boolean header = true; + + @Option(name = "-r", aliases = {"--ridge"}, usage = "the ridge parameter") + private double ridge = 1.0; + + @Option(name = "-v", aliases = {"--verbose"}, usage = "verbosity level (-1 - disable, 0 - short, 1 - default)") + private int verbose = 1; + + @Override + protected void exec() throws Exception { + final ARFFDataReader reader = new ARFFDataReader(input, false, header); + final Instances data = reader.read(input); + final weka.classifiers.functions.LinearRegression classifier = new weka.classifiers.functions.LinearRegression(); + classifier.setRidge(ridge); + classifier.buildClassifier(data); + + final Evaluation eval = new Evaluation(data); + eval.evaluateModel(classifier, data); + if (verbose <= -1) { + // output disabled + } else if (verbose == 0) { + System.out.println(eval.rootMeanSquaredError()); + } else { + System.out.println("RMSE[TRAINING] = " + eval.rootMeanSquaredError()); + } + + SerializationHelper.write(new 
FileOutputStream(model), classifier); + } + + public static void main(String[] args) { + parseArgsAndRun(TrainLinearRegression.class, args); + } + +} diff --git a/src/main/java/ai/preferred/regression/TrainLogisticRegression.java b/src/main/java/ai/preferred/regression/TrainLogisticRegression.java new file mode 100644 index 0000000..fb8f4d3 --- /dev/null +++ b/src/main/java/ai/preferred/regression/TrainLogisticRegression.java @@ -0,0 +1,62 @@ +package ai.preferred.regression; + +import ai.preferred.regression.io.ARFFDataReader; +import org.kohsuke.args4j.Option; +import weka.classifiers.evaluation.Evaluation; +import weka.core.Instances; +import weka.core.SerializationHelper; + +import java.io.File; +import java.io.FileOutputStream; + +public class TrainLogisticRegression extends Command { + + @Option(name = "-i", aliases = {"--train"}, usage = "the path to the training data in CSV format", required = true) + private File input; + + @Option(name = "-m", aliases = {"--model"}, usage = "the output path to the model file", required = true) + private File model; + + @Option(name = "-h", aliases = {"--header"}, usage = "specifies if the input CSV files have headers") + private boolean header = true; + + @Option(name = "-r", aliases = {"--ridge"}, usage = "the ridge parameter") + private double ridge = 1.0; + + @Option(name = "-v", aliases = {"--verbose"}, usage = "verbosity level (-1 - disable, 0 - short, 1 - default, 2 - detailed)") + private int verbose = 1; + + @Override + protected void exec() throws Exception { + final ARFFDataReader reader = new ARFFDataReader(input, true, header); + final Instances data = reader.read(input); + final weka.classifiers.functions.Logistic classifier = new weka.classifiers.functions.Logistic(); + classifier.setRidge(ridge); + classifier.buildClassifier(data); + + final Evaluation eval = new Evaluation(data); + eval.evaluateModel(classifier, data); + if (verbose <= -1) { + // output disabled + } else if (verbose == 0) { + 
System.out.println(eval.pctCorrect()); + } else if (verbose == 1) { + System.out.println("ACCURACY[TRAINING] = " + eval.pctCorrect()); + } else { + System.out.println(); + System.out.println("CLASS\tPRECISION\tRECALL\tF-MEASURE"); + for (int i = 0; i < data.classAttribute().numValues(); i++) { + System.out.printf("%s\t%f\t%f\t%f", data.classAttribute().value(i), eval.precision(i), eval.recall(i), eval.fMeasure(i)); + System.out.println(); + } + } + + SerializationHelper.write(new FileOutputStream(model), classifier); + } + + public static void main(String[] args) { + parseArgsAndRun(TrainLogisticRegression.class, args); + } + + +} diff --git a/src/main/java/ai/preferred/regression/WekaUtils.java b/src/main/java/ai/preferred/regression/WekaUtils.java new file mode 100644 index 0000000..8f891d5 --- /dev/null +++ b/src/main/java/ai/preferred/regression/WekaUtils.java @@ -0,0 +1,25 @@ +package ai.preferred.regression; + +import weka.classifiers.Classifier; +import weka.classifiers.functions.LinearRegression; +import weka.classifiers.functions.Logistic; + +public class WekaUtils { + + static boolean isLogisticClassifier(Classifier classifier) { + boolean nominal; + if (classifier instanceof Logistic) { + nominal = true; + } else if (classifier instanceof LinearRegression) { + nominal = false; + } else { + throw new IllegalStateException("The model is neither LogisticRegression nor LinearRegression!"); + } + return nominal; + } + + private WekaUtils() { + throw new AssertionError(); + } + +} diff --git a/src/main/java/ai/preferred/regression/exercise/E00_IceCream.java b/src/main/java/ai/preferred/regression/exercise/E00_IceCream.java new file mode 100644 index 0000000..ff12e33 --- /dev/null +++ b/src/main/java/ai/preferred/regression/exercise/E00_IceCream.java @@ -0,0 +1,23 @@ +package ai.preferred.regression.exercise; + +import ai.preferred.regression.PlotData; +import ai.preferred.regression.Shell; + +public class E00_IceCream { + + /** + * DATA: data/icecream.csv + *

+ * TODO: + * Run this class to plot the input data, take a look at it! + * You can open the data file in Excel or Google Spreadsheet. + *

+ * CHECK: Is it possible to approximate this data with a linear function? + */ + public static void main(String[] args) { + Shell.reset(); + + Shell.run(PlotData.class, "-i data/icecream.csv -n IceCream"); + } + +} diff --git a/src/main/java/ai/preferred/regression/exercise/E01_MyFirstRegressionWithIceCream.java b/src/main/java/ai/preferred/regression/exercise/E01_MyFirstRegressionWithIceCream.java new file mode 100644 index 0000000..ced11b0 --- /dev/null +++ b/src/main/java/ai/preferred/regression/exercise/E01_MyFirstRegressionWithIceCream.java @@ -0,0 +1,24 @@ +package ai.preferred.regression.exercise; + +import ai.preferred.regression.PlotLinearRegression; +import ai.preferred.regression.Shell; +import ai.preferred.regression.TrainLinearRegression; + +public class E01_MyFirstRegressionWithIceCream { + + /** + * DATA: data/icecream.csv + *

+ * TODO: + * Train a linear regression on 'icecream.csv'. Plot the regression line. + *

+ * CHECK: What is the value of RMSE[TRAINING] for this dataset? + */ + public static void main(String[] args) { + Shell.reset(); + + Shell.run(TrainLinearRegression.class, "-i data/icecream.csv -m temp/icecream.model"); + Shell.run(PlotLinearRegression.class, "-i data/icecream.csv -m temp/icecream.model"); + } + +} diff --git a/src/main/java/ai/preferred/regression/exercise/E02_ReadingRegression.java b/src/main/java/ai/preferred/regression/exercise/E02_ReadingRegression.java new file mode 100644 index 0000000..57ebe33 --- /dev/null +++ b/src/main/java/ai/preferred/regression/exercise/E02_ReadingRegression.java @@ -0,0 +1,24 @@ +package ai.preferred.regression.exercise; + +import ai.preferred.regression.PrintRegression; +import ai.preferred.regression.Shell; +import ai.preferred.regression.TrainLinearRegression; + +public class E02_ReadingRegression { + + /** + * DATA: data/icecream.csv + *

+ * TODO: + * You can print the regression weights, to understand it a bit better! + *

+ * CHECK: What is the value of the regression when Temperature is 0? + */ + public static void main(String[] args) { + Shell.reset(); + + Shell.run(TrainLinearRegression.class, "-i data/icecream.csv -m temp/icecream.model"); + Shell.run(PrintRegression.class, "-i data/icecream.csv -m temp/icecream.model"); + } + +} diff --git a/src/main/java/ai/preferred/regression/exercise/E03_RawIceCream.java b/src/main/java/ai/preferred/regression/exercise/E03_RawIceCream.java new file mode 100644 index 0000000..2c73422 --- /dev/null +++ b/src/main/java/ai/preferred/regression/exercise/E03_RawIceCream.java @@ -0,0 +1,25 @@ +package ai.preferred.regression.exercise; + +import ai.preferred.regression.Shell; +import ai.preferred.regression.pe.RemoveColumn; +import ai.preferred.regression.pe.SwapColumns; + +public class E03_RawIceCream { + + /** + * DATA: data/icecream_raw.csv ; data/icecream.csv + *

+ * TODO: + * Often, data come in a format which is not suitable for analysis or for building a regression. + * Convert 'icecream_raw.csv' to make it look like 'icecream.csv'. + *

+ * CHECK: Should you use RemoveColumn or SwapColumns as the first step? + */ + public static void main(String[] args) { + Shell.reset(); + + Shell.help(RemoveColumn.class); + Shell.help(SwapColumns.class); + } + +} diff --git a/src/main/java/ai/preferred/regression/exercise/E04_RegressionForTemperature.java b/src/main/java/ai/preferred/regression/exercise/E04_RegressionForTemperature.java new file mode 100644 index 0000000..d4477f4 --- /dev/null +++ b/src/main/java/ai/preferred/regression/exercise/E04_RegressionForTemperature.java @@ -0,0 +1,20 @@ +package ai.preferred.regression.exercise; + +import ai.preferred.regression.Shell; + +public class E04_RegressionForTemperature { + + /** + * DATA: data/icecream_raw.csv + *

+ * TODO: + * Train and plot a regression predicting temperature based on consumption. + * Plot it! + *

+ * CHECK: What is the difference between consumption-regression and temperature-regression? + */ + public static void main(String[] args) { + Shell.reset(); + } + +} diff --git a/src/main/java/ai/preferred/regression/exercise/E05_TryX2Only.java b/src/main/java/ai/preferred/regression/exercise/E05_TryX2Only.java new file mode 100644 index 0000000..144553f --- /dev/null +++ b/src/main/java/ai/preferred/regression/exercise/E05_TryX2Only.java @@ -0,0 +1,23 @@ +package ai.preferred.regression.exercise; + +import ai.preferred.regression.Shell; +import ai.preferred.regression.pe.AddX2; + +public class E05_TryX2Only { + + /** + * DATA: data/icecream.csv + *

+ * TODO: + * Add column Temperature^2 and train linear regression to predict consumption based only on Temperature^2 feature. + * Plot the trained regression! + *

+ * CHECK: Is RMSE[TRAINING] different from 'E01_MyFirstRegressionWithIceCream'? + */ + public static void main(String[] args) { + Shell.reset(); + + Shell.help(AddX2.class); + } + +} diff --git a/src/main/java/ai/preferred/regression/exercise/E06_TryX1AndX2AndX3.java b/src/main/java/ai/preferred/regression/exercise/E06_TryX1AndX2AndX3.java new file mode 100644 index 0000000..b200f69 --- /dev/null +++ b/src/main/java/ai/preferred/regression/exercise/E06_TryX1AndX2AndX3.java @@ -0,0 +1,22 @@ +package ai.preferred.regression.exercise; + +import ai.preferred.regression.Shell; +import ai.preferred.regression.pe.AddX3; + +public class E06_TryX1AndX2AndX3 { + + /** + * DATA: data/icecream.csv + *

+ * TODO: + * Add columns Temperature^2 and Temperature^3 and train linear regression using all the parameters! + *

+ * CHECK: Check RMSE[TRAINING] again, is it any different? + */ + public static void main(String[] args) { + Shell.reset(); + // TODO: implement AddX3.class, hint: take a look at the AddX2 class + Shell.help(AddX3.class); + } + +} diff --git a/src/main/java/ai/preferred/regression/exercise/E07_AmazonText.java b/src/main/java/ai/preferred/regression/exercise/E07_AmazonText.java new file mode 100644 index 0000000..23a8921 --- /dev/null +++ b/src/main/java/ai/preferred/regression/exercise/E07_AmazonText.java @@ -0,0 +1,23 @@ +package ai.preferred.regression.exercise; + +import ai.preferred.regression.Shell; +import ai.preferred.regression.pe.EncodeTextAsFrequency; + +public class E07_AmazonText { + + /** + * DATA: data/amazon.csv + *

+ * TODO: + * Take a look at 'amazon.csv', one of the columns contains text. + * Convert it into word frequencies using EncodeTextAsFrequency.class. + *

+ * CHECK: How many columns does the new dataset have after conversion? + */ + public static void main(String[] args) { + Shell.reset(); + + Shell.help(EncodeTextAsFrequency.class); + } + +} diff --git a/src/main/java/ai/preferred/regression/exercise/E08_AmazonCheap.java b/src/main/java/ai/preferred/regression/exercise/E08_AmazonCheap.java new file mode 100644 index 0000000..c50d09f --- /dev/null +++ b/src/main/java/ai/preferred/regression/exercise/E08_AmazonCheap.java @@ -0,0 +1,22 @@ +package ai.preferred.regression.exercise; + +import ai.preferred.regression.Shell; +import ai.preferred.regression.pe.ProjectColumns; + +public class E08_AmazonCheap { + + /** + * DATA: data/amazon.csv + *

+ * TODO: + * Train a linear regression for rating prediction based on word "cheap" only! Plot it! + *

+ * CHECK: What are the regression parameters? Is word "cheap" a good predictor? + */ + public static void main(String[] args) { + Shell.reset(); + + Shell.help(ProjectColumns.class); + } + +} diff --git a/src/main/java/ai/preferred/regression/exercise/E09_AmazonExpensive.java b/src/main/java/ai/preferred/regression/exercise/E09_AmazonExpensive.java new file mode 100644 index 0000000..c416c0a --- /dev/null +++ b/src/main/java/ai/preferred/regression/exercise/E09_AmazonExpensive.java @@ -0,0 +1,19 @@ +package ai.preferred.regression.exercise; + +import ai.preferred.regression.Shell; + +public class E09_AmazonExpensive { + + /** + * DATA: data/amazon.csv + *

+ * TODO: + * Train linear regression for rating prediction based on word "expensive" only. Plot it! + *

+ * CHECK: What are the regression parameters? Is word "expensive" a good predictor? + */ + public static void main(String[] args) { + Shell.reset(); + } + +} diff --git a/src/main/java/ai/preferred/regression/exercise/E10_AmazonYourOwnWord.java b/src/main/java/ai/preferred/regression/exercise/E10_AmazonYourOwnWord.java new file mode 100644 index 0000000..21dbb27 --- /dev/null +++ b/src/main/java/ai/preferred/regression/exercise/E10_AmazonYourOwnWord.java @@ -0,0 +1,20 @@ +package ai.preferred.regression.exercise; + +import ai.preferred.regression.Shell; + +public class E10_AmazonYourOwnWord { + + /** + * DATA: data/amazon.csv + *

+ * TODO: + * Train a linear regression for rating prediction based on your own word. Plot it! + * We will discuss it! + *

+ * CHECK: What are the regression parameters? Is your word a good predictor? + */ + public static void main(String[] args) { + Shell.reset(); + } + +} diff --git a/src/main/java/ai/preferred/regression/exercise/E11_CameraCategories.java b/src/main/java/ai/preferred/regression/exercise/E11_CameraCategories.java new file mode 100644 index 0000000..e857ba6 --- /dev/null +++ b/src/main/java/ai/preferred/regression/exercise/E11_CameraCategories.java @@ -0,0 +1,23 @@ +package ai.preferred.regression.exercise; + +import ai.preferred.regression.Shell; +import ai.preferred.regression.pe.EncodeValueAsOneHot; + +/** + * DATA: data/camera.csv + *

+ * TODO: + * Take a look at 'camera.csv'. It has a lot of categorical data, which is to be + * processed and represented as 0-1 values. + *

+ * CHECK: How many columns does the dataset have after processing? + */ +public class E11_CameraCategories { + + public static void main(String[] args) { + Shell.reset(); + + Shell.help(EncodeValueAsOneHot.class); + } + +} diff --git a/src/main/java/ai/preferred/regression/exercise/E12_CameraWithAutoFocus.java b/src/main/java/ai/preferred/regression/exercise/E12_CameraWithAutoFocus.java new file mode 100644 index 0000000..a512889 --- /dev/null +++ b/src/main/java/ai/preferred/regression/exercise/E12_CameraWithAutoFocus.java @@ -0,0 +1,23 @@ +package ai.preferred.regression.exercise; + +import ai.preferred.regression.Shell; +import ai.preferred.regression.pe.SelectEquals; + +public class E12_CameraWithAutoFocus { + + /** + * DATA: data/camera.csv + *

+ * TODO: + * We are interested in the subset of 'camera.csv', the cameras with auto focus. + * Select this subset and train a linear regression to predict price based on camera type. + *

+ * CHECK: How many rows does the dataset have after processing? + */ + public static void main(String[] args) { + Shell.reset(); + + Shell.help(SelectEquals.class); + } + +} diff --git a/src/main/java/ai/preferred/regression/exercise/E13_ShuffleAndPartitionIceCream.java b/src/main/java/ai/preferred/regression/exercise/E13_ShuffleAndPartitionIceCream.java new file mode 100644 index 0000000..59f8e1d --- /dev/null +++ b/src/main/java/ai/preferred/regression/exercise/E13_ShuffleAndPartitionIceCream.java @@ -0,0 +1,26 @@ +package ai.preferred.regression.exercise; + +import ai.preferred.regression.Shell; +import ai.preferred.regression.pe.Partition; +import ai.preferred.regression.pe.Shuffle; + +public class E13_ShuffleAndPartitionIceCream { + + /** + * DATA: data/icecream.csv + *

+ * TODO: + * We are back to 'icecream.csv'. + * Shuffle and partition the data in proportion 80/20, 80% is for training data and 20% is for testing data. + * Plot the data splits. + *

+ * CHECK: How many rows are there in the training and testing datasets? + */ + public static void main(String[] args) { + Shell.reset(); + + Shell.help(Shuffle.class); + Shell.help(Partition.class); + } + +} diff --git a/src/main/java/ai/preferred/regression/exercise/E14_TrainTest.java b/src/main/java/ai/preferred/regression/exercise/E14_TrainTest.java new file mode 100644 index 0000000..dfe9dda --- /dev/null +++ b/src/main/java/ai/preferred/regression/exercise/E14_TrainTest.java @@ -0,0 +1,23 @@ +package ai.preferred.regression.exercise; + +import ai.preferred.regression.EvaluateRegression; +import ai.preferred.regression.Shell; + +public class E14_TrainTest { + + /** + * DATA: data/icecream.csv + *

+ * TODO: + * Shuffle and partition the data in proportion 60/40, 60% is for the training data and 40% is for the testing data. + * Train a regression on the training data and evaluate it on the testing data. + *

+ * CHECK: Is RMSE (on testing) > RMSE[TRAINING] or RMSE (on testing) < RMSE[TRAINING]? + */ + public static void main(String[] args) { + Shell.reset(); + + Shell.help(EvaluateRegression.class); + } + +} diff --git a/src/main/java/ai/preferred/regression/exercise/E15_TestX123.java b/src/main/java/ai/preferred/regression/exercise/E15_TestX123.java new file mode 100644 index 0000000..d883f4a --- /dev/null +++ b/src/main/java/ai/preferred/regression/exercise/E15_TestX123.java @@ -0,0 +1,21 @@ +package ai.preferred.regression.exercise; + +import ai.preferred.regression.Shell; + +public class E15_TestX123 { + + /** + * DATA: data/icecream.csv + *

+ * TODO: + * Let's continue with our previous split. + * Add more features to the dataset: Temperature^2 and Temperature^3. + * Train a regression model on the training data and evaluate it on the testing data. + *

+ * CHECK: Is RMSE (with more features) > RMSE (with only one feature)? + */ + public static void main(String[] args) { + Shell.reset(); + } + +} diff --git a/src/main/java/ai/preferred/regression/exercise/E16_AmazonTrainTest.java b/src/main/java/ai/preferred/regression/exercise/E16_AmazonTrainTest.java new file mode 100644 index 0000000..3173e7c --- /dev/null +++ b/src/main/java/ai/preferred/regression/exercise/E16_AmazonTrainTest.java @@ -0,0 +1,21 @@ +package ai.preferred.regression.exercise; + +import ai.preferred.regression.Shell; + +public class E16_AmazonTrainTest { + + /** + * DATA: data/amazon.csv + *

+ * TODO: + * Let's go to the amazon data: 'amazon.csv'. + * Split the data in proportion 80/20. + * Build a regression on the training split and evaluate it on the testing. + *

+ * CHECK: Is RMSE (on testing) > RMSE[TRAINING] or RMSE (on testing) < RMSE[TRAINING]? + */ + public static void main(String[] args) { + Shell.reset(); + } + +} diff --git a/src/main/java/ai/preferred/regression/exercise/E17_AmazonLogistic.java b/src/main/java/ai/preferred/regression/exercise/E17_AmazonLogistic.java new file mode 100644 index 0000000..d6717a0 --- /dev/null +++ b/src/main/java/ai/preferred/regression/exercise/E17_AmazonLogistic.java @@ -0,0 +1,22 @@ +package ai.preferred.regression.exercise; + +import ai.preferred.regression.Shell; +import ai.preferred.regression.TrainLogisticRegression; + +public class E17_AmazonLogistic { + + /** + * DATA: data/amazon.csv + *

+ * TODO: + * Build and evaluate a logistic regression model. Data split: 80/20. + *

+ * CHECK: Is ACCURACY (on testing) > ACCURACY[TRAINING] or ACCURACY (on testing) < ACCURACY[TRAINING]? + */ + public static void main(String[] args) { + Shell.reset(); + + Shell.help(TrainLogisticRegression.class); + } + +} diff --git a/src/main/java/ai/preferred/regression/exercise/E18_LogisticRidgeRegression.java b/src/main/java/ai/preferred/regression/exercise/E18_LogisticRidgeRegression.java new file mode 100644 index 0000000..9cf5e30 --- /dev/null +++ b/src/main/java/ai/preferred/regression/exercise/E18_LogisticRidgeRegression.java @@ -0,0 +1,23 @@ +package ai.preferred.regression.exercise; + +import ai.preferred.regression.Shell; +import ai.preferred.regression.TrainLogisticRegression; + + +public class E18_LogisticRidgeRegression { + + /** + * DATA: data/amazon.csv + *

+ * TODO: + * Build and evaluate a logistic regression model with ridge = {0.1, 1.0, 10.0}. Data split: 80/20. + *

+ * CHECK: Which ridge parameter gives the best ACCURACY (on testing)? + */ + public static void main(String[] args) { + Shell.reset(); + + Shell.help(TrainLogisticRegression.class); + } + +} diff --git a/src/main/java/ai/preferred/regression/exercise/E19_LinearRidgeRegression.java b/src/main/java/ai/preferred/regression/exercise/E19_LinearRidgeRegression.java new file mode 100644 index 0000000..5051965 --- /dev/null +++ b/src/main/java/ai/preferred/regression/exercise/E19_LinearRidgeRegression.java @@ -0,0 +1,22 @@ +package ai.preferred.regression.exercise; + +import ai.preferred.regression.Shell; +import ai.preferred.regression.TrainLinearRegression; + +public class E19_LinearRidgeRegression { + + /** + * DATA: data/amazon.csv + *

+ * TODO: + * Build and evaluate a linear regression model with ridge = {0.1, 1.0, 10.0}. Data split: 80/20. + *

+ * CHECK: Which ridge parameter gives the best RMSE (on testing)? + */ + public static void main(String[] args) { + Shell.reset(); + + Shell.help(TrainLinearRegression.class); + } + +} diff --git a/src/main/java/ai/preferred/regression/exercise/E20_GrandFinale.java b/src/main/java/ai/preferred/regression/exercise/E20_GrandFinale.java new file mode 100644 index 0000000..97e85fc --- /dev/null +++ b/src/main/java/ai/preferred/regression/exercise/E20_GrandFinale.java @@ -0,0 +1,19 @@ +package ai.preferred.regression.exercise; + +import ai.preferred.regression.Shell; + +public class E20_GrandFinale { + + /** + * DATA: data/amazon_extended.csv + *

+ * TODO: + * Build and evaluate a regression model for rating prediction! Data split: 80/20. + *

+ * CHECK: What is the best ACCURACY (on testing) you can get? + */ + public static void main(String[] args) { + Shell.reset(); + } + +} diff --git a/src/main/java/ai/preferred/regression/io/ARFFDataReader.java b/src/main/java/ai/preferred/regression/io/ARFFDataReader.java new file mode 100644 index 0000000..b00b98f --- /dev/null +++ b/src/main/java/ai/preferred/regression/io/ARFFDataReader.java @@ -0,0 +1,113 @@ +package ai.preferred.regression.io; + +import weka.core.*; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Set; +import java.util.TreeSet; + +public class ARFFDataReader { + + private static double parseDouble(ArrayList record, int row, int col) throws IOException { + try { + return Double.parseDouble(record.get(col)); + } catch (NumberFormatException | NullPointerException e) { + throw new IOException("A number expected! (row = " + row + "; col = " + col + ")"); + } + } + + private final boolean nominal; + private final boolean parseHeader; + private final ArrayList signature; + + public ARFFDataReader(File signatureFile, boolean nominal, boolean parseHeader) throws IOException { + this.nominal = nominal; + this.parseHeader = parseHeader; + try (final CSVInputData data = new CSVInputData(signatureFile, parseHeader)) { + final ArrayList signature = new ArrayList<>(); + if (nominal) { + final Set attributeValueSet = new TreeSet<>(); + ArrayList firstRecord = null; + int row = parseHeader ? 
1 : 0; + for (final ArrayList record : data) { + if (firstRecord == null) { + firstRecord = record; + } + attributeValueSet.add(record.get(0)); + for (int col = 1; col < record.size(); col++) { + parseDouble(record, row, col); + } + row++; + } + if (firstRecord == null) { + throw new IOException("There is no records in the CSV file!"); + } + if (data.hasHeader()) { + final ArrayList header = data.getHeader(); + signature.add(new Attribute(header.get(0), new ArrayList<>(attributeValueSet))); + for (int i = 1; i < header.size(); i++) { + signature.add(new Attribute(header.get(i))); + } + } else { + signature.add(new Attribute("Y", new ArrayList<>(attributeValueSet))); + for (int i = 1; i < firstRecord.size(); i++) { + signature.add(new Attribute("X" + i)); + } + } + } else { + int row = parseHeader ? 1 : 0; + if (data.hasHeader()) { + final ArrayList header = data.getHeader(); + signature.add(new Attribute(header.get(0))); + for (int i = 1; i < header.size(); i++) { + signature.add(new Attribute(header.get(i))); + } + } + for (final ArrayList record : data) { + if (signature.isEmpty()) { + signature.add(new Attribute("Y")); + for (int i = 1; i < record.size(); i++) { + signature.add(new Attribute("X" + i)); + } + } + for (int col = 0; col < record.size(); col++) { + parseDouble(record, row, col); + } + row++; /* advance per record so a parse failure reports the row it actually occurred in */ + } + if (signature.isEmpty()) { + throw new IOException("There is no records in the CSV file!"); + } + } + this.signature = signature; + } + } + + public ArrayList getSignature() { + return new ArrayList<>(signature); + } + + public Instances read(File file) throws IOException { + final Instances instances = new Instances("DATA", signature, 100); + instances.setClassIndex(0); + try (final CSVInputData data = new CSVInputData(file, parseHeader)) { + int row = parseHeader ? 
1 : 0; + for (final ArrayList record : data) { + final Instance instance = new DenseInstance(instances.numAttributes()); + for (int i = 1; i < record.size(); i++) { + instance.setValue(i, parseDouble(record, row, i)); + } + if (nominal) { + instance.setValue(0, signature.get(0).indexOfValue(record.get(0))); + } else { + instance.setValue(0, parseDouble(record, row, 0)); + } + instances.add(new SparseInstance(instance)); + row++; + } + } + return instances; + } + +} diff --git a/src/main/java/ai/preferred/regression/io/CSVInputData.java b/src/main/java/ai/preferred/regression/io/CSVInputData.java new file mode 100644 index 0000000..9383515 --- /dev/null +++ b/src/main/java/ai/preferred/regression/io/CSVInputData.java @@ -0,0 +1,132 @@ +package ai.preferred.regression.io; + +import com.google.common.collect.Lists; +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVParser; +import org.apache.commons.csv.CSVRecord; + +import java.io.Closeable; +import java.io.File; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Iterator; + +public class CSVInputData implements Iterable>, AutoCloseable { + + private final ArrayList header; + private final boolean parseHeader; + private final File file; + + public CSVInputData(File file, boolean parseHeader) throws IOException { + this.file = file; + this.parseHeader = parseHeader; + if (parseHeader) { + header = parseHeader(); + } else { + header = null; + } + } + + private ArrayList parseHeader() throws IOException { + try (final CSVParser parser = newParser()) { /* closed on every exit path, including the missing-header IOException below */ + final Iterator iterator = parser.iterator(); + if (!iterator.hasNext()) { + throw new IOException("The header record is not found!"); + } + final CSVRecord headerRecord = iterator.next(); + return Lists.newArrayList(headerRecord); /* copied into a fresh list before the parser is closed */ + } + } + + public boolean hasHeader() { + return header != null; + } + + public ArrayList getHeader() { + if (header == null) { + throw new 
UnsupportedOperationException("This CSV file has no header!"); + } + return new ArrayList<>(header); + } + + public ArrayList> getRecords() throws IOException { + final CSVParser parser = newParser(); + final Iterator iterator = parser.iterator(); + final ArrayList> data = new ArrayList<>(); + skipHeaderIfExists(iterator); + while (iterator.hasNext()) { + data.add(Lists.newArrayList(iterator.next())); + } + parser.close(); + return data; + } + + private void skipHeaderIfExists(Iterator iterator) throws IOException { + if (parseHeader) { + if (!iterator.hasNext()) { + throw new IOException("The header record is not found!"); + } + iterator.next(); + } + } + + @Override + public Iterator> iterator() { + try { + return new Iter(); + } catch (IOException e) { + throw new IllegalStateException(e); + } + } + + private CSVParser newParser() throws IOException { + return CSVParser.parse(file, StandardCharsets.UTF_8, CSVFormat.EXCEL); + } + + @Override + public void close() { + // do nothing + } + + private class Iter implements Iterator>, Closeable { + + private final CSVParser parser; + private final Iterator innerIter; + + Iter() throws IOException { + parser = CSVInputData.this.newParser(); + innerIter = parser.iterator(); + skipHeaderIfExists(innerIter); + } + + @Override + public boolean hasNext() { + final boolean hasNext = innerIter.hasNext(); + if (!hasNext) { + try { + parser.close(); + } catch (IOException e) { + throw new IllegalStateException(e); + } + } + return hasNext; + } + + @Override + public ArrayList next() { + return Lists.newArrayList(innerIter.next()); + } + + @Override + public void close() throws IOException { + parser.close(); + } + + @Override + protected void finalize() throws Throwable { + close(); + } + + } +} diff --git a/src/main/java/ai/preferred/regression/io/CSVUtils.java b/src/main/java/ai/preferred/regression/io/CSVUtils.java new file mode 100644 index 0000000..60abd40 --- /dev/null +++ 
b/src/main/java/ai/preferred/regression/io/CSVUtils.java @@ -0,0 +1,35 @@ +package ai.preferred.regression.io; + +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVPrinter; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.nio.charset.StandardCharsets; + +public class CSVUtils { + + public static CSVInputData reader(File file, boolean header) throws IOException { + return new CSVInputData(file, header); + } + + public static CSVPrinter printer(File file) throws IOException { + return new CSVPrinter(new OutputStreamWriter(new FileOutputStream(file, false), StandardCharsets.UTF_8), CSVFormat.EXCEL); + } + + @SafeVarargs + public static String[] toStringArray(T... values) { + final String[] strings = new String[values.length]; + for (int i = 0; i < values.length; i++) { + strings[i] = String.valueOf(values[i]); + } + return strings; + } + + private CSVUtils() { + throw new AssertionError(); + } + +} diff --git a/src/main/java/ai/preferred/regression/pe/AddX2.java b/src/main/java/ai/preferred/regression/pe/AddX2.java new file mode 100644 index 0000000..18fd1bf --- /dev/null +++ b/src/main/java/ai/preferred/regression/pe/AddX2.java @@ -0,0 +1,40 @@ +package ai.preferred.regression.pe; + +import ai.preferred.regression.io.CSVInputData; +import org.apache.commons.csv.CSVPrinter; +import org.kohsuke.args4j.Option; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; + +public class AddX2 extends ProcessingElement { + + private static final Logger LOGGER = LoggerFactory.getLogger(AddX2.class); + + @Option(name = "-c", aliases = {"--column"}, usage = "the index of the column", required = true) + private int column; + + @Override + protected void process(CSVInputData data, CSVPrinter printer) throws IOException { + if (data.hasHeader()) { + final ArrayList header = data.getHeader(); + header.add("(" + 
header.get(column) + ")^2"); + printer.printRecord(header); + } + + for (final ArrayList record : data) { + final String value = record.get(column); + final double x = Double.parseDouble(value); + final double x2 = x * x; + record.add(String.valueOf(x2)); + printer.printRecord(record); + } + } + + public static void main(String[] args) { + parseArgsAndRun(AddX2.class, args); + } + +} diff --git a/src/main/java/ai/preferred/regression/pe/AddX3.java b/src/main/java/ai/preferred/regression/pe/AddX3.java new file mode 100644 index 0000000..7847392 --- /dev/null +++ b/src/main/java/ai/preferred/regression/pe/AddX3.java @@ -0,0 +1,41 @@ +package ai.preferred.regression.pe; + +import ai.preferred.regression.io.CSVInputData; +import org.apache.commons.csv.CSVPrinter; +import org.kohsuke.args4j.Option; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; + +public class AddX3 extends ProcessingElement { + + private static final Logger LOGGER = LoggerFactory.getLogger(AddX3.class); + + @Option(name = "-c", aliases = {"--column"}, usage = "the index of the column", required = true) + private int column; + + @Override + protected void process(CSVInputData data, CSVPrinter printer) throws IOException { + if (data.hasHeader()) { + ArrayList header = data.getHeader(); + // TODO: transform this header here! + // FOR EXAMPLE: + // header.add("NEW_COLUMN"); + printer.printRecord(header); + } + + for (final ArrayList record : data) { + // TODO: transform each record here! 
+ // FOR EXAMPLE: + // record.add("VALUE"); + printer.printRecord(record); + } + } + + public static void main(String[] args) { + parseArgsAndRun(AddX3.class, args); + } + +} diff --git a/src/main/java/ai/preferred/regression/pe/Dummy.java b/src/main/java/ai/preferred/regression/pe/Dummy.java new file mode 100644 index 0000000..755ae22 --- /dev/null +++ b/src/main/java/ai/preferred/regression/pe/Dummy.java @@ -0,0 +1,42 @@ +package ai.preferred.regression.pe; + +import ai.preferred.regression.io.CSVInputData; +import org.apache.commons.csv.CSVPrinter; +import org.kohsuke.args4j.Option; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; + +public class Dummy extends ProcessingElement { + + private static final Logger LOGGER = LoggerFactory.getLogger(Dummy.class); + + // TODO: add your options! + @Option(name = "-z", aliases = {"--option-z"}) + private boolean option = false; + + @Override + protected void process(CSVInputData data, CSVPrinter printer) throws IOException { + if (data.hasHeader()) { + ArrayList header = data.getHeader(); + // TODO: transform this header here! + // FOR EXAMPLE: + // header.add("NEW_COLUMN"); + printer.printRecord(header); + } + + for (ArrayList record : data) { + // TODO: transform each record here! 
+ // FOR EXAMPLE: + // record.add("VALUE"); + printer.printRecord(record); + } + } + + public static void main(String[] args) { + parseArgsAndRun(Dummy.class, args); + } + +} diff --git a/src/main/java/ai/preferred/regression/pe/EncodeTextAsFrequency.java b/src/main/java/ai/preferred/regression/pe/EncodeTextAsFrequency.java new file mode 100644 index 0000000..29975d4 --- /dev/null +++ b/src/main/java/ai/preferred/regression/pe/EncodeTextAsFrequency.java @@ -0,0 +1,121 @@ +package ai.preferred.regression.pe; + +import ai.preferred.regression.io.CSVInputData; +import ai.preferred.regression.io.CSVUtils; +import ai.preferred.regression.pe.data.Vocabulary; +import com.google.common.collect.HashMultiset; +import com.google.common.collect.ImmutableMultiset; +import com.google.common.collect.Multiset; +import org.apache.commons.csv.CSVPrinter; +import org.kohsuke.args4j.Option; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.regex.Pattern; + +public class EncodeTextAsFrequency extends ProcessingElement { + + private static final Logger LOGGER = LoggerFactory.getLogger(EncodeTextAsFrequency.class); + + @Option(name = "-c", aliases = {"--column"}, usage = "the index of the input column", required = true) + private int column; + + @Option(name = "-s", aliases = {"--separator"}, usage = "specifies regular expression for splitting text into words") + private String separator = "\\W+"; + + @Option(name = "-n", aliases = {"--number-of-words"}, usage = "the maximum number of words to keep") + private int numberOfWords = 1000; + + @Option(name = "-p", aliases = {"--prefix"}, usage = "the prefix of the new columns") + private String prefix = "WORD:"; + + private static Comparator> getDecreasingCountComparator() { + return (entry1, entry2) -> Integer.compare(entry2.getCount(), entry1.getCount()); + } + + 
private static String[] toLowerCase(String[] words) { + final String[] result = new String[words.length]; + for (int i = 0; i < words.length; i++) { + result[i] = words[i].toLowerCase(); + } + return result; + } + + private static String[] trimEmpty(String[] words) { + final ArrayList result = new ArrayList<>(); + for (final String word : words) { + if (!word.trim().isEmpty()) { + result.add(word); + } + } + return result.toArray(new String[0]); + } + + private static Multiset toBagOfWords(String text, String separator) { + final Pattern tokenizer = Pattern.compile(separator); + String[] words; + words = tokenizer.split(text); + words = trimEmpty(words); + /* lower-case exactly once; the lower-cased copy feeds the multiset directly */ + return ImmutableMultiset.copyOf(toLowerCase(words)); + } + + private Vocabulary buildVocabulary(CSVInputData reader, int numberOfWords) { + final Multiset vocabulary = HashMultiset.create(); + + for (final ArrayList values : reader) { + final String text = values.get(column); + vocabulary.addAll(toBagOfWords(text, separator)); + } + + final ArrayList> highestCountFirst = new ArrayList<>(vocabulary.entrySet()); + highestCountFirst.sort(getDecreasingCountComparator()); + + final ArrayList wordsToRetain = new ArrayList<>(numberOfWords); + for (final Multiset.Entry e : highestCountFirst.subList(0, Math.min(highestCountFirst.size(), numberOfWords))) { + wordsToRetain.add(e.getElement()); + } + + return new Vocabulary(wordsToRetain); + } + + @Override + protected void process(CSVInputData data, CSVPrinter printer) throws IOException { + final Vocabulary vocabulary = buildVocabulary(data, numberOfWords); + + if (data.hasHeader()) { + final ArrayList header = data.getHeader(); + header.remove(column); + for (final String h : vocabulary.getVocabularyList()) { + header.add(prefix + h); + } + printer.printRecord(header); + } + + for (final ArrayList record : data) { + final Multiset bagOfWords = toBagOfWords(record.get(column), separator); + final Integer[] vDocument = new 
Integer[vocabulary.size()]; + Arrays.fill(vDocument, 0); + for (final Multiset.Entry entry : bagOfWords.entrySet()) { + final int index = vocabulary.getIndex(entry.getElement()); + if (index == -1) { + continue; + } + vDocument[index] = entry.getCount(); + } + record.remove(column); + Collections.addAll(record, CSVUtils.toStringArray(vDocument)); + printer.printRecord(record); + } + } + + public static void main(String[] args) { + parseArgsAndRun(EncodeTextAsFrequency.class, args); + } + +} diff --git a/src/main/java/ai/preferred/regression/pe/EncodeValueAsOneHot.java b/src/main/java/ai/preferred/regression/pe/EncodeValueAsOneHot.java new file mode 100644 index 0000000..c70ec3d --- /dev/null +++ b/src/main/java/ai/preferred/regression/pe/EncodeValueAsOneHot.java @@ -0,0 +1,61 @@ +package ai.preferred.regression.pe; + +import ai.preferred.regression.io.CSVInputData; +import ai.preferred.regression.io.CSVUtils; +import ai.preferred.regression.pe.data.Vocabulary; +import org.apache.commons.csv.CSVPrinter; +import org.kohsuke.args4j.Option; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.*; + +public class EncodeValueAsOneHot extends ProcessingElement { + + private static final Logger LOGGER = LoggerFactory.getLogger(EncodeValueAsOneHot.class); + + @Option(name = "-c", aliases = {"--column"}, usage = "the index of the input column", required = true) + private int column; + + @Option(name = "-p", aliases = {"--prefix"}, usage = "the prefix of the new columns") + private String prefix = "VALUE:"; + + @Override + protected void process(CSVInputData data, CSVPrinter printer) throws IOException { + final Vocabulary vocabulary = buildVocabulary(data); + + if (data.hasHeader()) { + final ArrayList header = data.getHeader(); + header.remove(column); + for (final String h : vocabulary.getVocabularyList()) { + header.add(prefix + h); + } + printer.printRecord(header); + } + + for (final ArrayList record : data) { + 
final Integer[] vOneHot = new Integer[vocabulary.size()]; + Arrays.fill(vOneHot, 0); + final int index = vocabulary.getIndex(record.get(column)); + vOneHot[index] = 1; + record.remove(column); + Collections.addAll(record, CSVUtils.toStringArray(vOneHot)); + printer.printRecord(record); + } + } + + private Vocabulary buildVocabulary(CSVInputData reader) { + final Set vocabulary = new HashSet<>(); + for (final ArrayList record : reader) { + vocabulary.add(record.get(column)); + } + + return new Vocabulary(vocabulary); + } + + public static void main(String[] args) { + parseArgsAndRun(EncodeValueAsOneHot.class, args); + } + +} diff --git a/src/main/java/ai/preferred/regression/pe/Partition.java b/src/main/java/ai/preferred/regression/pe/Partition.java new file mode 100644 index 0000000..4e7e31b --- /dev/null +++ b/src/main/java/ai/preferred/regression/pe/Partition.java @@ -0,0 +1,42 @@ +package ai.preferred.regression.pe; + +import ai.preferred.regression.io.CSVInputData; +import org.apache.commons.csv.CSVPrinter; +import org.kohsuke.args4j.Option; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; + +public class Partition extends ProcessingElement { + + private static final Logger LOGGER = LoggerFactory.getLogger(Partition.class); + + @Option(name = "-p", aliases = {"--proportion"}, usage = "the proportion of data to be selected or excluded (ranges from 0.0 to 1.0)") + private double percent = 0.8; + + @Option(name = "-e", aliases = {"--exclude"}, usage = "takes the other half of the selection if specified") + private boolean exclude = false; + + @Override + protected void process(CSVInputData reader, CSVPrinter printer) throws IOException { + if (reader.hasHeader()) { + printer.printRecord(reader.getHeader()); + } + + final ArrayList> data = reader.getRecords(); + final int n = (int)Math.round(percent * data.size()); + + if (exclude) { + printer.printRecords(data.subList(n, data.size())); + } else 
{ + printer.printRecords(data.subList(0, n)); + } + } + + public static void main(String[] args) { + parseArgsAndRun(Partition.class, args); + } + +} diff --git a/src/main/java/ai/preferred/regression/pe/ProcessingElement.java b/src/main/java/ai/preferred/regression/pe/ProcessingElement.java new file mode 100644 index 0000000..1478193 --- /dev/null +++ b/src/main/java/ai/preferred/regression/pe/ProcessingElement.java @@ -0,0 +1,88 @@ +package ai.preferred.regression.pe; + +import ai.preferred.regression.io.CSVInputData; +import ai.preferred.regression.io.CSVUtils; +import org.apache.commons.csv.CSVPrinter; +import org.kohsuke.args4j.CmdLineException; +import org.kohsuke.args4j.CmdLineParser; +import org.kohsuke.args4j.Option; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; + +public abstract class ProcessingElement { + + private static final Logger LOGGER = LoggerFactory.getLogger(ProcessingElement.class); + + @Option(name = "-i", aliases = {"--input"}, usage = "the path to the input CSV file", required = true) + private File input; + + @Option(name = "-o", aliases = {"--output"}, usage = "the path to the output CSV file", required = true) + private File output; + + @Option(name = "-h", aliases = {"--header"}, usage = "specifies if the input CSV files have headers") + private boolean header = true; + + public ProcessingElement() { + } + + protected abstract void process(CSVInputData data, CSVPrinter printer) throws Exception; + + protected static void parseArgsAndRun(Class clazz, String[] args) { + ProcessingElement processingElement = null; + try { + processingElement = clazz.newInstance(); + } catch (IllegalAccessException | InstantiationException e) { + System.err.println("Please check if there is the public default constructor for the class: " + clazz.getCanonicalName()); + System.exit(1); + } + + if (args == null) { + System.out.println("=========== HELP ==========="); + System.out.println(); + 
System.out.println("Processing Element: " + clazz.getSimpleName() + ".class"); + System.out.println(); + System.out.println("Shell.run(" + clazz.getSimpleName() + ".class, \"\");"); + final CmdLineParser parser = new CmdLineParser(processingElement); + System.out.println(); + parser.printUsage(System.out); + System.out.println(); + System.out.println("============================"); + System.out.println(); + System.out.println(); + return; + } + + final CmdLineParser parser = new CmdLineParser(processingElement); + try { + parser.parseArgument(args); + } catch (CmdLineException e) { + System.err.println("ProcessingElement: " + clazz.getCanonicalName()); + System.err.println(e.getMessage()); + System.err.println(); + parser.printUsage(System.err); + System.exit(1); + } + + if (processingElement.input.equals(processingElement.output)) { + LOGGER.error("The input and output files point to the same location: {}", processingElement.input); + System.exit(1); + } + + try (final CSVPrinter printer = CSVUtils.printer(processingElement.output); + final CSVInputData reader = CSVUtils.reader(processingElement.input, processingElement.header)) { + try { + processingElement.process(reader, printer); + } catch (Exception e) { + LOGGER.error("Unexpected error: ", e); + System.exit(1); + } + } catch (IOException e) { + LOGGER.error("Unable to process files: ", e); + System.exit(1); + } + } + +} diff --git a/src/main/java/ai/preferred/regression/pe/ProjectColumns.java b/src/main/java/ai/preferred/regression/pe/ProjectColumns.java new file mode 100644 index 0000000..c376795 --- /dev/null +++ b/src/main/java/ai/preferred/regression/pe/ProjectColumns.java @@ -0,0 +1,61 @@ +package ai.preferred.regression.pe; + +import ai.preferred.regression.io.CSVInputData; +import org.apache.commons.csv.CSVPrinter; +import org.kohsuke.args4j.Option; +import org.kohsuke.args4j.spi.StringArrayOptionHandler; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; 
+import java.util.ArrayList; +import java.util.SortedSet; +import java.util.TreeSet; + +public class ProjectColumns extends ProcessingElement { + + private static final Logger LOGGER = LoggerFactory.getLogger(ProjectColumns.class); + + @Option(name = "-c", aliases = {"--columns"}, usage = "the column names separated by spaces", handler = StringArrayOptionHandler.class, required = true) + private String[] columns = new String[0]; + + private static SortedSet indicesOf(ArrayList header, String[] columns) { + final SortedSet indices = new TreeSet<>(); + for (final String name : columns) { + int index = header.indexOf(name); + if (index > -1) { + indices.add(index); + } + } + return indices; + } + + private static ArrayList projectIndices(ArrayList list, SortedSet indices) { + final ArrayList projection = new ArrayList<>(indices.size()); + for (int index : indices) { + projection.add(list.get(index)); + } + return projection; + } + + @Override + protected void process(CSVInputData data, CSVPrinter printer) throws IOException { + if (!data.hasHeader()) { + throw new IllegalArgumentException("ProjectColumns requires CSV with header!"); + } + + final ArrayList header = data.getHeader(); + final SortedSet indices = indicesOf(header, columns); + printer.printRecord(projectIndices(header, indices)); + + for (final ArrayList record : data) { + printer.printRecord(projectIndices(record, indices)); + } + + } + + public static void main(String[] args) { + parseArgsAndRun(ProjectColumns.class, args); + } + +} diff --git a/src/main/java/ai/preferred/regression/pe/RemoveColumn.java b/src/main/java/ai/preferred/regression/pe/RemoveColumn.java new file mode 100644 index 0000000..1b14bea --- /dev/null +++ b/src/main/java/ai/preferred/regression/pe/RemoveColumn.java @@ -0,0 +1,37 @@ +package ai.preferred.regression.pe; + +import ai.preferred.regression.io.CSVInputData; +import org.apache.commons.csv.CSVPrinter; +import org.kohsuke.args4j.Option; +import org.slf4j.Logger; +import 
org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; + +public class RemoveColumn extends ProcessingElement { + + private static final Logger LOGGER = LoggerFactory.getLogger(RemoveColumn.class); + + @Option(name = "-c", aliases = {"--column"}, usage = "the index of the column to be dropped", required = true) + private int column; + + @Override + protected void process(CSVInputData data, CSVPrinter printer) throws IOException { + if (data.hasHeader()) { + final ArrayList header = data.getHeader(); + header.remove(column); + printer.printRecord(header); + } + + for (final ArrayList record : data) { + record.remove(column); + printer.printRecord(record); + } + } + + public static void main(String[] args) { + parseArgsAndRun(RemoveColumn.class, args); + } + +} diff --git a/src/main/java/ai/preferred/regression/pe/SelectEquals.java b/src/main/java/ai/preferred/regression/pe/SelectEquals.java new file mode 100644 index 0000000..21aa719 --- /dev/null +++ b/src/main/java/ai/preferred/regression/pe/SelectEquals.java @@ -0,0 +1,39 @@ +package ai.preferred.regression.pe; + +import ai.preferred.regression.io.CSVInputData; +import org.apache.commons.csv.CSVPrinter; +import org.kohsuke.args4j.Option; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; + +public class SelectEquals extends ProcessingElement { + + private static final Logger LOGGER = LoggerFactory.getLogger(SelectEquals.class); + + @Option(name = "-c", aliases = {"--column"}, usage = "the index of the input column", required = true) + private int column; + + @Option(name = "-e", aliases = {"--equals"}, usage = "the value to be verified", required = true) + private String value; + + @Override + protected void process(CSVInputData data, CSVPrinter printer) throws IOException { + if (data.hasHeader()) { + printer.printRecord(data.getHeader()); + } + + for (final ArrayList record : data) { + if 
(value.equals(record.get(column))) { + printer.printRecord(record); + } + } + } + + public static void main(String[] args) { + parseArgsAndRun(SelectEquals.class, args); + } + +} diff --git a/src/main/java/ai/preferred/regression/pe/Shuffle.java b/src/main/java/ai/preferred/regression/pe/Shuffle.java new file mode 100644 index 0000000..cafc95c --- /dev/null +++ b/src/main/java/ai/preferred/regression/pe/Shuffle.java @@ -0,0 +1,36 @@ +package ai.preferred.regression.pe; + +import ai.preferred.regression.io.CSVInputData; +import org.apache.commons.csv.CSVPrinter; +import org.kohsuke.args4j.Option; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Random; + +public class Shuffle extends ProcessingElement { + + private static final Logger LOGGER = LoggerFactory.getLogger(Shuffle.class); + + @Option(name = "-s", aliases = {"--seed"}, usage = "random seed") + private long seed = 1; + + @Override + protected void process(CSVInputData reader, CSVPrinter printer) throws IOException { + if (reader.hasHeader()) { + printer.printRecord(reader.getHeader()); + } + + final ArrayList> data = reader.getRecords(); + Collections.shuffle(data, new Random(seed)); + printer.printRecords(data); + } + + public static void main(String[] args) { + parseArgsAndRun(Shuffle.class, args); + } + +} diff --git a/src/main/java/ai/preferred/regression/pe/SwapColumns.java b/src/main/java/ai/preferred/regression/pe/SwapColumns.java new file mode 100644 index 0000000..8e0b693 --- /dev/null +++ b/src/main/java/ai/preferred/regression/pe/SwapColumns.java @@ -0,0 +1,41 @@ +package ai.preferred.regression.pe; + +import ai.preferred.regression.io.CSVInputData; +import org.apache.commons.csv.CSVPrinter; +import org.kohsuke.args4j.Option; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; + 
+public class SwapColumns extends ProcessingElement { + + private static final Logger LOGGER = LoggerFactory.getLogger(SwapColumns.class); + + @Option(name = "-x", aliases = {"--column-x"}, usage = "the index of one column to be swapped", required = true) + private int column1; + + @Option(name = "-y", aliases = {"--column-y"}, usage = "the index of the other column to be swapped", required = true) + private int column2; + + @Override + protected void process(CSVInputData data, CSVPrinter printer) throws IOException { + if (data.hasHeader()) { + final ArrayList header = data.getHeader(); + Collections.swap(header, column1, column2); + printer.printRecord(header); + } + + for (final ArrayList record : data) { + Collections.swap(record, column1, column2); + printer.printRecord(record); + } + } + + public static void main(String[] args) { + parseArgsAndRun(SwapColumns.class, args); + } + +} diff --git a/src/main/java/ai/preferred/regression/pe/data/Vocabulary.java b/src/main/java/ai/preferred/regression/pe/data/Vocabulary.java new file mode 100644 index 0000000..ad621b6 --- /dev/null +++ b/src/main/java/ai/preferred/regression/pe/data/Vocabulary.java @@ -0,0 +1,50 @@ +package ai.preferred.regression.pe.data; + +import java.util.*; + +public class Vocabulary { + + private final ArrayList vocabularyList; + private final Map vocabularyMap; + + public Vocabulary(Collection vocabulary) { + vocabularyList = new ArrayList<>(vocabulary); + Collections.sort(vocabularyList); + vocabularyMap = new HashMap<>(vocabularyList.size()); + for (final String w : vocabularyList) { + vocabularyMap.put(w, vocabularyMap.size()); + } + } + + public List getVocabularyList() { + return Collections.unmodifiableList(vocabularyList); + } + + public String[] getVocabularyArray() { + return vocabularyList.toArray(new String[0]); + } + + public int getIndex(String w) { + final Integer index = vocabularyMap.get(w); + if (index == null) { + return -1; /* word not in the vocabulary; 0 is the valid index of the first word, so it cannot mean "absent" */ + } + return index; + } + + public String 
getWord(int index) { + if (index >= 0 && index < vocabularyList.size()) { + return vocabularyList.get(index); + } + throw new IllegalArgumentException("No such index in the vocabulary: " + index); + } + + public int size() { + return vocabularyList.size(); + } + + @Override + public String toString() { + return "Vocabulary{" + vocabularyList + '}'; + } +} diff --git a/src/main/java/ai/preferred/regression/plot/XYChart.java b/src/main/java/ai/preferred/regression/plot/XYChart.java new file mode 100644 index 0000000..d993b2b --- /dev/null +++ b/src/main/java/ai/preferred/regression/plot/XYChart.java @@ -0,0 +1,41 @@ +package ai.preferred.regression.plot; + +import org.jfree.chart.ChartFactory; +import org.jfree.chart.ChartPanel; +import org.jfree.chart.JFreeChart; +import org.jfree.chart.plot.XYPlot; +import org.jfree.chart.renderer.xy.XYLineAndShapeRenderer; +import org.jfree.data.xy.XYDataset; +import org.jfree.data.xy.XYSeries; +import org.jfree.data.xy.XYSeriesCollection; + +import javax.swing.*; +import java.awt.*; + +public class XYChart extends JFrame { + + private static final long serialVersionUID = 1L; + + public XYChart(String chartTitle, XYSeries data, XYSeries line) { + super("Linear Regression Plotter"); + final XYSeriesCollection collection = new XYSeriesCollection(); + collection.addSeries(data); + collection.addSeries(line); + final ChartPanel panel = new ChartPanel(createChart(collection, chartTitle)); + panel.setPreferredSize(new Dimension(640, 480)); + setContentPane(panel); + } + + private JFreeChart createChart(XYDataset dataset, String title) { + final JFreeChart chart = ChartFactory.createXYLineChart(title, "X", "Y", dataset); + final XYPlot plot = chart.getXYPlot(); + final XYLineAndShapeRenderer renderer = new XYLineAndShapeRenderer(); + renderer.setSeriesLinesVisible(0, false); + renderer.setSeriesShapesVisible(0, true); + renderer.setSeriesLinesVisible(1, true); + renderer.setSeriesShapesVisible(1, false); + plot.setRenderer(renderer); + 
return chart; + } + +} \ No newline at end of file diff --git a/src/main/java/ai/preferred/regression/reset/DataFiles.java b/src/main/java/ai/preferred/regression/reset/DataFiles.java new file mode 100644 index 0000000..83164cd --- /dev/null +++ b/src/main/java/ai/preferred/regression/reset/DataFiles.java @@ -0,0 +1,207 @@ +package ai.preferred.regression.reset; + +public class DataFiles { + + public static final String ICECREAM_CSV = "Consumption,Temperature\n" + + "0.386,5.00\n" + + "0.374,13.33\n" + + "0.393,17.22\n" + + "0.425,20.00\n" + + "0.406,20.56\n" + + "0.344,18.33\n" + + "0.327,16.11\n" + + "0.288,8.33\n" + + "0.269,0.00\n" + + "0.256,-4.44\n" + + "0.286,-2.22\n" + + "0.298,-3.33\n" + + "0.329,0.00\n" + + "0.318,4.44\n" + + "0.381,12.78\n" + + "0.381,17.22\n" + + "0.47,22.22\n" + + "0.443,22.22\n" + + "0.386,19.44\n" + + "0.342,15.56\n" + + "0.319,6.67\n" + + "0.307,4.44\n" + + "0.284,0.00\n" + + "0.326,-2.78\n" + + "0.309,-2.22\n" + + "0.359,0.56\n" + + "0.376,5.00\n" + + "0.416,11.11\n" + + "0.437,17.78\n" + + "0.548,21.67\n"; + + public static final String ICECREAM_RAW_CSV = "Id,Temperature,Consumption\n" + + "1,5.00,0.386\n" + + "2,13.33,0.374\n" + + "3,17.22,0.393\n" + + "4,20.00,0.425\n" + + "5,20.56,0.406\n" + + "6,18.33,0.344\n" + + "7,16.11,0.327\n" + + "8,8.33,0.288\n" + + "9,0.00,0.269\n" + + "10,-4.44,0.256\n" + + "11,-2.22,0.286\n" + + "12,-3.33,0.298\n" + + "13,0.00,0.329\n" + + "14,4.44,0.318\n" + + "15,12.78,0.381\n" + + "16,17.22,0.381\n" + + "17,22.22,0.47\n" + + "18,22.22,0.443\n" + + "19,19.44,0.386\n" + + "20,15.56,0.342\n" + + "21,6.67,0.319\n" + + "22,4.44,0.307\n" + + "23,0.00,0.284\n" + + "24,-2.78,0.326\n" + + "25,-2.22,0.309\n" + + "26,0.56,0.359\n" + + "27,5.00,0.376\n" + + "28,11.11,0.416\n" + + "29,17.78,0.437\n" + + "30,21.67,0.548\n"; + + public static final String AMAZON_CSV = "Id,Rating,Text\n" + + "1,5,I only spent less than ten on these so they're good for what I paid for\n" + + "2,5,I'm in love with these glasses.\n" 
+ + "3,5,Stylish. My kid loved them\n" + + "4,5,They came in great condition.\n" + + "5,5,These are really wonderful!\n" + + "6,5,these are GREAT quality\n" + + "7,5,She LOVES them!\n" + + "8,5,Love these.\n" + + "9,5,The quality is pretty good also.\n" + + "10,5,EXCELLENT PRODUCT\n" + + "11,5,I love them. Exactly what i wanted.\n" + + "12,5,Son love them\n" + + "13,5,He says they give him that style.\n" + + "14,5,Great value!!!\n" + + "15,5,Very complimentary!\n" + + "16,5,\"Cute, great quality, good fit.\"\n" + + "17,5,I love these glasses!!\n" + + "18,5,they fit perfectly.\n" + + "19,5,They look expensive and the fit is perfect\n" + + "20,5,Sturdy and good looking for a great price\n" + + "21,5,Very stylish! Great accessory to compliment an outfit\n" + + "22,5,Thanks so much my grandson enjoy them.\n" + + "23,5,Daughter loves them.\n" + + "24,5,Makes me look smarter in my tinder profile !\n" + + "25,1,the side arms keep breaking\n" + + "26,1,just look soooo cheap!\n" + + "27,1,Not my style.\n" + + "28,1,Mine arrived broken!! Not worth sending back.\n" + + "29,1,Dollar store quality.\n" + + "30,1,Not like picture.\n" + + "31,1,We're cheap and broke right away. \n" + + "32,1,\"These are so cheap looking, they are unwearable.\"\n" + + "33,1,Very Very VERY Round ! Not at all vintage .\n" + + "34,1,It's a peace of garbage. Feels so cheap and plastic.\n" + + "35,1,feel flimsy like it would break i returned it the next day\n" + + "36,1,lens have too much glare\n" + + "37,1,they look cheaply made and plastic\n" + + "38,1,Very cheap looking\n" + + "39,1,make me headache\n" + + "40,1,Feel apart after a week of getting them prescribed.\n" + + "41,1,Really cheap looking.\n" + + "42,1,Glasses are crooked and not made correctly.\n" + + "43,1,poorly made... 
broke after three days\n" + + "44,1,Sunglasses were very small.\n" + + "45,1,Look fake and cheap\n" + + "46,1,lens fell out on first day.\n" + + "47,1,Not really like it!\n" + + "48,1,Delivered broken.\n" + + "49,1,\"Overall, trash.\"\n" + + "50,1,\"Were broken when I opened the box, very disappointed\"\n" + + "51,1,Horrible lens fell out 2nd day!\n" + + "52,1,I don't like them.\n" + + "53,1,These hoes broke too I want my money\n" + + "54,1,Broke within the 3 days\n" + + "55,1,Little small but still good\n" + + "56,1,Super small\n" + + "57,1,Horrible desing\n" + + "58,1,It's broke\n" + + "59,1,Crooked and cheaply made.\n" + + "60,1,Poor quality\n"; + + public static final String CAMERA_CSV = "Id,Price (USD),Type,Focus\n" + + "1,949,MIRRORLESS,MANUAL\n" + + "2,99,DSLR,BOTH\n" + + "3,90,DSLR,BOTH\n" + + "4,80,DSLR,AUTO\n" + + "5,20,COMPACT,MANUAL\n" + + "6,50,COMPACT,AUTO\n" + + "7,49,COMPACT,AUTO\n" + + "8,30,COMPACT,AUTO\n" + + "9,800,MIRRORLESS,AUTO\n" + + "10,789,MIRRORLESS,MANUAL\n" + + "11,35,COMPACT,AUTO\n" + + "12,789,MIRRORLESS,BOTH\n"; + + public static final String AMAZON_EXTENDED = "Id,Rating,Text,Verified Purchase,Helpful\n" + + "1,5,I only spent less than ten on these so they're good for what I paid for,YES,11\n" + + "2,5,I'm in love with these glasses.,YES,2\n" + + "3,5,Stylish. My kid loved them,YES,2\n" + + "4,5,They came in great condition.,NO,0\n" + + "5,5,These are really wonderful!,YES,0\n" + + "6,5,these are GREAT quality,YES,0\n" + + "7,5,She LOVES them!,YES,0\n" + + "8,5,Love these.,YES,0\n" + + "9,5,The quality is pretty good also.,YES,3\n" + + "10,5,EXCELLENT PRODUCT,YES,0\n" + + "11,5,I love them. 
Exactly what i wanted.,NO,0\n" + + "12,5,Son love them,YES,0\n" + + "13,5,He says they give him that style.,YES,0\n" + + "14,5,Great value!!!,YES,0\n" + + "15,5,Very complimentary!,YES,0\n" + + "16,5,\"Cute, great quality, good fit.\",YES,1\n" + + "17,5,I love these glasses!!,YES,0\n" + + "18,5,they fit perfectly.,YES,0\n" + + "19,5,They look expensive and the fit is perfect,NO,0\n" + + "20,5,Sturdy and good looking for a great price,YES,0\n" + + "21,5,Very stylish! Great accessory to compliment an outfit,YES,0\n" + + "22,5,Thanks so much my grandson enjoy them.,YES,0\n" + + "23,5,Daughter loves them.,YES,0\n" + + "24,5,Makes me look smarter in my tinder profile !,YES,0\n" + + "25,1,the side arms keep breaking,YES,0\n" + + "26,1,just look soooo cheap!,NO,0\n" + + "27,1,Not my style.,NO,0\n" + + "28,1,Mine arrived broken!! Not worth sending back.,YES,0\n" + + "29,1,Dollar store quality.,YES,5\n" + + "30,1,Not like picture.,NO,0\n" + + "31,1,We're cheap and broke right away. ,YES,0\n" + + "32,1,\"These are so cheap looking, they are unwearable.\",YES,0\n" + + "33,1,Very Very VERY Round ! Not at all vintage .,YES,0\n" + + "34,1,It's a peace of garbage. Feels so cheap and plastic.,YES,0\n" + + "35,1,feel flimsy like it would break i returned it the next day,NO,0\n" + + "36,1,lens have too much glare,YES,0\n" + + "37,1,they look cheaply made and plastic,YES,0\n" + + "38,1,Very cheap looking,NO,0\n" + + "39,1,make me headache,YES,0\n" + + "40,1,Feel apart after a week of getting them prescribed.,NO,0\n" + + "41,1,Really cheap looking.,YES,0\n" + + "42,1,Glasses are crooked and not made correctly.,YES,7\n" + + "43,1,poorly made... 
broke after three days,YES,2\n" + + "44,1,Sunglasses were very small.,NO,0\n" + + "45,1,Look fake and cheap,YES,0\n" + + "46,1,lens fell out on first day.,NO,0\n" + + "47,1,Not really like it!,YES,2\n" + + "48,1,Delivered broken.,YES,0\n" + + "49,1,\"Overall, trash.\",YES,0\n" + + "50,1,\"Were broken when I opened the box, very disappointed\",NO,0\n" + + "51,1,Horrible lens fell out 2nd day!,YES,5\n" + + "52,1,I don't like them.,NO,0\n" + + "53,1,These hoes broke too I want my money,NO,0\n" + + "54,1,Broke within the 3 days,NO,0\n" + + "55,1,Little small but still good,NO,4\n" + + "56,1,Super small,NO,0\n" + + "57,1,Horrible desing,NO,0\n" + + "58,1,It's broke,NO,0\n" + + "59,1,Crooked and cheaply made.,NO,0\n" + + "60,1,Poor quality,YES,0\n"; + +} diff --git a/src/main/resources/log4j.properties b/src/main/resources/log4j.properties new file mode 100644 index 0000000..8ee929f --- /dev/null +++ b/src/main/resources/log4j.properties @@ -0,0 +1,22 @@ +log4j.rootLogger=ERROR, STDOUT +log4j.logger.ai.preferred.venom=INFO, STDOUT +log4j.logger.ai.preferred.venom.proxy=INFO, STDOUT +log4j.logger.ai.preferred.venom.storage=INFO, STDOUT +log4j.logger.ai.preferred.minerva=INFO, STDOUT +log4j.logger.ai.preferred.crawler=DEBUG, STDOUT + +log4j.additivity.ai.preferred.venom=false +log4j.additivity.ai.preferred.venom.proxy=false +log4j.additivity.ai.preferred.venom.storage=false +log4j.additivity.ai.preferred.minerva=false +log4j.additivity.ai.preferred.crawler=false + +log4j.appender.STDOUT=org.apache.log4j.ConsoleAppender +log4j.appender.STDOUT.Target=System.out +log4j.appender.STDOUT.layout=org.apache.log4j.EnhancedPatternLayout +log4j.appender.STDOUT.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n%throwable +log4j.appender.FILE=org.apache.log4j.RollingFileAppender +log4j.appender.FILE.File=log.txt +log4j.appender.FILE.Append=true +log4j.appender.FILE.layout=org.apache.log4j.EnhancedPatternLayout 
+log4j.appender.FILE.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n%throwable \ No newline at end of file diff --git a/src/test/java/.gitkeep b/src/test/java/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/test/resources/.gitkeep b/src/test/resources/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/temp/.gitkeep b/temp/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/utils/assemble.py b/utils/assemble.py new file mode 100644 index 0000000..40ece2c --- /dev/null +++ b/utils/assemble.py @@ -0,0 +1,38 @@ +import os.path +import os +import re + +CODE_PATTERN = re.compile(r'.+main.+?{(.+)}.+}', re.S) + +def begin_section(name): + m = 23 - int(len(name) / 2) + padding = '=' * m + result = padding + ' ' + name + ' ' + padding + if len(result) % 2 == 0: + result += '=' + print(result) + +def end_section(): + print('=' * 49) + +def main(): + exercise_dir = '../src/main/java/ai/preferred/regression/exercise/' + for fn in os.listdir(exercise_dir): + if fn.startswith('E') and not fn.startswith('E20'): + name = fn.split('.')[0] + begin_section(name) + with open(os.path.join(exercise_dir, fn), 'r', encoding='utf8') as f: + m = CODE_PATTERN.search(f.read()) + if m: + print() + print('public static void main(String[] args) {') + print(m.group(1).strip(' ')) + print('}') + print() + end_section() + print() + print() + +if __name__ == '__main__': + main() +