Replaced sum() with pulp.lpSum() - Speed up + Cplex integration
avineshpvs committed Feb 14, 2018
1 parent f89681e commit 50653be
Showing 4 changed files with 100 additions and 66 deletions.
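For context on the change itself: Python's built-in `sum()` folds PuLP terms one `+` at a time, allocating a new intermediate `LpAffineExpression` at each step, whereas `pulp.lpSum()` builds the whole expression in a single pass. A minimal sketch of the difference (a toy timing, not code from this repository):

```
import time
import pulp

# Purely illustrative: 20,000 binary variables.
x = [pulp.LpVariable('x%d' % i, cat='Binary') for i in range(20000)]

t0 = time.time()
slow = sum(x)          # new intermediate expression per addition
t1 = time.time()
fast = pulp.lpSum(x)   # single-pass construction of the same expression
t2 = time.time()

print('built-in sum: %.2fs, pulp.lpSum: %.2fs' % (t1 - t0, t2 - t1))
```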
62 changes: 40 additions & 22 deletions README.md
@@ -40,32 +40,46 @@ Installation

1. Download the ROUGE package from the [link](https://www.isi.edu/licensed-sw/see/rouge/) and place it in the `rouge` directory.

>> mv RELEASE-1.5.5 rouge/
```
mv RELEASE-1.5.5 rouge/
```

2. Install required python packages.

>> pip install -r requirements.txt
```
pip install -r requirements.txt
```
3. Download the Stanford Parser models and jars from the [link](https://nlp.stanford.edu/software/lex-parser.shtml)

>> mv englishPCFG.ser.gz germanPCFG.ser.gz jars/
>> mv stanford-parser.jar stanford-parser-3.6.0-models.jar jars/
```
mv englishPCFG.ser.gz germanPCFG.ser.gz jars/
mv stanford-parser.jar stanford-parser-3.6.0-models.jar jars/
```
4. [Optional] To run the system with active learning models

Download the Google embeddings (English) from the [link](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/)
>> mkdir -p summarizer/data/embeddings/english
>> mv GoogleNews-vectors-negative300.bin.gz summarizer/data/embeddings/english
Download the News, Wikipedia embeddings (German) from the [link](https://public.ukp.informatik.tu-darmstadt.de/reimers/2014_german_embeddings/2014_tudarmstadt_german_50mincount.vec)
>> mkdir -p summarizer/data/embeddings/german
>> mv 2014_tudarmstadt_german_50mincount.vec summarizer/data/embeddings/german
ToRun
Download the Google embeddings (English) from the [link](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/)

```
mkdir -p summarizer/data/embeddings/english
mv GoogleNews-vectors-negative300.bin.gz summarizer/data/embeddings/english
```
Download the News and Wikipedia embeddings (German) from the [link](https://public.ukp.informatik.tu-darmstadt.de/reimers/2014_german_embeddings/2014_tudarmstadt_german_50mincount.vec)

```
mkdir -p summarizer/data/embeddings/german
mv 2014_tudarmstadt_german_50mincount.vec summarizer/data/embeddings/german
```
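To sanity-check the downloaded embeddings, a minimal sketch, shown for the English vectors (assumes `gensim` is installed; the repository's own loading code may differ):

```
from gensim.models import KeyedVectors

# Hypothetical check, not part of the pipeline: load the vectors and
# confirm their dimensionality.
model = KeyedVectors.load_word2vec_format(
    'summarizer/data/embeddings/english/GoogleNews-vectors-negative300.bin.gz',
    binary=True)
print(model['summary'].shape)  # expected: (300,)
```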

5. [Optional] To solve ILPs with CPLEX (faster), obtain it from IBM: [link](https://ibm.com/software/commerce/optimization/cplex-optimizer/), then install the cplex Python package.

```
cd cplex_installation_dir/python
python setup.py install
```
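To confirm that PuLP can actually reach the CPLEX binding, a quick hedged check (assumes both `pulp` and the `cplex` package are installed):

```
import pulp

# Prints True when PuLP can invoke CPLEX; if False, the code in this
# commit falls back to the solver named in the `solver` argument.
print(pulp.CPLEX(msg=0).available())
```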
To Run
-------

1. Make sure that you have the raw datasets available. Each raw dataset needs to be extracted and must follow this directory structure:
@@ -88,9 +102,10 @@ ToRun


2. Before running the pipeline, you have to preprocess the raw datasets using the `make_data.py` script. Replace DUC_TEST with the appropriate dataset and run the same command.

python summarizer/data_processer/make_data.py -d DUC_TEST -p summarizer/data/raw -a parse -l english

```
python summarizer/data_processer/make_data.py -d DUC_TEST -p summarizer/data/raw -a parse -l english
```
The results should then be copied into a directory. We recommend using the `--iobasedir` argument to set that directory.

+--+datasets/
@@ -144,9 +159,10 @@ ToRun

3. Run `python pipeline.py --help` for more details.

python pipeline.py --summary_size=100 --oracle_type=accept_reject --data_set=DUC_TEST --summarizer_type=feedback
```
python pipeline.py --summary_size=100 --oracle_type=accept_reject --data_set=DUC_TEST --summarizer_type=feedback --language=english
python pipeline.py --summary_size=100 --oracle_type=accept_reject --data_set=DUC_TEST --summarizer_type=baselines --language=english --rouge=rouge/RELEASE-1.5.5/ --iobasedir=outputs/

```

Dataset notes
=============
@@ -178,7 +194,9 @@ Verified by one (1) user.
1. install perl module `XML::DOM`
1. install python modules

pip install -r requirements.txt
```
pip install -r requirements.txt
```

1. configure the Eclipse PyDev run configuration as set up here:

55 changes: 31 additions & 24 deletions summarizer/algorithms/simulated_feedback.py
@@ -24,6 +24,7 @@
from summarizer.utils.data_helpers import prune_ngrams, extract_ngrams2, get_parse_info, \
prune_phrases


RECOMMENDER_METHOD_SAMPLING = "SAMPLING"
RECOMMENDER_METHOD_HIGHEST_WEIGHT = "HIGHEST_WEIGHT"

@@ -380,17 +381,19 @@ def check_break_condition(self, iteration, prev_summary, summary, ub_summary, pr
return 1
return 0

def solve_joint_ilp(self, summary_size, feedback, non_feedback, uncertainity={}, labels={}, unique=False,
solver='glpk', excluded_solutions=[]):
def solve_joint_ilp(self, summary_size, feedback, non_feedback, uncertainity={}, labels={}, unique=False, solver='glpk', excluded_solutions=[]):
"""
:param summary_size: The size of the knapsack, i.e. how many words are allowed in the summary.
:param feedback:
:param non_feedback:
:param unique: if True, eq. (5) of boudin_2015 is applied to enforce a unique solution.
:param solver: default glpk
:param solver: CPLEX is tried first; if it fails, fall back to this solver (default: glpk)
:param excluded_solutions:
:return:
:return: a (value, set) tuple: the value of the objective function and the set of
selected sentence indices.
"""
w = self.summarizer.weights
u = uncertainity
@@ -442,31 +445,32 @@ def solve_joint_ilp(self, summary_size, feedback, non_feedback, uncertainity={},
# OBJECTIVE FUNCTION
if labels:
print('solve for Active learning 2')
prob += sum(w[non_feedback[i]] * (1.0 - u[non_feedback[i]]) * labels[non_feedback[i]] * nf[i] for i in range(NF))
prob += pulp.lpSum(w[non_feedback[i]] * (1.0 - u[non_feedback[i]]) * labels[non_feedback[i]] * nf[i] for i in range(NF))
if not labels:
if uncertainity:
print('solve for Active learning')
if feedback:
# In this phase, we force new concepts to be chosen, and not those we already have feedback on, and
# therefore non_feedback is added while feedback is subtracted from the problem. I.e. by
# subtracting the feedback, those sentences will disappear from the solution.
prob += sum(w[non_feedback[i]] * u[non_feedback[i]] * nf[i] for i in range(NF)) - sum(
prob += pulp.lpSum(w[non_feedback[i]] * u[non_feedback[i]] * nf[i] for i in range(NF)) - pulp.lpSum(
w[feedback[i]] * u[feedback[i]] * f[i] for i in range(F))
else:
prob += sum(w[non_feedback[i]] * u[non_feedback[i]] * nf[i] for i in range(NF))
prob += pulp.lpSum(w[non_feedback[i]] * u[non_feedback[i]] * nf[i] for i in range(NF))
if not uncertainity:
print('solve for ILP feedback')
if feedback:
prob += sum(w[non_feedback[i]] * nf[i] for i in range(NF)) - sum(w[feedback[i]] * f[i] for i in range(F))
prob += pulp.lpSum(w[non_feedback[i]] * nf[i] for i in range(NF)) - pulp.lpSum(w[feedback[i]] * f[i] for i in range(F))
else:
prob += sum(w[non_feedback[i]] * nf[i] for i in range(NF))
prob += pulp.lpSum(w[non_feedback[i]] * nf[i] for i in range(NF))

if unique:
prob += sum(w[non_feedback[i]] * nf[i] for i in range(NF)) - sum(w[feedback[i]] * f[i] for i in range(F)) + \
10e-6 * sum(f[tokens[k]] * t[k] for k in range(T))
prob += pulp.lpSum(w[non_feedback[i]] * nf[i] for i in range(NF)) - pulp.lpSum(w[feedback[i]] * f[i] for i in range(F)) + \
10e-6 * pulp.lpSum(f[tokens[k]] * t[k] for k in range(T))

# CONSTRAINT FOR SUMMARY SIZE
prob += sum(s[j] * self.summarizer.sentences[j].length for j in range(S)) <= L
prob += pulp.lpSum(s[j] * self.summarizer.sentences[j].length for j in range(S)) <= L

# INTEGRITY CONSTRAINTS
for i in range(NF):
@@ -475,7 +479,7 @@ def solve_joint_ilp(self, summary_size, feedback, non_feedback, uncertainity={},
prob += s[j] <= nf[i]

for i in range(NF):
prob += sum(s[j] for j in range(S)
prob += pulp.lpSum(s[j] for j in range(S)
if non_feedback[i] in self.summarizer.sentences[j].concepts) >= nf[i]

for i in range(F):
@@ -484,7 +488,7 @@ def solve_joint_ilp(self, summary_size, feedback, non_feedback, uncertainity={},
prob += s[j] <= f[i]

for i in range(F):
prob += sum(s[j] for j in range(S)
prob += pulp.lpSum(s[j] for j in range(S)
if feedback[i] in self.summarizer.sentences[j].concepts) >= f[i]

# WORD INTEGRITY CONSTRAINTS
@@ -494,24 +498,27 @@ def solve_joint_ilp(self, summary_size, feedback, non_feedback, uncertainity={},
prob += s[j] <= t[k]

for k in range(T):
prob += sum(s[j] for j in self.summarizer.w2s[tokens[k]]) >= t[k]
prob += pulp.lpSum(s[j] for j in self.summarizer.w2s[tokens[k]]) >= t[k]

# CONSTRAINTS FOR FINDING OPTIMAL SOLUTIONS
for sentence_set in excluded_solutions:
prob += sum([s[j] for j in sentence_set]) <= len(sentence_set) - 1
prob += pulp.lpSum([s[j] for j in sentence_set]) <= len(sentence_set) - 1

# prob.writeLP('test.lp')

# solving the ilp problem
if solver == 'gurobi':
prob.solve(pulp.GUROBI(msg=0))
elif solver == 'glpk':
prob.solve(pulp.GLPK(msg=0))
elif solver == 'cplex':
try:
print('Solving using CPLEX')
prob.solve(pulp.CPLEX(msg=0))
else:
sys.exit('no solver specified')

except:
print('Fallback to mentioned solver')
if solver == 'gurobi':
prob.solve(pulp.GUROBI(msg=0))
elif solver == 'glpk':
prob.solve(pulp.GLPK(msg=0))
else:
sys.exit('no solver specified')

# retrieve the optimal subset of sentences
solution = set([j for j in range(S) if s[j].varValue == 1])

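The try-CPLEX-first pattern above recurs at every solver call site in this commit. A self-contained sketch of the idea on a toy knapsack ILP (illustrative only; the variable names and weights are invented):

```
import sys
import pulp

# Toy knapsack: maximise weight subject to a length budget of 8.
weights = [3.0, 1.0, 2.0, 5.0]
lengths = [4, 2, 3, 6]
x = [pulp.LpVariable('x%d' % i, cat='Binary') for i in range(4)]

prob = pulp.LpProblem('toy_knapsack', pulp.LpMaximize)
prob += pulp.lpSum(weights[i] * x[i] for i in range(4))
prob += pulp.lpSum(lengths[i] * x[i] for i in range(4)) <= 8

solver = 'glpk'
try:
    prob.solve(pulp.CPLEX(msg=0))       # prefer CPLEX when available
except Exception:
    if solver == 'gurobi':              # otherwise honour the requested solver
        prob.solve(pulp.GUROBI(msg=0))
    elif solver == 'glpk':
        prob.solve(pulp.GLPK(msg=0))
    else:
        sys.exit('no solver specified')

print([i for i in range(4) if x[i].varValue == 1])
```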
14 changes: 10 additions & 4 deletions summarizer/algorithms/upper_bound_ilp.py
@@ -154,17 +154,23 @@ def solve_ilp(self, N):

# Define ILP problem, maximum coverage of grams from the reference summaries
prob = pulp.LpProblem("ExtractiveUpperBound", pulp.LpMaximize)
prob += sum(z[j] for j in self.ref_ngrams_idx)
prob += pulp.lpSum(z[j] for j in self.ref_ngrams_idx)

# Define ILP constraints, length constraint and consistency constraint (impose that z_j is 1 if j
# appears in the created summary)
prob += sum(x[i] * self.sentences[i].length for i in self.sentences_idx) <= self.sum_length
prob += pulp.lpSum(x[i] * self.sentences[i].length for i in self.sentences_idx) <= self.sum_length

for j in self.ref_ngrams_idx:
prob += sum(A[i][j] * x[i] for i in self.sentences_idx) >= z[j]
prob += pulp.lpSum(A[i][j] * x[i] for i in self.sentences_idx) >= z[j]

# Solve ILP problem and post-processing to get the summary
prob.solve(pulp.GLPK(msg=0))
try:
print('Solving using CPLEX')
prob.solve(pulp.CPLEX(msg=0))
except:
print('Solving using GLPK')
prob.solve(pulp.GLPK(msg=0))


summary_idx = []
for idx in self.sentences_idx:
35 changes: 19 additions & 16 deletions summarizer/baselines/sume/models/concept_based.py
@@ -591,17 +591,17 @@ def solve_ilp_problem(self,
cat='Integer')

# OBJECTIVE FUNCTION
prob += sum(w[concepts[i]] * c[i] for i in range(C))
prob += pulp.lpSum(w[concepts[i]] * c[i] for i in range(C))

if unique:
prob += sum(w[concepts[i]] * c[i] for i in range(C)) + \
10e-6 * sum(f[tokens[k]] * t[k] for k in range(T))
prob += pulp.lpSum(w[concepts[i]] * c[i] for i in range(C)) + \
10e-6 * pulp.lpSum(f[tokens[k]] * t[k] for k in range(T))

# CONSTRAINT FOR SUMMARY SIZE
if units == "WORDS":
prob += sum(s[j] * self.sentences[j].length for j in range(S)) <= L
prob += pulp.lpSum(s[j] * self.sentences[j].length for j in range(S)) <= L
if units == "CHARACTERS":
prob += sum(s[j] * len(self.sentences[j].untokenized_form) for j in range(S)) <= L
prob += pulp.lpSum(s[j] * len(self.sentences[j].untokenized_form) for j in range(S)) <= L


# INTEGRITY CONSTRAINTS
@@ -611,7 +611,7 @@ def solve_ilp_problem(self,
prob += s[j] <= c[i]

for i in range(C):
prob += sum(s[j] for j in range(S)
prob += pulp.lpSum(s[j] for j in range(S)
if concepts[i] in self.sentences[j].concepts) >= c[i]

# WORD INTEGRITY CONSTRAINTS
@@ -621,23 +621,26 @@ def solve_ilp_problem(self,
prob += s[j] <= t[k]

for k in range(T):
prob += sum(s[j] for j in self.w2s[tokens[k]]) >= t[k]
prob += pulp.lpSum(s[j] for j in self.w2s[tokens[k]]) >= t[k]

# CONSTRAINTS FOR FINDING OPTIMAL SOLUTIONS
for sentence_set in excluded_solutions:
prob += sum([s[j] for j in sentence_set]) <= len(sentence_set)-1
prob += pulp.lpSum([s[j] for j in sentence_set]) <= len(sentence_set)-1

# prob.writeLP('test.lp')

# solving the ilp problem
if solver == 'gurobi':
prob.solve(pulp.GUROBI(msg=0))
elif solver == 'glpk':
prob.solve(pulp.GLPK(msg=0))
elif solver == 'cplex':
prob.solve(pulp.CPLEX(msg=0))
else:
sys.exit('no solver specified')
try:
print('Solving using Cplex')
prob.solve(pulp.CPLEX(msg=0))
except:
print('Fallback to mentioned solver')
if solver == 'gurobi':
prob.solve(pulp.GUROBI(msg=0))
elif solver == 'glpk':
prob.solve(pulp.GLPK(msg=0))
else:
sys.exit('no solver specified')

# retrieve the optimal subset of sentences
solution = set([j for j in range(S) if s[j].varValue == 1])
