In [2]:
!pip install radon
!pip install transformers
!pip install git+https://github.com/ericsuh/dirichlet.git
!pip install gdown
!gdown 1HGcDvwBsV8jid5URzEuG-2xywFQTERpe
!gdown 1qKmRb1CAButi3gePN6BO8jPa6TiATVKR

Collecting git+https://github.com/ericsuh/dirichlet.git
  Cloning https://github.com/ericsuh/dirichlet.git to /tmp/pip-req-build-jtqjlsf_
  Running command git clone --filter=blob:none --quiet https://github.com/ericsuh/dirichlet.git /tmp/pip-req-build-jtqjlsf_
  Resolved https://github.com/ericsuh/dirichlet.git to commit 8e832cc55ced9150e30ea3a7402f594896c5a527
  Preparing metadata (setup.py) ... [?25ldone
Downloading...
From: https://drive.google.com/uc?id=1HGcDvwBsV8jid5URzEuG-2xywFQTERpe
To: /kaggle/working/ranknet_estimator.pt
100%|████████████████████████████████████████| 600k/600k [00:00<00:00, 1.30MB/s]
Downloading...
From: https://drive.google.com/uc?id=1qKmRb1CAButi3gePN6BO8jPa6TiATVKR
To: /kaggle/working/standard_scaler
100%|██████████████████████████████████████████| 973/973 [00:00<00:00, 5.69MB/s]


In [3]:
import math
import numpy as np
import pandas as pd
import re
from radon.raw import analyze
from radon.complexity import cc_visit
from radon.metrics import h_visit
from radon.metrics import mi_parameters
from html.parser import HTMLParser
import dirichlet
import torch
import torch.nn as nn
import torch.nn.functional as f
from transformers import AutoTokenizer, AutoModelForCausalLM
from joblib import load



In [4]:
# Use the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)

Using device: cuda


In [6]:
class RankNet(nn.Module):
    """Feed-forward scorer that maps a feature vector to a single score.

    Layer stack: Linear(num_features, 512) -> Dropout(0.5) -> LeakyReLU(0.2)
    -> Linear(512, 256) -> Dropout(0.5) -> LeakyReLU(0.2) -> Linear(256, 1).
    The layers stay in one ``nn.Sequential`` named ``model`` in this exact
    order, so the ``model.<index>.*`` keys of a saved state dict keep matching.
    """

    def __init__(self, num_features):
        """Build the network for inputs with ``num_features`` columns."""
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(num_features, 512),
            nn.Dropout(0.5),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(512, 256),
            nn.Dropout(0.5),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(256, 1)
        )

    def forward(self, input):
        """Return the output of the final linear layer for each row of ``input``.

        No activation is applied to the result.
        """
        # Call the sub-module itself instead of its ``forward`` method so that
        # hooks registered on ``self.model`` are executed.
        return self.model(input)

class PerceptionEstimator():
    class CodeMetricsEvaluator():
        class SnippetsGetter(HTMLParser):
            """HTML parser that collects the text found inside ``<code>`` elements.

            Every chunk of character data reported while a ``<code>`` tag is
            open is stored as a separate snippet, in document order.
            """

            def __init__(self):
                self._s = []             # collected snippet texts
                self._code_flag = False  # True while inside <code> ... </code>
                super().__init__()

            def refresh_status(self):
                """Forget collected snippets and any input the parser still buffers."""
                self._s = []
                self._code_flag = False
                # HTMLParser keeps input it could not finish parsing (for
                # example a truncated tag at the end of the previous document)
                # and would prepend it to the next feed(); reset() drops it.
                self.reset()

            def get_status(self):
                """Return the list of snippets collected since the last refresh."""
                return self._s

            def handle_starttag(self, tag, attrs):
                if tag == 'code':
                    self._code_flag = True

            def handle_endtag(self, tag):
                if tag == 'code':
                    self._code_flag = False

            def handle_data(self, data):
                # Text outside <code> elements is ignored.
                if self._code_flag:
                    self._s.append(data)

        def __init__(self):
            self._snippets_getter = self.SnippetsGetter()

        @staticmethod
        def _get_code_statistics(code_snippet: str):
            """Compute radon-based size and complexity metrics for one snippet.

            :code_snippet: Python source text
            :return: dictionary with raw line counts, the number of blocks
                reported by ``cc_visit``, Halstead counts and the quantities
                derived from them, the cyclomatic complexity and a
                maintainability index divided by 171

            Any exception raised by radon for a snippet it cannot analyse is
            left to the caller.
            """
            def floor_at_one(value):
                # Keeps logarithms and divisions defined when a count is 0.
                return np.clip(value, 1, np.inf)

            raw = analyze(code_snippet)._asdict()
            block_count = len(cc_visit(code_snippet))
            halstead = h_visit(code_snippet).total._asdict()

            h1, h2 = halstead["h1"], halstead["h2"]
            volume = halstead["length"] * np.log2(floor_at_one(halstead["vocabulary"]))  # N * log2(h)
            difficulty = h1 * halstead["N2"] / (2 * floor_at_one(h2))  # h1 / 2 * N2 / h2

            res = {
                "CodeLines": raw["loc"],
                "LogicalCodeLines": raw["lloc"],
                "SourceCodeLines": raw["sloc"],
                "Multilines": raw["multi"],
                "NumberOfFunctions": block_count,
                "DistinctOperators": h1,
                "DistinctOperands": h2,
                "TotalOperators": halstead["N1"],
                "TotalOperands": halstead["N2"],
                "Vocabulary": halstead["vocabulary"],  # h1 + h2
                "LogicalLength": halstead["length"],  # N1 + N2
                "CalculatedLength": (
                    h1 * np.log2(floor_at_one(h1))
                    + h2 * np.log2(floor_at_one(h2))
                ),
                "Volume": volume,
                "Difficulty": difficulty,
                "Effort": difficulty * volume,  # Difficulty * Volume
            }

            # Per radon's documentation the tuple is (Halstead volume,
            # cyclomatic complexity, logical LOC, comment percentage).
            mi_volume, mi_complexity, mi_lloc, mi_comments = mi_parameters(code_snippet)[:4]
            res["CyclomaticComplexity"] = mi_complexity
            res["MaintainabilityIndex"] = (
                171
                - 5.2 * np.log(floor_at_one(mi_volume))
                - 0.23 * mi_complexity
                - 16.2 * np.log(floor_at_one(mi_lloc))
                + 50 * np.sin(np.sqrt(2.46 * np.clip(mi_comments, 0, np.inf)))
            ) / 171

            return res

        def evaluate_code_metrics(self, answer: str) -> dict:
            code_statistics = {
                'CodeLines': [],
                'LogicalCodeLines': 0,
                'SourceCodeLines': 0,
                'Multilines': 0,
                'NumberOfFunctions': 0,
                'CyclomaticComplexity': 0,
                'DistinctOperators': 0,
                'DistinctOperands': 0,
                'TotalOperators': 0,
                'TotalOperands': 0,
                'Vocabulary': 0,
                'LogicalLength': 0,
                'CalculatedLength': 0,
                'Volume': 0,
                'Difficulty': 0,
                'Effort': 0,
                'MaintainabilityIndex': []
            }

            cnt = 0
            try:
                self._snippets_getter.refresh_status()
                self._snippets_getter.feed(answer)
                snippets = [
                    "\n".join([
                            code_line if re.compile("print [^(]").search(code_line) is None
                                    else re.sub("print[^(].*", "print(" + code_line.split("print")[1] + ")", code_line)
                            for code_line in code_snippet.split('\n')
                        ])
                    for code_snippet in self._snippets_getter.get_status()
                ]
                for code_snippet in snippets:
                    try:
                        snippet_statistics = self._get_code_statistics(code_snippet)
                        for field in code_statistics.keys():
                            if field in ('CodeLines', 'MaintainabilityIndex'):
                                code_statistics[field].append(snippet_statistics[field])
                            else:
                                code_statistics[field] += snippet_statistics[field]
                        cnt += 1
                    except: # interactive-type code
                        try:
                            code_snippet = "\n".join([
                                                re.sub(">>> |\.\.\. ", "", code_line)
                                                for code_line in code_snippet.split('\n')
                                                if re.search("^[>>>|...]", code_line)
                                            ])
                            snippet_statistics = self._get_code_statistics(code_snippet)
                            for field in code_statistics.keys():
                                if field in ('CodeLines', 'MaintainabilityIndex'):
                                    code_statistics[field].append(snippet_statistics[field])
                                else:
                                    code_statistics[field] += snippet_statistics[field]
                            cnt += 1
                        except:
                            pass
            except Exception as err:
                print(f"Error: an error was received while parsing code fragments ({err})")

            code_statistics['NumberOfSnippets'] = cnt
            if np.sum(code_statistics['CodeLines']) == 0:
                print("Warning: code fragments have not been parsed. The response may not contain code or may have syntax errors")
                code_statistics['TotalCodeLines'] = 0
                code_statistics['AvgSnippetCodeLines'] = 0
                code_statistics['MaintainabilityIndexCodeLinesAvg'] = 0
                code_statistics['MaintainabilityIndexUniformAvg'] = 0
            else:
                code_statistics['TotalCodeLines'] = np.sum(code_statistics['CodeLines'])
                code_statistics['AvgSnippetCodeLines'] = np.mean(code_statistics['CodeLines'])
                code_statistics['MaintainabilityIndexCodeLinesAvg'] = np.multiply(
                                                                                code_statistics['MaintainabilityIndex'],
                                                                                code_statistics['CodeLines']
                                                                            ).sum() / np.sum(code_statistics['CodeLines'])
                code_statistics['MaintainabilityIndexUniformAvg'] = np.mean(code_statistics['MaintainabilityIndex'])

            code_statistics.pop('CodeLines')
            code_statistics.pop('MaintainabilityIndex')
            return code_statistics

    class LLMBasedRelevanceEstimator():
        class HTMLtoPromptsConverter(HTMLParser):
            """HTML parser that concatenates the character data of a document.

            Tags are dropped; the text between them is appended to one string.
            """

            def __init__(self):
                self._s = ""  # text collected so far
                super().__init__()

            def refresh_status(self):
                """Forget the collected text and any input the parser still buffers."""
                self._s = ""
                # HTMLParser holds back input it cannot finish parsing yet
                # (e.g. text ending in an unterminated "&..." reference) and
                # would prepend it to the next document; reset() drops it.
                self.reset()

            def get_status(self):
                """Return the text collected since the last refresh."""
                return self._s

            def handle_data(self, data):
                # Collapse a run of trailing newlines into a single one.
                # ``endswith`` (unlike ``data[-1]``) is also safe for empty data.
                trailing_newline = '\n' if data.endswith('\n') else ''
                self._s += data.rstrip('\n') + trailing_newline

        def __init__(self, device):
            """Load the GPT-Neo 2.7B causal language model and its tokenizer.

            :device: torch device the language model is moved to
            """
            checkpoint = "EleutherAI/gpt-neo-2.7B"
            self._device = device
            # ``trust_remote_code`` is intentionally not enabled: GPT-Neo is
            # implemented inside transformers itself, so there is no reason to
            # allow Python code from the model repository to be executed.
            self._model = AutoModelForCausalLM.from_pretrained(
                checkpoint,
                output_hidden_states=True,
                revision="main").to(device)
            self._converter = self.HTMLtoPromptsConverter()
            self._tokenizer = AutoTokenizer.from_pretrained(checkpoint)
            
        @staticmethod
        def _convert_spherical(input):
            r2 = (input**2).sum(dim=1)
            angles = torch.acos(torch.clip(
                                    input[:, :-1] / torch.flip(torch.sqrt(
                                        torch.cumsum(
                                            torch.flip(input**2, dims=(1,)),
                                            dim=1)[:, 1:]
                                    ), dims=(1,)), min=-1, max=1))
            angles[input[:, -1] < 0] = 2 * np.pi - angles[input[:, -1] < 0]
            return torch.concat((torch.sqrt(r2).unsqueeze(dim=1), angles), dim=1)

        @staticmethod
        def _convert_cartesian(input):
            a = torch.concat((torch.tensor([2*np.pi] * input.size()[0]).unsqueeze(dim=1), input[:, 1:]), dim=1)
            sin = torch.sin(a)
            sin[:, 0] = 1
            sin = torch.cumprod(sin, dim=1)
            cos = torch.cos(a)
            cos = torch.roll(cos, -1)
            return sin*cos*input[:, 0].unsqueeze(dim=1)

        @staticmethod
        def _get_dirichlet_distribution_features(q_emb, a_emb, prefix, normalization_function):
            """
            Fit a Dirichlet distribution to each embedding set and compare the fits.

            normalization_function: torch.tensor of size (N, M) -> torch.tensor of size (N, M)
            transform distribution from R^n to [-1, 1]^n
            prefix: string prepended to every key of the returned dictionary
            :return: dictionary with the L1 distance and the sum of squared
                differences between the fitted parameters, and the KL
                divergence in both directions
            """
            def fit_parameters(emb):
                # Shift the normalised values from [-1, 1] to [0, 1] before the MLE fit.
                shifted = (normalization_function(emb).squeeze().numpy() + 1) / 2
                return torch.as_tensor(dirichlet.mle(shifted))

            q_dirichlet_params = fit_parameters(q_emb)
            a_dirichlet_params = fit_parameters(a_emb)
            param_gap = q_dirichlet_params - a_dirichlet_params

            kl_divergence = torch.distributions.kl.kl_divergence
            q_distribution = torch.distributions.dirichlet.Dirichlet(q_dirichlet_params)
            a_distribution = torch.distributions.dirichlet.Dirichlet(a_dirichlet_params)

            return {
                prefix + "DirichletParamsL1Dist": torch.abs(param_gap).sum().item(),
                prefix + "DirichletParamsL2Dist": (param_gap**2).sum().item(),
                prefix + "DirichletQAKLDiveregence": kl_divergence(q_distribution, a_distribution).item(),
                prefix + "DirichletAQKLDiveregence": kl_divergence(a_distribution, q_distribution).item(),
            }

        @staticmethod
        def _get_norm_diag_distribution_features(q_emb, a_emb):
            gaussian_features = {}
            q_mean = q_emb.mean(dim=0)
            q_dimwise_var = q_emb.var(dim=0, correction=1)
            a_mean = a_emb.mean(dim=0)
            a_dimwise_var = a_emb.var(dim=0, correction=1)
            gaussian_features["DiagNormQAKLDiveregence"] = 0.5 * (
                                                                ((a_mean - q_mean) * torch.pow(a_dimwise_var, -1) * (a_mean - q_mean)).sum().item()
                                                                + (torch.pow(a_dimwise_var, -1) * q_dimwise_var).sum().item()
                                                                + torch.log(a_dimwise_var).sum().item()
                                                                - torch.log(q_dimwise_var).sum().item()
                                                                - q_emb.size(dim=1)
                                                            )
            gaussian_features["DiagNormAQKLDiveregence"] = 0.5 * (
                                                                ((q_mean - a_mean) * torch.pow(q_dimwise_var, -1) * (q_mean - a_mean)).sum().item()
                                                                +(torch.pow(q_dimwise_var, -1) * a_dimwise_var).sum().item()
                                                                + torch.log(q_dimwise_var).sum().item()
                                                                - torch.log(a_dimwise_var).sum().item()
                                                                - q_emb.size(dim=1)
                                                            )
            return gaussian_features

        def _get_sample_features(self, q_emb, a_emb):
            """
            get_sample_features return a few relevance metrics
            :q_emb: torch.tensor of size (N, M)
            :a_emb: torch.tensor of size (N, M)
            :return: dictionary of metrics
            """
            sample_features = {}
            sample_features["AvgCosineSimilarity"] = f.cosine_similarity(
                                                                torch.unsqueeze(q_emb.mean(dim=0), 0),
                                                                torch.unsqueeze(a_emb.mean(dim=0), 0),
                                                                dim=1,
                                                                eps=1e-8).item()
            sample_features["AvgL1NormCosineSimilarity"] = f.cosine_similarity(
                                                                    torch.unsqueeze(f.normalize(q_emb, p=1, dim=1).mean(dim=0), 0),
                                                                    torch.unsqueeze(f.normalize(a_emb, p=1, dim=1).mean(dim=0), 0),
                                                                    dim=1,
                                                                    eps=1e-8).item()
            sample_features["AvgL2NormCosineSimilarity"] = f.cosine_similarity(
                                                                    torch.unsqueeze(f.normalize(q_emb, p=2, dim=1).mean(dim=0), 0),
                                                                    torch.unsqueeze(f.normalize(a_emb, p=2, dim=1).mean(dim=0), 0),
                                                                    dim=1,
                                                                    eps=1e-8).item()

            q_spherical_coordinates = self._convert_spherical(q_emb).sum(dim=0).unsqueeze(dim=0)
            q_spherical_coordinates[:, 1:] = torch.fmod(q_spherical_coordinates[:, 1:], 2*np.pi) / q_emb.size(dim=0)
            q_spherical_mean = self._convert_cartesian(q_spherical_coordinates)
            a_spherical_coordinates = self._convert_spherical(a_emb).sum(dim=0).unsqueeze(dim=0)
            a_spherical_coordinates[:, 1:] = torch.fmod(a_spherical_coordinates[:, 1:], 2*np.pi) / a_emb.size(dim=0)
            a_spherical_mean = self._convert_cartesian(a_spherical_coordinates)
            sample_features["SphericalAvgL1Dist"] = torch.abs(q_spherical_mean - a_spherical_mean).sum().item()
            sample_features["SphericalAvgL2Dist"] = ((q_spherical_mean - a_spherical_mean)**2).sum().item()
            sample_features["SphericalAvgCosineSimilarity"] = f.cosine_similarity(
                                                                        q_spherical_mean,
                                                                        a_spherical_mean,
                                                                        dim=1,
                                                                        eps=1e-8).item()

            sample_features.update(self._get_dirichlet_distribution_features(
                                                                        q_emb,
                                                                        a_emb,
                                                                        "L2Norm",
                                                                        lambda x: f.normalize(x, p=2, dim=1)))
            sample_features.update(self._get_dirichlet_distribution_features(
                                                                        q_emb,
                                                                        a_emb,
                                                                        "ArctanNorm",
                                                                        lambda x: torch.atan(x) / (np.pi/2 + 1e-8)))

            sample_features.update(self._get_norm_diag_distribution_features(q_emb, a_emb))
            return sample_features

        def estimate_relevance(self, question: str, answer: str) -> dict:
            try:
                with torch.no_grad():
                    self._converter.refresh_status()
                    self._converter.feed(question)
                    q_ids = self._tokenizer(
                                        self._converter.get_status(),
                                        return_tensors="pt",
                                        max_length=2048,
                                        truncation=True).input_ids
                    self._converter.refresh_status()
                    self._converter.feed(answer)
                    a_ids = self._tokenizer(
                                        self._converter.get_status(),
                                        return_tensors="pt",
                                        max_length=2048,
                                        truncation=True).input_ids
            except Exception as err:
                print(f"Error: LLM tokenization error ({err})")

            self._model.eval()
            with torch.no_grad():
                try:
                    q_emb = self._model(q_ids.to(self._device))["hidden_states"][-1].squeeze().to("cpu")
                    a_emb = self._model(a_ids.to(self._device))["hidden_states"][-1].squeeze().to("cpu")
                    relevance_statistics = self._get_sample_features(q_emb, a_emb)
                except Exception as err:
                    print(f"Error: LLM evaluation error ({err})")
                    relevance_statistics = {
                        "AvgCosineSimilarity": 0.0,
                        "AvgL1NormCosineSimilarity": 0.0,
                        "AvgL2NormCosineSimilarity": 0.0,
                        "SphericalAvgL1Dist": 0.0,
                        "SphericalAvgL2Dist": 0.0,
                        "SphericalAvgCosineSimilarity": 0.0,
                        "L2NormDirichletParamsL1Dist": 0.0,
                        "L2NormDirichletParamsL2Dist": 0.0,
                        "L2NormDirichletQAKLDiveregence": 0.0,
                        "L2NormDirichletAQKLDiveregence": 0.0,
                        "ArctanNormDirichletParamsL1Dist": 0.0,
                        "ArctanNormDirichletParamsL2Dist": 0.0,
                        "ArctanNormDirichletQAKLDiveregence": 0.0,
                        "ArctanNormDirichletAQKLDiveregence": 0.0,
                        "DiagNormQAKLDiveregence": 0.0,
                        "DiagNormAQKLDiveregence": 0.0
                    }
            return relevance_statistics

    def __init__(self, device):
        """Load every component of the estimator.

        :device: torch device used for the language model and as the
            ``map_location`` when reading the RankNet weights

        Reads the fitted scaler and ``ranknet_estimator.pt`` from the current
        working directory.
        """
        self._relevance_estimator = self.LLMBasedRelevanceEstimator(device)
        self._metrics_evaluator = self.CodeMetricsEvaluator()
        self._device = device
        # NOTE: joblib files are pickles - only load a scaler from a trusted source.
        try:
            self._std_scaler = load('std_scaler')
        except FileNotFoundError:
            # The download keeps the file's original name, 'standard_scaler',
            # unless an output name is given; accept that name as well.
            self._std_scaler = load('standard_scaler')
        # 34 = number of columns produced by _feature_preprocessing.
        self._final_estimator = RankNet(34)
        self._final_estimator.load_state_dict(
                    torch.load('ranknet_estimator.pt', map_location=device))

    @staticmethod
    def _feature_preprocessing(features):
        eps=1e-3
        to_log = [
            'LogicalCodeLines',
            'SourceCodeLines',
            'CyclomaticComplexity',
            'NumberOfSnippets',
            'TotalCodeLines',
            'AvgSnippetCodeLines',
            'SphericalAvgL1Dist',
            'SphericalAvgL2Dist',
            'L2NormDirichletParamsL1Dist',
            'L2NormDirichletParamsL2Dist',
            'L2NormDirichletQAKLDiveregence',
            'L2NormDirichletAQKLDiveregence',
            'ArctanNormDirichletParamsL1Dist',
            'ArctanNormDirichletParamsL2Dist',
            'ArctanNormDirichletQAKLDiveregence',
            'ArctanNormDirichletAQKLDiveregence',
            'DiagNormQAKLDiveregence',
            'DiagNormAQKLDiveregence'
            ]
        to_inv_log = [
            'MaintainabilityIndexCodeLinesAvg',
            'MaintainabilityIndexUniformAvg',
            'AvgCosineSimilarity',
            'AvgL1NormCosineSimilarity',
            'AvgL2NormCosineSimilarity',
            ]
        to_id = [
            'MaintainabilityIndexCodeLinesAvg',
            'MaintainabilityIndexUniformAvg',
            'AvgCosineSimilarity',
            'AvgL1NormCosineSimilarity',
            'AvgL2NormCosineSimilarity',
            'SphericalAvgL1Dist',
            'L2NormDirichletParamsL1Dist',
            'L2NormDirichletQAKLDiveregence',
            'ArctanNormDirichletParamsL1Dist',
            'DiagNormQAKLDiveregence',
            'DiagNormAQKLDiveregence']
        return np.concatenate((
            np.log(np.clip(features[to_log].values, eps, np.inf)),
            np.log(1 - np.clip(features[to_inv_log].values, -np.inf, 1-eps)),
            features[to_id].values
        ), axis=(len(features.shape) > 1) + 0)

    def predict(self, question: str, answer: str) -> float:
        res = 0
        metrics = pd.DataFrame([{
                        **self._metrics_evaluator.evaluate_code_metrics(answer),
                        **self._relevance_estimator.estimate_relevance(question, answer)
                    }])
        input_vector = self._feature_preprocessing(metrics)
        try:
            self._final_estimator.eval()
            with torch.no_grad():
                res = torch.sigmoid(
                        self._final_estimator.forward(
                            torch.as_tensor(
                                np.clip(
                                    self._std_scaler.transform(
                                        self._feature_preprocessing(metrics)
                                        .astype(np.float32)
                                    ),
                                -3, 3)
                            )
                        )
                    ).item()
        except Exception as err:
            print(f"Error: {err}")
        return res

In [7]:
# Build the full estimator: this loads the GPT-Neo language model and
# tokenizer, and reads the fitted scaler and the RankNet weights from the
# working directory.
model = PerceptionEstimator(device=device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [44]:
# Raw string: the text contains backslashes that are not valid Python escape
# sequences and must reach the parser verbatim.
# NOTE(review): `<\code>` is not an HTML closing tag (that would be `</code>`),
# so it is passed on as plain text - confirm this is intended. It is kept
# unchanged here so the recorded scores below still correspond to this input.
question = r"""
I think what I want to do is a fairly common task but I've found no reference on the web.
I have text with punctuation, and I want a list of the words.

"Hey, you - what are you doing here!?"
should be

<code>['hey', 'you', 'what', 'are', 'you', 'doing', 'here']<\code>
But Python's <code>str.split()<\code> only works with one argument, so I have all words with the punctuation after I split with whitespace. Any ideas?
"""

In [69]:
# Raw string: `\W` and `\c` are not valid Python escape sequences; the raw
# literal keeps the text byte-for-byte identical without escape warnings.
# NOTE(review): `<\code>` is not an HTML closing tag (that would be `</code>`),
# so the first <code> element is never closed - confirm this is intended. It is
# kept unchanged so the recorded score still corresponds to this input.
ans_1 = r"""
<code>
re.split()
re.split(pattern, string[, maxsplit=0])
<\code>
Split string by the occurrences of pattern.
If capturing parentheses are used in pattern, then the text of all groups in the pattern are also returned as part of the resulting list. If maxsplit is nonzero, at most maxsplit splits occur, and the remainder of the string is returned as the final element of the list. (Incompatibility note: in the original Python 1.5 release, maxsplit was ignored. This has been fixed in later releases.)
<code>
>>> re.split('\W+', 'Words, words, words.')
['Words', 'words', 'words', '']
>>> re.split('(\W+)', 'Words, words, words.')
['Words', ', ', 'words', ', ', 'words', '.', '']
>>> re.split('\W+', 'Words, words, words.', 1)
['Words', 'words, words.']
<\code>
"""
print(model.predict(question, ans_1))

0.6481162905693054


In [61]:
# Raw string: `\w` is not a valid Python escape sequence; the raw literal keeps
# the text byte-for-byte identical without an escape warning.
ans_2 = r"""
A case where regular expressions are justified:
<code>
import re
DATA = "Hey, you - what are you doing here!?"
print(re.findall(r"[\w']+", DATA))
# Prints ['Hey', 'you', 'what', 'are', 'you', 'doing', 'here']
</code>
"""
print(model.predict(question, ans_2))

0.4903918206691742


In [76]:
# Candidate answer 3: one sentence of prose followed by a single interactive
# (`>>>`-prefixed) snippet; scored against `question`.
ans_3 = """
Another quick way to do this without a regexp is to replace the characters first, as below:
<code>
>>> 'a;bcd,ef g'.replace(';',' ').replace(',',' ').split()
['a', 'bcd', 'ef', 'g']
</code>
"""
print(model.predict(question, ans_3))

0.5053433775901794


In [73]:
# Raw string: `\-` is not a valid Python escape sequence; the raw literal keeps
# the text byte-for-byte identical without an escape warning.
ans_4 = r"""
So many answers, yet I can't find any solution that does efficiently what the title of the questions literally asks for (splitting on multiple possible separators—instead, many answers split on anything that is not a word, which is different). So here is an answer to the question in the title, that relies on Python's standard and efficient re module:
<code>
>>> import re  # Will be splitting on: , <space> - ! ? :
>>> filter(None, re.split("[, \-!?:]+", "Hey, you - what are you doing here!?"))
['Hey', 'you', 'what', 'are', 'you', 'doing', 'here']
</code>
where:

the […] matches one of the separators listed inside,
the \- in the regular expression is here to prevent the special interpretation of - as a character range indicator (as in A-Z),
the + skips one or more delimiters (it could be omitted thanks to the <code>filter()</code>, but this would unnecessarily produce empty strings between matched single-character separators), and
<code>filter(None, …)</code> removes the empty strings possibly created by leading and trailing separators (since empty strings have a false boolean value).
This <code>re.split()</code> precisely "splits with multiple separators", as asked for in the question title.

This solution is furthermore immune to the problems with non-ASCII characters in words found in some other solutions (see the first comment to ghostdog74's answer).

The re module is much more efficient (in speed and concision) than doing Python loops and tests "by hand"!
"""
print(model.predict(question, ans_4))

0.7338582277297974


In [74]:
# Candidate answer 5: a short three-line snippet surrounded by informal prose;
# scored against `question`.
ans_5 = """
got same problem as @ooboo and find this topic @ghostdog74 inspired me, maybe someone finds my solution usefull
<code>
str1='adj:sg:nom:m1.m2.m3:pos'
splitat=':.'
''.join([ s if s not in splitat else ' ' for s in str1]).split()
</code>
input something in space place and split using same character if you dont want to split at spaces.
"""
print(model.predict(question, ans_5))

0.3530653715133667


In [66]:
# Candidate answer 6: a function definition snippet plus an interactive
# snippet that uses a Python 2 `print` statement; scored against `question`.
ans_6 = """
<code>
def get_words(s):
    l = []
    w = ''
    for c in s.lower():
        if c in '-!?,. ':
            if w != '': 
                l.append(w)
            w = ''
        else:
            w = w + c
    if w != '': 
        l.append(w)
    return l
</code>
Here is the usage:
<code>
>>> s = "Hey, you - what are you doing here!?"
>>> print get_words(s)
['hey', 'you', 'what', 'are', 'you', 'doing', 'here']
</code>
"""
print(model.predict(question, ans_6))

0.11576487123966217
