Merge pull request #108 from CDU-data-science-team/107_v6_framework

107 v6 framework
The-Strategy-Unit · Jun 12, 2023 · e078e12
2 parents 871252a + eb7dcf3
commit e078e12
Show file tree

Hide file tree

Showing 5 changed files with 158 additions and 902 deletions.
diff --git a/api/api.py b/api/api.py
@@ -10,6 +10,60 @@
     predict_multilabel_sklearn,
 )
 
+minor_cats_v5 = [
+    "Gratitude/ good experience",
+    "Negative experience",
+    "Not assigned",
+    "Organisation & efficiency",
+    "Funding & use of financial resources",
+    "Non-specific praise for staff",
+    "Non-specific dissatisfaction with staff",
+    "Staff manner & personal attributes",
+    "Number & deployment of staff",
+    "Staff responsiveness",
+    "Staff continuity",
+    "Competence & training",
+    "Unspecified communication",
+    "Staff listening, understanding & involving patients",
+    "Information directly from staff during care",
+    "Information provision & guidance",
+    "Being kept informed, clarity & consistency of information",
+    "Service involvement with family/ carers",
+    "Patient contact with family/ carers",
+    "Contacting services",
+    "Appointment arrangements",
+    "Appointment method",
+    "Timeliness of care",
+    "Pain management",
+    "Diagnosis & triage",
+    "Referals & continuity of care",
+    "Length of stay/ duration of care",
+    "Discharge",
+    "Care plans",
+    "Patient records",
+    "Links with non-NHS organisations",
+    "Cleanliness, tidiness & infection control",
+    "Safety & security",
+    "Provision of medical equipment",
+    "Service location",
+    "Transport to/ from services",
+    "Parking",
+    "Electronic entertainment",
+    "Feeling safe",
+    "Patient appearance & grooming",
+    "Mental Health Act",
+    "Equality, Diversity & Inclusion",
+    "Admission",
+    "Collecting patients feedback",
+    "Labelling not possible",
+    "Environment & Facilities",
+    "Supplying & understanding medication",
+    "Activities & access to fresh air",
+    "Food & drink provision & facilities",
+    "Sensory experience",
+    "Impact of treatment/ care",
+]
+
 description = """
 This API is for classifying patient experience qualitative data,
 utilising the models trained as part of the pxtextmining project.
@@ -28,11 +82,8 @@ class Test(BaseModel):
     test: str
 
     class Config:
-        schema_extra = {
-            "example": {
-                "test": "Hello"
-            }
-        }
+        schema_extra = {"example": {"test": "Hello"}}
+
 
 class ItemIn(BaseModel):
     comment_id: str
@@ -85,16 +136,16 @@ class Config:
         "name": "MIT License",
         "url": "https://github.com/CDU-data-science-team/pxtextmining/blob/main/LICENSE",
     },
-    openapi_tags=tags_metadata
+    openapi_tags=tags_metadata,
 )
 
 
-@app.get("/", response_model=Test, tags=['index'])
+@app.get("/", response_model=Test, tags=["index"])
 def index():
     return {"test": "Hello"}
 
 
-@app.post("/predict_multilabel", response_model=List[ItemOut], tags=['predict'])
+@app.post("/predict_multilabel", response_model=List[ItemOut], tags=["predict"])
 def predict(items: List[ItemIn]):
     """Accepts comment ids, comment text and question type as JSON in a POST request. Makes predictions using trained SVC model.
 
@@ -128,7 +179,7 @@ def predict(items: List[ItemIn]):
     with open(model_path, "rb") as model:
         loaded_model = pickle.load(model)
     preds_df = predict_multilabel_sklearn(
-        text_to_predict, loaded_model, additional_features=True
+        text_to_predict, loaded_model, labels=minor_cats_v5, additional_features=True
     )
     # Join predicted labels with received data
     preds_df["comment_id"] = preds_df.index.astype(str)

diff --git a/pxtextmining/factories/factory_data_load_and_split.py b/pxtextmining/factories/factory_data_load_and_split.py
@@ -120,7 +120,6 @@ def load_multilabel_data(filename, target="major_categories"):
         cols = ["Comment sentiment"]
     # Sort out the features first
     features_df = raw_data.loc[:, features].copy()
-    features_df = clean_empty_features(features_df)
     # Standardize FFT qs
     features_df['FFT question'] = features_df['FFT question'].fillna('nonspecific')
     features_df.loc[:, "FFT_q_standardised"] = (
@@ -135,6 +134,7 @@ def load_multilabel_data(filename, target="major_categories"):
     features_df.loc[:, "text_length"] = features_df.loc[:, "FFT answer"].apply(
         lambda x: len([word for word in str(x).split(" ") if word != ""])
     )
+    features_df = clean_empty_features(features_df)
     # Sort out the targets
     targets_df = raw_data.loc[:, cols].copy()
     targets_df = targets_df.replace("1", 1)
@@ -206,7 +206,11 @@ def process_data(df, target, preprocess_text=True, additional_features=False):
         X = clean_empty_features(X)
         print(f"After preprocessing, shape of X is {X.shape}")
     if preprocess_text == False:
-        X = df["FFT answer"].astype(str)
+        X_temp = df["FFT answer"].astype(str).apply(remove_punc_and_nums)
+        X_temp = clean_empty_features(X_temp)
+        print(f"After preprocessing, shape of X is {X_temp.shape}")
+        indices = X_temp.index
+        X = df["FFT answer"].astype(str).filter(indices)
     if additional_features == True:
         X = pd.merge(X, df[["FFT_q_standardised"]], left_index=True, right_index=True)
         X = X.reset_index()

diff --git a/pxtextmining/params.py b/pxtextmining/params.py
@@ -1,38 +1,36 @@
-dataset = "datasets/hidden/merged_20230602.csv"
+dataset = "datasets/hidden/merged_230608.csv"
 
-random_state = 99
+random_state = 42
 
 model_name = "distilbert-base-uncased"
 
 q_map = {
-        "Please tell us why": "nonspecific",
-        "Please tells us why you gave this answer?": "nonspecific",
-        "FFT Why?": "nonspecific",
-        "What was good?": "what_good",
-        "Is there anything we could have done better?": "could_improve",
-        "How could we improve?": "could_improve",
-        "What could we do better?": "could_improve",
-        "Please can you tell us why you gave your answer and what we could have done better?": "nonspecific",
-        "Please describe any things about the 111 service that\r\nyou were particularly satisfied and/or dissatisfied with": "nonspecific",
-        "Please describe any things about the 111 service that \nyou were particularly satisfied and/or dissatisfied with": "nonspecific",
-        "Please describe any things about the 111 service that\nyou were particularly satisfied and/or dissatisfied with": 'nonspecific',
-        "Nonspecific": 'nonspecific',
-        "nonspecific": 'nonspecific'
-    }
+    "Please tell us why": "nonspecific",
+    "Please tells us why you gave this answer?": "nonspecific",
+    "FFT Why?": "nonspecific",
+    "What was good?": "what_good",
+    "Is there anything we could have done better?": "could_improve",
+    "How could we improve?": "could_improve",
+    "What could we do better?": "could_improve",
+    "Please can you tell us why you gave your answer and what we could have done better?": "nonspecific",
+    "Please describe any things about the 111 service that\r\nyou were particularly satisfied and/or dissatisfied with": "nonspecific",
+    "Please describe any things about the 111 service that \nyou were particularly satisfied and/or dissatisfied with": "nonspecific",
+    "Please describe any things about the 111 service that\nyou were particularly satisfied and/or dissatisfied with": "nonspecific",
+    "Nonspecific": "nonspecific",
+    "nonspecific": "nonspecific",
+}
 
+# v6
 major_cat_dict = {
     "General": [
         "Labelling not possible",
-        "Gratitude/ good experience",
-        "Negative experience",
+        "Positive experience & gratitude",
+        "Negative experience & dissatisfaction",
         "Not assigned",
         "Organisation & efficiency",
         "Funding & use of financial resources",
-        "Collecting patients feedback",
     ],
     "Staff": [
-        "Non-specific praise for staff",
-        "Non-specific dissatisfaction with staff",
         "Staff manner & personal attributes",
         "Number & deployment of staff",
         "Staff responsiveness",
@@ -45,8 +43,7 @@
         "Information directly from staff during care",
         "Information provision & guidance",
         "Being kept informed, clarity & consistency of information",
-        "Service involvement with family/ carers",
-        "Patient contact with family/ carers",
+        "Interaction with family/ carers",
     ],
     "Access to medical care & support": [
         "Contacting services",
@@ -59,12 +56,10 @@
         "Diagnosis & triage",
         "Referals & continuity of care",
         "Admission",
-        "Length of stay/ duration of care",
         "Discharge",
         "Care plans",
         "Patient records",
         "Impact of treatment/ care",
-        "Links with non-NHS organisations",
     ],
     "Food & diet": ["Food & drink provision & facilities"],
     "Category TBC": [
@@ -77,7 +72,6 @@
         "Cleanliness, tidiness & infection control",
         "Sensory experience",
         "Environment & Facilities",
-        "Safety & security",
         "Provision of medical equipment",
     ],
     "Mental Health specifics": ["Mental Health Act"],
@@ -93,12 +87,12 @@
 # v6 20230602
 merged_minor_cats = [
     "Gratitude/ good experience",
-#     "Negative experience",
+    #     "Negative experience",
     "Not assigned",
     "Organisation & efficiency",
-#     "Funding & use of financial resources",
+    #     "Funding & use of financial resources",
     "Non-specific praise for staff",
-#     "Non-specific dissatisfaction with staff",
+    #     "Non-specific dissatisfaction with staff",
     "Staff manner & personal attributes",
     "Number & deployment of staff",
     "Staff responsiveness",
@@ -109,23 +103,23 @@
     "Information directly from staff during care",
     "Information provision & guidance",
     "Being kept informed, clarity & consistency of information",
-#     "Service involvement with family/ carers",
-#     "Patient contact with family/ carers",
+    #     "Service involvement with family/ carers",
+    #     "Patient contact with family/ carers",
     "Contacting services",
     "Appointment arrangements",
     "Appointment method",
     "Timeliness of care",
     "Pain management",
     "Diagnosis & triage",
     "Referals & continuity of care",
-#     "Length of stay/ duration of care",
+    #     "Length of stay/ duration of care",
     "Discharge",
     "Care plans",
-#     "Patient records",
-#     "Links with non-NHS organisations",
+    #     "Patient records",
+    #     "Links with non-NHS organisations",
     "Cleanliness, tidiness & infection control",
     "Safety & security",
-#     "Provision of medical equipment",
+    #     "Provision of medical equipment",
     "Service location",
     "Transport to/ from services",
     "Parking",
@@ -135,7 +129,7 @@
     "Mental Health Act",
     "Equality, Diversity & Inclusion",
     "Admission",
-#     "Collecting patients feedback",
+    #     "Collecting patients feedback",
     "Labelling not possible",
     "Environment & Facilities",
     "Supplying & understanding medication",
@@ -144,18 +138,14 @@
     "Sensory experience",
     "Impact of treatment/ care",
     "Negative experience/ dissatisfaction",
-    "Family/ carers"
+    "Family/ carers",
 ]
 
-# v5 20230419
+# v6 20230608
 minor_cats = [
-    "Gratitude/ good experience",
-    "Negative experience",
     "Not assigned",
     "Organisation & efficiency",
     "Funding & use of financial resources",
-    "Non-specific praise for staff",
-    "Non-specific dissatisfaction with staff",
     "Staff manner & personal attributes",
     "Number & deployment of staff",
     "Staff responsiveness",
@@ -166,22 +156,17 @@
     "Information directly from staff during care",
     "Information provision & guidance",
     "Being kept informed, clarity & consistency of information",
-    "Service involvement with family/ carers",
-    "Patient contact with family/ carers",
     "Contacting services",
     "Appointment arrangements",
     "Appointment method",
     "Timeliness of care",
     "Pain management",
     "Diagnosis & triage",
     "Referals & continuity of care",
-    "Length of stay/ duration of care",
     "Discharge",
     "Care plans",
     "Patient records",
-    "Links with non-NHS organisations",
     "Cleanliness, tidiness & infection control",
-    "Safety & security",
     "Provision of medical equipment",
     "Service location",
     "Transport to/ from services",
@@ -192,14 +177,21 @@
     "Mental Health Act",
     "Equality, Diversity & Inclusion",
     "Admission",
-    "Collecting patients feedback",
     "Labelling not possible",
     "Environment & Facilities",
     "Supplying & understanding medication",
     "Activities & access to fresh air",
     "Food & drink provision & facilities",
     "Sensory experience",
     "Impact of treatment/ care",
+    # "Psychological therapy arrangements",
+    # "Existence of services",
+    # "Choice of services",
+    # "Out of hours support (community services)",
+    # "Learning organisation",
+    "Interaction with family/ carers",
+    "Negative experience & dissatisfaction",
+    "Positive experience & gratitude",
 ]
 
 sentiment_dict = {

diff --git a/pxtextmining/pipelines/multilabel_pipeline.py b/pxtextmining/pipelines/multilabel_pipeline.py
@@ -357,15 +357,16 @@ def run_two_layer_sklearn_pipeline(
 
 
 if __name__ == "__main__":
-    run_bert_pipeline(additional_features = True, path = 'test_multilabel/merged_cats_0607', target = merged_minor_cats)
-    run_sklearn_pipeline(additional_features = True, target= merged_minor_cats, models_to_try = ["xgb"], path = 'test_multilabel/merged_cats_0607/xgb',
-                         include_analysis=True)
-    run_svc_pipeline(
-        additional_features=True,
-        target=merged_minor_cats,
-        path="test_multilabel/merged_cats_0607/svc",
-        include_analysis=True
-    )
-    run_sklearn_pipeline(additional_features = True, target= merged_minor_cats, models_to_try = ["svm"], path = 'test_multilabel/merged_cats_0607/svc_2',
-                         include_analysis=True)
+
+    # run_sklearn_pipeline(additional_features = True, target= minor_cats, models_to_try = ["xgb"], path = 'test_multilabel/v6_230806/xgb',
+    #                      include_analysis=True)
+    # run_svc_pipeline(
+    #     additional_features=True,
+    #     target=minor_cats,
+    #     path="test_multilabel/v6_230806/svc",
+    #     include_analysis=True
+    # )
+    run_bert_pipeline(additional_features = True, path = 'test_multilabel/v6_230806', target = minor_cats, include_analysis=True)
+    # run_sklearn_pipeline(additional_features = True, target= minor_cats, models_to_try = ["svm"], path = 'test_multilabel/v6_230806/svc_2',
+    #                      include_analysis=True)
     # run_two_layer_sklearn_pipeline()