# Label names of the v5 minor-category framework.
# The /predict_multilabel endpoint passes this list as `labels=` to
# `predict_multilabel_sklearn`, so the API carries its own copy of the label
# set instead of importing `minor_cats` from pxtextmining.params.
# NOTE(review): presumably the order must match the label order the pickled
# model was trained with -- confirm before reordering or editing entries.
# NOTE(review): "Referals" is spelled this way in the label lists in
# pxtextmining/params.py as well; it is a runtime label, not a typo to fix here.
minor_cats_v5 = [
    "Gratitude/ good experience",
    "Negative experience",
    "Not assigned",
    "Organisation & efficiency",
    "Funding & use of financial resources",
    "Non-specific praise for staff",
    "Non-specific dissatisfaction with staff",
    "Staff manner & personal attributes",
    "Number & deployment of staff",
    "Staff responsiveness",
    "Staff continuity",
    "Competence & training",
    "Unspecified communication",
    "Staff listening, understanding & involving patients",
    "Information directly from staff during care",
    "Information provision & guidance",
    "Being kept informed, clarity & consistency of information",
    "Service involvement with family/ carers",
    "Patient contact with family/ carers",
    "Contacting services",
    "Appointment arrangements",
    "Appointment method",
    "Timeliness of care",
    "Pain management",
    "Diagnosis & triage",
    "Referals & continuity of care",
    "Length of stay/ duration of care",
    "Discharge",
    "Care plans",
    "Patient records",
    "Links with non-NHS organisations",
    "Cleanliness, tidiness & infection control",
    "Safety & security",
    "Provision of medical equipment",
    "Service location",
    "Transport to/ from services",
    "Parking",
    "Electronic entertainment",
    "Feeling safe",
    "Patient appearance & grooming",
    "Mental Health Act",
    "Equality, Diversity & Inclusion",
    "Admission",
    "Collecting patients feedback",
    "Labelling not possible",
    "Environment & Facilities",
    "Supplying & understanding medication",
    "Activities & access to fresh air",
    "Food & drink provision & facilities",
    "Sensory experience",
    "Impact of treatment/ care",
]
# Root endpoint, tagged "index". Returns a fixed {"test": "Hello"} payload,
# which is the same value given as the example in the `Test` response model.
@app.get("/", response_model=Test, tags=["index"])
def index():
    return {"test": "Hello"}
# Path of the labelled dataset CSV.
dataset = "datasets/hidden/merged_230608.csv"

# Seed value shared across the project.
# NOTE(review): presumably consumed by the train/test split and model
# constructors -- the consumers are not visible here, confirm before changing.
random_state = 42

# Name of the pretrained transformer checkpoint.
model_name = "distilbert-base-uncased"

# Maps the raw text of an "FFT question" to one of three standardised question
# types: "nonspecific", "what_good" or "could_improve".
# The three "Please describe any things about the 111 service ..." keys differ
# only in the line break embedded in the question ("\r\n", " \n" and "\n").
# "Nonspecific" / "nonspecific" map to themselves so already-standardised
# values pass through unchanged.
q_map = {
    "Please tell us why": "nonspecific",
    "Please tells us why you gave this answer?": "nonspecific",
    "FFT Why?": "nonspecific",
    "What was good?": "what_good",
    "Is there anything we could have done better?": "could_improve",
    "How could we improve?": "could_improve",
    "What could we do better?": "could_improve",
    "Please can you tell us why you gave your answer and what we could have done better?": "nonspecific",
    "Please describe any things about the 111 service that\r\nyou were particularly satisfied and/or dissatisfied with": "nonspecific",
    "Please describe any things about the 111 service that \nyou were particularly satisfied and/or dissatisfied with": "nonspecific",
    "Please describe any things about the 111 service that\nyou were particularly satisfied and/or dissatisfied with": "nonspecific",
    "Nonspecific": "nonspecific",
    "nonspecific": "nonspecific",
}
continuity of care", "Admission", - "Length of stay/ duration of care", "Discharge", "Care plans", "Patient records", "Impact of treatment/ care", - "Links with non-NHS organisations", ], "Food & diet": ["Food & drink provision & facilities"], "Category TBC": [ @@ -77,7 +72,6 @@ "Cleanliness, tidiness & infection control", "Sensory experience", "Environment & Facilities", - "Safety & security", "Provision of medical equipment", ], "Mental Health specifics": ["Mental Health Act"], @@ -93,12 +87,12 @@ # v6 20230602 merged_minor_cats = [ "Gratitude/ good experience", -# "Negative experience", + # "Negative experience", "Not assigned", "Organisation & efficiency", -# "Funding & use of financial resources", + # "Funding & use of financial resources", "Non-specific praise for staff", -# "Non-specific dissatisfaction with staff", + # "Non-specific dissatisfaction with staff", "Staff manner & personal attributes", "Number & deployment of staff", "Staff responsiveness", @@ -109,8 +103,8 @@ "Information directly from staff during care", "Information provision & guidance", "Being kept informed, clarity & consistency of information", -# "Service involvement with family/ carers", -# "Patient contact with family/ carers", + # "Service involvement with family/ carers", + # "Patient contact with family/ carers", "Contacting services", "Appointment arrangements", "Appointment method", @@ -118,14 +112,14 @@ "Pain management", "Diagnosis & triage", "Referals & continuity of care", -# "Length of stay/ duration of care", + # "Length of stay/ duration of care", "Discharge", "Care plans", -# "Patient records", -# "Links with non-NHS organisations", + # "Patient records", + # "Links with non-NHS organisations", "Cleanliness, tidiness & infection control", "Safety & security", -# "Provision of medical equipment", + # "Provision of medical equipment", "Service location", "Transport to/ from services", "Parking", @@ -135,7 +129,7 @@ "Mental Health Act", "Equality, Diversity & Inclusion", 
"Admission", -# "Collecting patients feedback", + # "Collecting patients feedback", "Labelling not possible", "Environment & Facilities", "Supplying & understanding medication", @@ -144,18 +138,14 @@ "Sensory experience", "Impact of treatment/ care", "Negative experience/ dissatisfaction", - "Family/ carers" + "Family/ carers", ] -# v5 20230419 +# v6 20230806 minor_cats = [ - "Gratitude/ good experience", - "Negative experience", "Not assigned", "Organisation & efficiency", "Funding & use of financial resources", - "Non-specific praise for staff", - "Non-specific dissatisfaction with staff", "Staff manner & personal attributes", "Number & deployment of staff", "Staff responsiveness", @@ -166,8 +156,6 @@ "Information directly from staff during care", "Information provision & guidance", "Being kept informed, clarity & consistency of information", - "Service involvement with family/ carers", - "Patient contact with family/ carers", "Contacting services", "Appointment arrangements", "Appointment method", @@ -175,13 +163,10 @@ "Pain management", "Diagnosis & triage", "Referals & continuity of care", - "Length of stay/ duration of care", "Discharge", "Care plans", "Patient records", - "Links with non-NHS organisations", "Cleanliness, tidiness & infection control", - "Safety & security", "Provision of medical equipment", "Service location", "Transport to/ from services", @@ -192,7 +177,6 @@ "Mental Health Act", "Equality, Diversity & Inclusion", "Admission", - "Collecting patients feedback", "Labelling not possible", "Environment & Facilities", "Supplying & understanding medication", @@ -200,6 +184,14 @@ "Food & drink provision & facilities", "Sensory experience", "Impact of treatment/ care", + # "Psychological therapy arrangements", + # "Existence of services", + # "Choice of services", + # "Out of hours support (community services)", + # "Learning organisation", + "Interaction with family/ carers", + "Negative experience & dissatisfaction", + "Positive experience & 
if __name__ == "__main__":
    # Only the BERT run on the v6 label set (`minor_cats`) is active. The
    # sklearn / SVC variants are left commented out so they can be switched
    # back on for comparison runs against the same label set.

    # run_sklearn_pipeline(
    #     additional_features=True,
    #     target=minor_cats,
    #     models_to_try=["xgb"],
    #     path="test_multilabel/v6_230806/xgb",
    #     include_analysis=True,
    # )
    # run_svc_pipeline(
    #     additional_features=True,
    #     target=minor_cats,
    #     path="test_multilabel/v6_230806/svc",
    #     include_analysis=True,
    # )
    run_bert_pipeline(
        additional_features=True,
        path="test_multilabel/v6_230806",
        target=minor_cats,
        include_analysis=True,
    )
    # run_sklearn_pipeline(
    #     additional_features=True,
    #     target=minor_cats,
    #     models_to_try=["svm"],
    #     path="test_multilabel/v6_230806/svc_2",
    #     include_analysis=True,
    # )
    # run_two_layer_sklearn_pipeline()
@pytest.fixture
def test_raw_data():
    """Build a small synthetic frame shaped like the raw labelled dataset.

    The frame has five rows and one column per metadata field plus one column
    per entry in ``minor_cats``:

    * "FFT categorical answer" and "Comment sentiment" hold integers in 1..5.
    * "FFT question" holds question texts drawn from the keys of ``q_map``.
    * Every other metadata column holds a 5-character token of uppercase
      letters and digits.
    * Each ``minor_cats`` column holds either ``1`` or NaN per row.

    Values come from a privately seeded generator, so every test run sees the
    same frame and a failing test can be reproduced.

    Returns:
        pandas.DataFrame: the synthetic raw data.
    """
    n_rows = 5
    # Private, seeded generator: leaves the global `random` state untouched
    # and keeps the fixture deterministic between runs.
    rng = random.Random(42)
    token_alphabet = string.ascii_uppercase + string.digits
    question_texts = list(q_map)  # built once, not once per draw

    # NOTE(review): "Service Type 1" is capitalised differently from
    # "Service type 2" (and from the "Service type 1" column of the previous
    # hand-written fixture) -- confirm against the real dataset header.
    metadata_cols = [
        "Comment ID",
        "Trust",
        "Respondent ID",
        "Date",
        "Service Type 1",
        "Service type 2",
        "FFT categorical answer",
        "FFT question",
        "FFT answer",
        "Comment sentiment",
    ]

    data_dict = {}
    for col in metadata_cols:
        if col in ("FFT categorical answer", "Comment sentiment"):
            data_dict[col] = [rng.randint(1, 5) for _ in range(n_rows)]
        elif col == "FFT question":
            data_dict[col] = [rng.choice(question_texts) for _ in range(n_rows)]
        else:
            data_dict[col] = [
                "".join(rng.choices(token_alphabet, k=5)) for _ in range(n_rows)
            ]

    # Label columns: 1 marks the label as assigned, NaN as not assigned.
    # `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
    for label in minor_cats:
        data_dict[label] = [rng.choice([np.nan, 1]) for _ in range(n_rows)]

    return pd.DataFrame(data_dict)