In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import pickle as pk

In [2]:
import unidecode
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.svm import SVC
from sklearn.metrics import classification_report

import nltk
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

from nltk.tokenize import RegexpTokenizer

import re
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

from wordcloud import WordCloud

nltk.download('stopwords') 
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\personnel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\personnel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\personnel\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Quelques fonctions utiles

In [3]:
specific_terms = [
    'douleur abdominale', 'douleur thoracique', 'douleur lombaire', 'conjonctive coloree',
    'tension arterielle', 'frequence cardiaque', 'frequence respiratoire'
]
stop_words = set(stopwords.words('french'))
def preprocess_text(text, specific_terms=specific_terms):
    # Supprimer les caractères spéciaux et la ponctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Convertir en minuscules
    text = text.lower()
    # Supprimer les accents
    text = unidecode.unidecode(text)
    # Supprimer les mots vides (stop words)
    mots_a_exclure = ['a', 'bilan', 'evoluant', 'debut', 'remonterait', 'marque', 'mois', 'linstallation', 'dune', 'patient', 'gauche', 'droit', 'droite', 'semaine', 'semaines', 'jour', 'jours', 'jrs', 'contexte', 'ras', 'consulte', 'consultation', 'jours', 'jrs', 'avoir','deux', 'selon', 'contre','vendredi','plus','depuis','ans','entre','ont', 'vos', 'aurions', 'ne', 'es', 'fussent', 'auriez', 'les', 'au', 'aurait', 'aux', 'seraient', 'samedi','récit', 'plusieurs', 'avons', 'aura', 'ayons', 'avez', 'as', 'ayants', 'mais', 'n', 'le', 'eut', 'aies', 'ses', 'aie', 'avions', 'serez', 'du', 'je', 'auront', 'eussiez', 'ta', 'étés', 'étaient', 'serai', 'avaient', 'serait', 'étiez', 'étais', 'fussiez', 'lui', 'elle', 'eûmes', 'ayant', 'seras', 'était', 'aurais', 'aurez', 'fusse', 'fussions', 'on', 'leur', 'sont', 'pas', 'ayantes', 'eue', 'fût', 'vous', 'sera', 'même', 'y', 'fûtes', 'de', 'eurent', 'eu', 'ou', 'des', 'soit', 'qu', 'il', 'moi', 'et', 'ai', 'soyons', 'ils', 'ma', 'que', 'nos', 'la', 'tes', 'ces', 'eusse', 'ait', 'se', 'ton', 'aviez', 'eussions', 'mes', 'eussent', 'pour', 'fut', 'ce', 'eûtes', 'nous', 'l', 'sommes', 'c', 't', 'un', 'ayez', 'eues', 'm', 'serions', 'seriez', 'sa', 'j', 'furent', 'avait', 's', 'en', 'd', 'serais', 'étée', 'à', 'qui', 'votre', 'aurai', 'par', 'êtes', 'eux', 'étantes', 'tu', 'sois', 'fus', 'avec', 'sur', 'eusses', 'une', 'eût', 'est', 'toi', 'soyez', 'aurons', 'me', 'étées', 'seront', 'avais', 'son', 'mon', 'étant', 'auraient', 'aient', 'été', 'étants', 'étante', 'dans', 'suis', 'notre', 'te', 'eus', 'étions', 'fûmes', 'auras', 'soient', 'fusses', 'ayante', 'serons']  # Remplacez par votre liste de mots à exclure
    
    # Remplacer les expressions spécifiques par un token unique
    for term in specific_terms:
        text = re.sub(r'\b' + term + r'\b', term.replace(' ', '_'), text)

    words = word_tokenize(text)
    
    filtered_text = [mot for mot in words if mot not in stop_words and mot not in mots_a_exclure]
    
    # Lemmatisation
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in filtered_text]
    
    return ' '.join(lemmatized_text)
    #return phrase


In [4]:
# Frequence des mots
def frequence_mot(texte, nb=None):   
    # tokenisation du texte
    mots_tokenise = word_tokenize(' '.join(texte))
    print(mots_tokenise)
        
    # Compter les mots fréquents dans les titres
    mots_freq= FreqDist(mots_tokenise)
    # Obtenir les mots fréquents avec leurs occurrences
    mots_occurrences = mots_freq.most_common(nb)
    # Créer un DataFrame à partir des mots fréquents avec leurs occurrences
    df = pd.DataFrame(list(mots_occurrences), columns=['Mot', 'Occurrences'])
    print(df)

### Fonction Wrangling

In [32]:
def wrangling(df):
    # Suppression des colonnes vides
    vides = ['Fréquence cardiaque (FC)', 'Fréquence respiratoire (FR)', 'Conscience (SCG)', 'Glycemie', 'allergies_imm',
            'autres_gyn', 'autres_imm', 'chirurgie']
    df.drop(vides, axis=1, inplace=True)

    selected_data = ['clinique', 'motif_consultation', 'enquete_systeme', 'exam_physique', 'histoire_maladie',
                     'IMC', 'IT', 'Poids', 'Saturation en O2', 'Taille', 'Température',
                    'Tension Arterielle (BD)', 'Tension Arterielle (BG)', 'age', 'sexe'
                    ]
    df = df[selected_data]
    # Selection des colonnes categorielles
    data_category = df[df.select_dtypes(include=['object']).columns].fillna('RAS')
    
    # Reformattage des données mal renseignées
    for i, row in df.iterrows():
        if row['age'] > 999:
            df['age'][i] = 2024 - row['age']
            
        if row['age'] > 70 and row['Taille'] < 75:
            df['Taille'][i] = row['Taille'] + 100

        if row['Saturation en O2'] > 100 :
            df['Saturation en O2'][i] = 99
        
    # Remplissage des valeurs manquantes
    df['Température'].fillna(37, inplace=True)
    #df['alcool_imm'].fillna(0, inplace=True)
    df['Saturation en O2'].fillna(99, inplace=True)

    # Remplissage des valeurs numeriques vides par leurs moyens 
    #for col in df.select_dtypes(include=['int32', 'int64', 'float32', 'float64']).columns:
    #    df[col].fillna(df[col].mean(), inplace=True)

    # Suppressions des valeurs considerées inutiles 
    for i, row in data_category.iterrows():
        str(row[df.select_dtypes(include=['object']).columns]).replace('+','')
        str(row[df.select_dtypes(include=['object']).columns]).replace('/',' ')
        str(row[df.select_dtypes(include=['object']).columns]).replace(':',' ')
        str(row[df.select_dtypes(include=['object']).columns]).replace('bilan',' ')
        str(row[df.select_dtypes(include=['object']).columns]).replace('Bilan',' ')

    # Appliquer le prétraitement aux colonnes textuelles
    #data_category = data_category.astype(str).apply(preprocess_text)
    data_category['clinique'] = data_category['clinique'].astype(str).apply(preprocess_text)
    data_category['motif_consultation'] = data_category['motif_consultation'].astype(str).apply(preprocess_text)
    data_category['histoire_maladie'] = data_category['histoire_maladie'].astype(str).apply(preprocess_text)
    data_category['enquete_systeme'] = data_category['enquete_systeme'].astype(str).apply(preprocess_text)
    data_category['exam_physique'] = data_category['exam_physique'].astype(str).apply(preprocess_text)
    #data_category['conduite_a_tenir'] = data_category['conduite_a_tenir'].astype(str).apply(preprocess_text)
    #data_category['ethnie'] = data_category['ethnie'].astype(str).apply(preprocess_text)
    data_category['sexe'] = data_category['sexe'].astype(str).apply(preprocess_text)
    
    # Regroupement des colonnes textuelles en une colonne unique
    text_data = data_category[['clinique', 'motif_consultation', 'enquete_systeme', 'exam_physique', 'histoire_maladie']]
    text_data['symptomes'] = ' '
    for i, row in text_data.iterrows():
        mots = []
        mots_vide = ['ras', 'ra', 'RAS']
        mots += [mot for mot in row['enquete_systeme'].split(',') if mot not in mots and  mot not in mots_vide]
        mots += [mot for mot in row['exam_physique'].split(',') if mot not in mots and mot not in mots_vide]
        mots += [mot for mot in row['histoire_maladie'].split(',') if mot not in mots and mot not in mots_vide]
        mots += [mot for mot in row['motif_consultation'].split(',') if mot not in mots and mot not in mots_vide]
        mots += [mot for mot in row['clinique'].split(',') if mot not in mots and mot not in mots_vide]

        # mots.append(str(row['target']))

        for mot in mots:
            text_data['symptomes'][i] += ' ' + mot

    # Remplacement des termes specifiques
    def replace_specific_terms(text, replacements):
        for old_term, new_term in replacements.items():
            text = re.sub(r'\b' + re.escape(old_term) + r'\b', new_term, text)
        return text
    
    # Dictionnaire des termes spécifiques à remplacer
    replacements = {
        'doulur': 'douleur',
        'abdominal': 'abdominale',
        'cephales': 'cephale',
        'douleurs': 'douleur',
        'vertiges': 'vertige',
        'conjonctives': 'conjonctive',
        'conjonction': 'conjonctive',
        'scleres': 'sclere',
    }

    # Remplacer les termes spécifiques dans la colonne symptomes
    text_data['symptomes'] = text_data['symptomes'].apply(lambda x: replace_specific_terms(x, replacements))

    cleaned_data = df[['IMC', 'IT', 'Poids', 'Saturation en O2', 'Taille', 'Température',
                    'Tension Arterielle (BD)', 'Tension Arterielle (BG)', 'age']
                    ]
    cleaned_data['sexe'] = data_category['sexe']
    cleaned_data['symptomes'] = text_data['symptomes']

    # Pretraitement de la colonne symptome
    my_stemmer = LancasterStemmer()
    stemmed = [[my_stemmer.stem(word) for word in review.split()] for review in text_data['symptomes'] ]
    stemmed_concat = [' '.join(review) for review in stemmed]

    """ # Suppression des colonnes pas très importantes
    df.drop(['id', 'motif_consultation', 'histoire_maladie', 'clinique', 'enquete_systeme', 'exam_physique', 'conduite_a_tenir',
                'ethnie', 'profession', 'quartier', 'religion', 'statut_matrimonial', 'medicaux', 'alcool_imm', 'target', 'symptomes', 
                'diagnostique', ], axis=1, inplace=True)  # A supprimer
    
    
    # Encodage des données
    le = LabelEncoder()
    df['sexe'] = le.fit_transform(df['sexe'])

    # Normaliser les données numériques
    scaler = StandardScaler()
    data_num_scaled = scaler.fit_transform(df) 

    # Séparer les textes et les labels
    texts = text_data['symptomes']

    ### Vectoriser les textes 
    # Vectorisation TF-IDF

    tfidf_vectorizer = TfidfVectorizer()
    text_tranform = tfidf_vectorizer.fit_transform(texts)
    
    # Combiner les caractéristiques textuelles et numériques pour les ensembles d'entraînement et de test
    X_combined = sp.hstack((text_tranform, data_num_scaled))  """

    return cleaned_data 

### Fonction pipeline

In [33]:
# Script pour serialiser et sauvegarder la fonction preprocess
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


def preprocess_num_data():
    # categorical_features = df.select_dtypes(include=['object']).columns.tolist()
    # Créer une pipeline pour les transformations
    pipeline = Pipeline([
        # Encodage des variables catégorielles
        ('encoder', OrdinalEncoder()),
        # Imputation des valeurs manquantes
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Mise à l'échelle des caractéristiques numériques
        ('scaler', StandardScaler()),
    ])

    # Sauvegarde de la fonction sous pkl
    pk.dump(pipeline, open(f'../models/pipeline_num_preprocess.pkl', 'wb'))
    return pipeline

def preprocess_text_data():
    pipeline = Pipeline([
         # Pretraitement des texts avec tfid
        ('tfid', TfidfVectorizer()),
    ])
    pk.dump(pipeline, open(f'../models/pipeline_text_preprocess.pkl', 'wb'))
    return pipeline

In [34]:
# Script pour charger la fonction sérialisée
def load_preprocess_data(filepath):
    preprocess_function = pk.load(open(filepath, 'rb'))
    return preprocess_function

In [45]:
donnees = pd.read_csv('../data/Patients_data.csv')
clean_data = wrangling(donnees[14:15])
clean_data['symptomes']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(vides, axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data['sexe'] = data_category['sexe']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data['symptomes'] = text_data['symptomes']


14      courbature polyarthralgie cephale toux beg c...
Name: symptomes, dtype: object

In [46]:
data_num_preprocessed = preprocess_num_data()
data_num = data_num_preprocessed.fit_transform(clean_data.drop(['symptomes'], axis=1))

In [47]:
#data_text_preprocessed = preprocess_text_data()
#data_text = data_text_preprocessed.fit_transform(clean_data['symptomes'])
data_text_p = load_preprocess_data('../models/pipeline_text_preprocess1.pkl')
data_text = data_text_p.transform(clean_data['symptomes'])

data_num_p = load_preprocess_data('../models/pipeline_num_preprocess1.pkl')
data_num = data_num_p.fit_transform(clean_data.drop(['symptomes'], axis=1))

In [48]:
data_num.shape
data_text.shape

(1, 677)

In [49]:
X_combined = sp.hstack((data_text, data_num))
X_combined

<1x687 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in COOrdinate format>

In [50]:
data_text.shape

(1, 677)

In [51]:
# importer modele de prediction
model = load_preprocess_data('../models/SVM.pkl')
model

In [52]:
model.predict(X_combined)

array([1], dtype=int64)

In [28]:
pipeline = load_preprocess_data('../models/pipeline_preprocess.pkl')
pipeline

In [None]:
# importer modele de prediction