## Algorithm Adaptation

In [177]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

### Pre processing


In [178]:
data_garmin_df = pd.read_csv('data/Garmin_Connect.csv')
data_samsung_df = pd.read_csv('data/Samsung_Health.csv')
data_huawei_df = pd.read_csv('data/Huawei_Health.csv')

data = pd.concat([data_garmin_df, data_samsung_df, data_huawei_df], ignore_index=True)

print(data.head())

                                                                                                  data  \
0  Contrairement aux idées reçues le traceur GPS est très précis, dumoins avec vivoactive 3 Music.....   
1                                                                 Application très pratique et fiable.   
2                                                                                     jadore ma montre   
3                                Super application, je l'utilise synchronisé avec ma fenix3 et j'adore   
4                                                                                              Super !   

   score  rating  bug_report  feature_request  user_experience  
0      5       1           0                0                1  
1      5       1           0                0                0  
2      5       1           0                0                0  
3      5       1           0                0                1  
4      5       1           0          

Employing non-alphabetic filtering, lowercasing, stop word removal, and stemming.

In [179]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = []
ps = PorterStemmer()
from nltk.stem import WordNetLemmatizer 

def clean_text(text):
    lemmatizer = WordNetLemmatizer()
    sw = set(stopwords.words('french'))
    # get review and remove non alpha chars
    review = re.sub('[^a-zA-Z]', ' ', text)
    # to lower-case
    review = review.lower()
    # split into tokens, apply stemming and remove stop words
    #review = ' '.join([lemmatizer.lemmatize(w) for w in review.split()])
    return ' '.join([ps.stem(w) for w in review.split() if w not in sw])


data['text_clean'] = data['data'].apply(lambda x: clean_text(x))
print(data['text_clean'].head())

0    contrair id re ue traceur gp tr pr ci dumoin vivoact music tr motiv suit conseil garmin
1                                                                    applic tr pratiqu fiabl
2                                                                                jador montr
3                                                   super applic utilis synchroni fenix ador
4                                                                                      super
Name: text_clean, dtype: object


Splitting into test and train


In [180]:
y = data[['rating','bug_report','feature_request','user_experience']]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data['text_clean'],y , test_size=0.2,stratify=y, random_state=42)
print(X_train,y_train)

3682    aim bien cett applic comm a peut dire combien a fait mar v lo sport plein autr aussi pouvez choisir
3067                                                                                 bonn applic fi tr bien
2487                                                          dommag puiss mesur stress rythm cardiaqu note
4622                                                                                                  super
5160                                                  fonctionn bien depui mise jour r veill matin pad fait
                                                       ...                                                 
420                                                                                                    cool
5848                                                                              temp sommeil rest parfait
5191                                          a nouveau acc diff rent fond cran plu connexion phone one plu
3166                        

## Pipeline

In [181]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
nltk.download('stopwords')

final_stopwords_list = stopwords.words('english') + stopwords.words('french')
tfidf_vectorizer = TfidfVectorizer(max_df=0.8,
                                   max_features=200000,
                                   min_df=0.2,
                                   stop_words=final_stopwords_list,
                                   use_idf=True)



[nltk_data] Downloading package stopwords to C:\Users\Marta
[nltk_data]     Mariz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [182]:
X_train = tfidf_vectorizer.fit_transform(X_train)
X_test = tfidf_vectorizer.transform(X_test)


In [183]:
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score

clf = DecisionTreeClassifier(random_state=0)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)


In [184]:
from sklearn.metrics import precision_recall_fscore_support

scores = pd.DataFrame()

def score(y_true, y_pred):
    """Calculate precision, recall, and f1 score"""

    metrics = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    performance = {'precision': metrics[0], 'recall': metrics[1], 'f1': metrics[2]}
    return performance


print(score(y_test, y_pred))

{'precision': 0.2816000845487212, 'recall': 0.4629042485732403, 'f1': 0.35017627094141496}


  _warn_prf(average, modifier, msg_start, len(result))
