## Pre-processing



### Importing the data 



In [1]:
import nltk
import pandas as pd

# Read the data from the file
data_garmin_df = pd.read_csv('data/Garmin_Connect.csv')
data_samsung_df = pd.read_csv('data/Samsung_Health.csv')
data_huawei_df = pd.read_csv('data/Huawei_Health.csv')

data = pd.concat([data_garmin_df, data_samsung_df, data_huawei_df], ignore_index=True)
#data.to_csv('data/concatenated_data.csv', sep='\t', encoding='utf-8')

print(data.head(10))
print("\n Number of rows: " + str(len(data)))



                                                data  score  rating  \
0  Contrairement aux idées reçues le traceur GPS ...      5       1   
1               Application très pratique et fiable.      5       1   
2                                   jadore ma montre      5       1   
3  Super application, je l'utilise synchronisé av...      5       1   
4                                            Super !      5       1   
5  Application très pratique et très simple d'uti...      3       1   
6  Suivis du sommeil cardio nombre de pas avec la...      5       1   
7                                  Sympa et précis !      5       1   
8                                     Très satisfait      5       1   
9  bonjour, le calendrier ne se synchronise plus....      2       0   

   bug_report  feature_request  user_experience  
0           0                0                1  
1           0                0                0  
2           0                0                0  
3           0     

### Tokenization and removal of stopwords

*Tokenization* is the process of splitting an input text into tokens (words or other relevant elements, such as punctuation, empty strings). We will use the result as a basis to predict a label.


In [2]:
from nltk.tokenize import word_tokenize #principal tokenization class from nltk API
from nltk.stem import SnowballStemmer   #Stemming method
import re                               #regex library
nltk.download('punkt')

## IMPLEMENT LEMMATIZATION

corpus = []

for index, row in data.iterrows():
    review = re.sub('\*', '', row["data"]) # get data, substitute asterisks for empty string, put into review
    review = re.sub('[^a-zA-Z]', ' ', review) # from review, remove all non-alphabetic characters
    review = re.sub('[^\w\s]', '', review) # remove punctuation from review
    review = ' '.join([SnowballStemmer('french').stem(w) for w in word_tokenize(review.lower(), language='french')]) # apply stemming
    corpus.append(review)

#print(corpus)

data = data.assign(token=corpus)

print(data.head())


[nltk_data] Downloading package punkt to C:\Users\Marta
[nltk_data]     Mariz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                                data  score  rating  \
0  Contrairement aux idées reçues le traceur GPS ...      5       1   
1               Application très pratique et fiable.      5       1   
2                                   jadore ma montre      5       1   
3  Super application, je l'utilise synchronisé av...      5       1   
4                                            Super !      5       1   

   bug_report  feature_request  user_experience  \
0           0                0                1   
1           0                0                0   
2           0                0                0   
3           0                0                1   
4           0                0                0   

                                               token  
0  contrair aux id e re ue le traceur gp est tr s...  
1                      appliqu tr s pratiqu et fiabl  
2                                     jador ma montr  
3  sup appliqu je l utilis synchron avec ma fe

### Separation between train and test datasets

Separate in adequate proportions to avoid the overfitting of the modules the data between features and targets. In this case there will be 2 different separations, one for the original multilabel problem and another for the mold into just a multiclass problem. To ensure a more even tag distribution, we must use the *stratify* hyper-parameter

In [3]:
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.model_selection import train_test_split
x = data['data']  # try to include / exclude score and check if it yields better results
y = data[['rating', 'bug_report', 'feature_request', 'user_experience']]

X_train, X_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(4800,)
(1200,)
(4800, 4)
(1200, 4)


# Approaches to multilabel classification we will consider:

#### Problem transformation
1. Binary Relevance (consider each label as a separate single class classification  problem)
2. Classifier Chains
3. Label powerset

#### Adapted Algorithms

#### Ensemble methods

#### Finding the best parameters for TF-IDF Transformer and CountVectorizer

These two preprocessing steps will be present in all of our pipelines, so lets check what hyperparameters are best for them

In [4]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
nltk.download('stopwords')

final_stopwords_list = stopwords.words('french') + stopwords.words('english')
# Define the hyperparameters to tune for the TfidfTransformer, which will be the same for every method approach as this preprocessor is always used
tfidf_params = {
    'use_idf': [True, False],
    'smooth_idf': [True, False],
    'sublinear_tf': [True, False],
    'norm': ['l1', 'l2']
}

#Define the hyperparameters to tune for the CountVectorizer, which will be the same for every method approach as this preprocessor is always used
count_vectorizer_params = {
    'max_df': [0.5, 0.75, 1.0],
    'min_df': [0.2, 0.25, 0.3],
    'max_features': [None, 10000, 20000],
    'ngram_range': [(1, 1), (1, 2), (1, 3)],
    'stop_words': [None, final_stopwords_list],
}

grid_params = {
        'vect__max_df': count_vectorizer_params['max_df'],
        'vect__ngram_range': count_vectorizer_params['ngram_range'],
        'vect__min_df': count_vectorizer_params['min_df'],
        'vect__stop_words': count_vectorizer_params['stop_words'],
        'tfidf__use_idf': tfidf_params['use_idf'],
        'tfidf__smooth_idf': tfidf_params['smooth_idf'],
        'tfidf__sublinear_tf': tfidf_params['sublinear_tf'],
        'tfidf__norm': tfidf_params['norm'],
        
}


# Create the pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier()),
])

# Create the grid search
grid_search = GridSearchCV(pipeline, grid_params, cv=2, n_jobs=-1, verbose=1)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Print the best parameters
print("Best parameters:" , grid_search.best_params_)
print("Best score: ", grid_search.best_score_)





[nltk_data] Downloading package stopwords to C:\Users\Marta
[nltk_data]     Mariz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Fitting 2 folds for each of 864 candidates, totalling 1728 fits


KeyboardInterrupt: 

### Best parameters for count_vectorizer and tf_idf_transformer
So we don't have to run the gridSearch multiple times 

Best parameters: 
- 'tfidf__norm': 'l1'
- 'tfidf__smooth_idf': True
- 'tfidf__sublinear_tf': True
- 'tfidf__use_idf': True
- 'vect__max_df': 0.7
- 'vect__min_df': 0.3
- 'vect__ngram_range': (1, 2)
- 'vect__stop_words': final_stopwords_list



In [5]:
# create a new count vectorizer with the best parameters
count_vectorizer = CountVectorizer(
    max_df=0.8,
    ngram_range=(1, 2),
    min_df=0.2,
    stop_words=final_stopwords_list,
)

tf_idf_transformer = TfidfTransformer(
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    norm='l1',
)

## Pipeline

Text preprocessing, tokenizing and filtering of stopwords are all included in CountVectorizer, which builds a dictionary of features and transforms documents to feature vectors; 

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn import metrics


def make_pipeline(clf_param_grid, clf):
    pipeline = Pipeline([
        ('vect', count_vectorizer),
        ('tfidf', tf_idf_transformer),
        ('clf', clf)
    ])

    # Define the GridSearchCV object to tune the hyperparameters
    grid_params = [
        {
        **clf_param
        }
        for clf_param in clf_param_grid
    ]

    # Define the GridSearchCV object
    grid_search = GridSearchCV(pipeline, param_grid=grid_params, cv=5, n_jobs=-1)


    # Fit the GridSearchCV object to the training data
    grid_search.fit(X_train, y_train)

    # Get the best hyperparameters and best model
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_
    print("Best parameters: ", best_params)
    print("Best model: ", best_model)
    

    # Evaluate the best model on the test data
    y_pred = best_model.predict(X_test)
    #Apply classification report
    print(metrics.classification_report(y_test, y_pred, digits=3))



# Adaptive algorithms


In [7]:
# using binary relevance
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.neighbors import KNeighborsClassifier

clf = DecisionTreeClassifier() # what is this?

param_grid_adaptive_algorithms = [
    {
        'clf': [DecisionTreeClassifier()],
        'clf__criterion': ["gini", "entropy", "log_loss"],
        'clf__min_samples_split': [1, 2, 10, 20],
        'clf__max_depth': [None, 1, 2, 5, 10, 20, 50, 100],
        'clf__class_weight': [None, 'balanced'],
        'clf__random_state': [None, 42],
        'clf__max_features': [None, 'auto', 'sqrt', 'log2'],
    },
    {
        'clf': [ExtraTreeClassifier()],
        'clf__criterion': ["gini", "entropy", "log_loss"],
        'clf__min_samples_split': [1, 2, 10, 20],
        'clf__max_depth': [None, 1, 2, 5, 10, 20, 50, 100],
        'clf__class_weight': [None, 'balanced'],
        'clf__random_state': [None, 42],
        'clf__max_features': [None, 'auto', 'sqrt', 'log2'],
    },
    {
        'clf': [KNeighborsClassifier()],
        'clf__n_neighbors': [1, 2, 5, 10, 20, 50, 100],
        'clf__weights': ['uniform', 'distance'],
        'clf__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'clf__metric': ['minkowski', 'euclidean', 'manhattan'],
        'clf__p': [1, 2, 3, 4, 5],        
    },
    {
        'clf': [RidgeClassifierCV()],
        'clf__alphas': [0.1, 1.0, 10.0],
        'clf__fit_intercept': [True, False],
        'clf__normalize': [True, False],
        'clf__scoring': [None, 'accuracy', 'precision', 'recall', 'f1'],
        'clf__cv': [None, 3, 5, 10],
        'clf__class_weight': [None, 'balanced'],
        'clf__store_cv_values': [False, True],
    }  
]

make_pipeline(param_grid_adaptive_algorithms, clf)

## Problem transformation: Binary Relevance

In [25]:
# using binary relevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

clf = BinaryRelevance(GaussianNB())

param_grid_binary_relevance = [
    {
        'clf': [clf],
        'clf__classifier__var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
    },
    {
        'clf': [BinaryRelevance(RandomForestClassifier())],
        'clf__classifier__n_estimators': [50, 100],
        'clf__classifier__max_depth': [10, 20],
    },
    #{
    #    'clf': [BinaryRelevance(LogisticRegression())],
    #    'clf__classifier__penalty': ['l1', 'l2'],
    #    'clf__classifier__C': [0.1, 1, 10],
    #    'clf__classifier__solver': ['liblinear', 'saga']
    #}
]

make_pipeline(param_grid_binary_relevance, clf)

KeyboardInterrupt: 

## Hyperparameter tuning

In [None]:


#why this here 