<a href="https://colab.research.google.com/github/brandontan99/Self_Love_App/blob/master/OneVsRest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the project repository; later cells import modules (data_cleaning,
# Data_Normalization, feature_selection) and read a CSV by relative path,
# presumably all provided by this repository — TODO confirm.
!git clone https://github.com/brandontan99/Self_Love_App.git
# Make the cloned repository the working directory for the rest of the notebook.
%cd Self_Love_App

Cloning into 'Self_Love_App'...
remote: Enumerating objects: 178, done.[K
remote: Counting objects: 100% (178/178), done.[K
remote: Compressing objects: 100% (153/153), done.[K
remote: Total 178 (delta 98), reused 50 (delta 22), pack-reused 0[K
Receiving objects: 100% (178/178), 1.84 MiB | 15.19 MiB/s, done.
Resolving deltas: 100% (98/98), done.
/content/Self_Love_App


In [None]:
from data_cleaning import *
from Data_Normalization import *
from feature_selection import *

import pandas as pd

import sklearn
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
import statistics
import joblib
from copy import deepcopy

In [None]:
    # Feature selection
    # Runs two alternative analyses (chi-square and recursive feature elimination)
    # on the training split only, then keeps the best questions as model features.
    from sklearn.preprocessing import LabelEncoder
    from sklearn.feature_selection import chi2, SelectKBest

    # Values a reader may want to tune, named once instead of repeated as literals.
    QUESTIONNAIRE_CSV = "WID3006 ML Questionnaire.csv"
    N_FEATURE_COLUMNS = 64  # columns before this index are used as features, the rest as targets
    N_BEST_FEATURES = 13    # Eg: choose the best 13 features

    # --- Chi-square analysis ---
    df = pd.read_csv(QUESTIONNAIRE_CSV)
    df = data_cleaning(df)
    # label encoding is used instead of one-hot encoding because one-hot encoding will create many features for one question
    # but label encoding is able to perform chi-square test between a single question as a whole and the hobby
    x_df, y_df = label_encoding(df)
    # only evaluate on train data to prevent data leakage
    x_train_label_encoded, _, y_train, _ = train_test_split(x_df.to_numpy(), y_df.to_numpy(), test_size=0.2, random_state=1)
    chi2_result = chi2_analysis(x_train_label_encoded, x_df, y_train)

    # --- Recursive Feature Elimination (rfe) ---
    # The CSV is re-read on purpose: the cleaning/encoding helpers are defined
    # outside this notebook and may modify the frame they are given.
    df = pd.read_csv(QUESTIONNAIRE_CSV)
    df = data_cleaning(df)
    df = data_encoding(df)
    df_norm = data_normalization(df)
    x = df_norm.iloc[:, :N_FEATURE_COLUMNS]
    y = df_norm.iloc[:, N_FEATURE_COLUMNS:]
    x_numpy, y_numpy = x.to_numpy(), y.to_numpy()
    x_train, x_test, y_train, y_test = train_test_split(x_numpy, y_numpy, test_size=0.2, random_state=1)
    rfe_result = rfe_cv(x_train, y_train, x.columns, y.columns, LogisticRegression())

    # --- Pick the selected best questions as features ---
    df = pd.read_csv(QUESTIONNAIRE_CSV)
    df = data_cleaning(df)
    df = data_encoding(df)
    df_norm = data_normalization(df)
    # Both candidate selections are kept under their own names so that neither
    # silently overwrites the other; switch the assignment below to change method.
    best_k_features_chi2 = select_best_k_features(chi2_result, k=N_BEST_FEATURES)
    best_k_features_rfe = select_best_k_features(rfe_result, k=N_BEST_FEATURES)
    best_k_features = best_k_features_rfe  # choose best_k_features_chi2 to use the chi2 ranking instead
    # x is replaced by the filtered features; y keeps the target columns sliced above.
    x = filter_features(best_k_features, df_norm)


Hobby: Sports and Outdoors
Best number of features: 11
Score: 0.6797849462365592
1 Gender: _Male
21 Choose a pet which you prefer to keep._Horse
25 Choose a pet which you prefer to keep._Tortoise
40 What is your favorite color?_Blue
50 What is your learning style? (Pick one that benefit you the most)_By reading an e-Book
52 Do you enjoy challenges?
55 Would you prefer to engage your brain more than your body?
57 Are you a perfectionist?
58 Are you a trusting person?
59 Do you have lot of patience?
64 How confident are you in your own abilities?

Hobby: Games
Best number of features: 10
Score: 0.5944086021505377
1 Gender: _Male
4 What is your current occupation?_University student
11 I prefer to spend my money on...._The latest fashion
22 Choose a pet which you prefer to keep._I'm not a pet person
31 What do you worry more about the most?_Money
33 What do you worry more about the most?_Your family and friends
35 When you retire, you'd like to live..._Exactly where I live now
42 What is

In [12]:
def train_model(pipeline,x,y):
  """Fit one independent copy of ``pipeline`` per column of ``y`` and print held-out scores.

  The data is split once (75% train / 25% test, ``random_state=1``) and the same
  split is reused for every target column.

  Args:
    pipeline: Unfitted estimator/pipeline exposing ``fit`` and ``predict``. It is
      deep-copied per target, so the object passed in is never fitted.
    x: Feature table accepted by ``train_test_split``.
    y: Target table with a ``columns`` attribute and column selection by name
      (used as ``y.columns`` / ``y_train[hobby]``).

  Returns:
    list: The fitted pipelines, in the same order as ``y.columns``.

  Raises:
    statistics.StatisticsError: If ``y`` has no columns (no scores to average).

  NOTE(review): assuming each column is a single 0/1 target, micro-averaged F1
  coincides with plain accuracy, and the line labelled "F1-Macro" is the mean of
  these per-hobby scores — confirm this is the intended metric.
  """
  x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=0.25, shuffle=True)
  f1_list = []       # per-hobby scores on the held-out split
  pipelineList = []  # fitted model for each hobby
  for hobby in y.columns:
    # Train a fresh copy so the models for different hobbies do not share state.
    model = deepcopy(pipeline).fit(x_train, y_train[hobby])
    pipelineList.append(model)
    # Score on the held-out split.
    prediction = model.predict(x_test)
    # float(...) + built-in round() work whether the metric returns a NumPy
    # scalar or a plain Python float (the latter has no .round() method).
    f1 = float(sklearn.metrics.f1_score(y_test[hobby], prediction, average="micro"))
    f1_list.append(f1)
    print("F1-Micro Score for",hobby,": ",round(f1, 3))
  # statistics.stdev needs at least two data points; report 0.0 spread for a single target.
  spread = statistics.stdev(f1_list) if len(f1_list) > 1 else 0.0
  print('F1-Macro Score: {}% (Standard deviation:{})'.format(round(statistics.mean(f1_list)*100,3),round(spread*100,3)))
  return pipelineList

def predict(pipelineList,x_pred,labels=None,n_features=None):
  """Print and return the hobbies predicted for a single sample.

  Args:
    pipelineList: Fitted models, one per label, in the same order as ``labels``.
    x_pred: Feature values for one sample (any array-like); reshaped to
      ``(-1, n_features)`` before prediction.
    labels: Label names matching ``pipelineList``. Defaults to the notebook-level
      ``y.columns``.
    n_features: Number of features each model expects. Defaults to the
      notebook-level ``x.shape[1]``.

  Returns:
    list: Labels whose model predicted ``1`` for the sample. Only the first row
    of the reshaped input is inspected.
  """
  # Local import: the notebook's import cell never imports numpy explicitly, so
  # ``np`` would otherwise only exist if a star import happened to expose it.
  import numpy as np

  # Fall back to the notebook globals only when the caller does not supply them.
  if labels is None:
    labels = y.columns
  if n_features is None:
    n_features = x.shape[1]

  x_pred = np.array(x_pred).reshape(-1, n_features)
  pred = []  # predicted hobbies
  for i, hobby in enumerate(labels):
    prediction = pipelineList[i].predict(x_pred)
    # Keep the hobby when the first (and normally only) row is classified positive.
    if prediction[0] == 1:
      pred.append(hobby)
  print(pred)
  return pred

In [None]:
# Using pipeline for applying logistic regression/naive bayes/support vector classifier and one vs rest classifier
# NOTE(review): this assigns a class-level attribute on the library class, so it
# applies to every OneVsRestClassifier in the session. The original author
# observed no effect on results; kept as-is pending confirmation it can be removed.
OneVsRestClassifier.multilabel_=True #Whether this is a multilabel classifier, does not seem to make a difference

# Same seed as the train/test splits, so that the stochastic solvers give
# repeatable scores across Restart & Run All.
MODEL_SEED = 1

LogReg_pipeline = Pipeline([
                # 'sag' shuffles the data, so it needs a fixed random_state to be reproducible
                ('logreg', OneVsRestClassifier(LogisticRegression(solver='sag', random_state=MODEL_SEED), n_jobs=-1))
            ])
NB_pipeline = Pipeline([
                ('nb', OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None)))
            ])
SVC_pipeline = Pipeline([
                # random_state fixes LinearSVC's data shuffling where the solver uses it
                ('svc', OneVsRestClassifier(LinearSVC(random_state=MODEL_SEED), n_jobs=1))
            ])

In [None]:
# Recreate the same 75/25 split that train_model performs internally (same seed
# and test size), so x_test / y_test are available here for spot checks.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=1,
    shuffle=True,
)

In [13]:
# Most consistent model
Logreg_pipelineList = train_model(LogReg_pipeline, x, y)

# For the first five rows of x_test: print the predicted hobbies, then the
# actual value of every hobby column for comparison.
for row in range(5):
  predict(Logreg_pipelineList, x_test.iloc[row])
  for col, hobby in enumerate(y_test.columns):
    print(hobby, ":", y_test.iloc[row, col])

F1-Micro Score for Sports and Outdoors :  0.604
F1-Micro Score for Games :  0.625
F1-Micro Score for Spiritual and Mental :  0.625
F1-Micro Score for Performing Arts :  0.625
F1-Micro Score for Arts and Craft :  0.792
F1-Micro Score for Food and Drinks :  0.792
F1-Micro Score for Collecting :  0.917
F1-Micro Score for Rejuvenation :  0.854
F1-Macro Score: 72.917% (Standard deviation:12.35)
['Sports and Outdoors', 'Games', 'Performing Arts', 'Food and Drinks']
Sports and Outdoors : 1.0
Games : 1.0
Spiritual and Mental : 0.0
Performing Arts : 0.0
Arts and Craft : 0.0
Food and Drinks : 1.0
Collecting : 0.0
Rejuvenation : 0.0
['Sports and Outdoors', 'Games', 'Spiritual and Mental']
Sports and Outdoors : 1.0
Games : 1.0
Spiritual and Mental : 0.0
Performing Arts : 0.0
Arts and Craft : 0.0
Food and Drinks : 1.0
Collecting : 0.0
Rejuvenation : 0.0
['Sports and Outdoors', 'Games', 'Spiritual and Mental']
Sports and Outdoors : 1.0
Games : 0.0
Spiritual and Mental : 1.0
Performing Arts : 0.0
Art

In [14]:
# Naive Bayes: train one model per hobby, then compare predictions with the
# actual labels for the first five rows of x_test.
NB_pipelineList = train_model(NB_pipeline, x, y)

for row in range(5):
  predict(NB_pipelineList, x_test.iloc[row])
  for col, hobby in enumerate(y_test.columns):
    print(hobby, ":", y_test.iloc[row, col])

F1-Micro Score for Sports and Outdoors :  0.542
F1-Micro Score for Games :  0.646
F1-Micro Score for Spiritual and Mental :  0.583
F1-Micro Score for Performing Arts :  0.625
F1-Micro Score for Arts and Craft :  0.812
F1-Micro Score for Food and Drinks :  0.729
F1-Micro Score for Collecting :  0.896
F1-Micro Score for Rejuvenation :  0.854
F1-Macro Score: 71.094% (Standard deviation:13.197)
['Sports and Outdoors', 'Games', 'Spiritual and Mental']
Sports and Outdoors : 1.0
Games : 1.0
Spiritual and Mental : 0.0
Performing Arts : 0.0
Arts and Craft : 0.0
Food and Drinks : 1.0
Collecting : 0.0
Rejuvenation : 0.0
['Sports and Outdoors', 'Games', 'Spiritual and Mental']
Sports and Outdoors : 1.0
Games : 1.0
Spiritual and Mental : 0.0
Performing Arts : 0.0
Arts and Craft : 0.0
Food and Drinks : 1.0
Collecting : 0.0
Rejuvenation : 0.0
['Sports and Outdoors', 'Games', 'Spiritual and Mental']
Sports and Outdoors : 1.0
Games : 0.0
Spiritual and Mental : 1.0
Performing Arts : 0.0
Arts and Craft :

In [15]:
# Linear SVC: train one model per hobby, then compare predictions with the
# actual labels for the first five rows of x_test.
SVC_pipelineList = train_model(SVC_pipeline, x, y)

for row in range(5):
  predict(SVC_pipelineList, x_test.iloc[row])
  for col, hobby in enumerate(y_test.columns):
    print(hobby, ":", y_test.iloc[row, col])

F1-Micro Score for Sports and Outdoors :  0.604
F1-Micro Score for Games :  0.604
F1-Micro Score for Spiritual and Mental :  0.646
F1-Micro Score for Performing Arts :  0.604
F1-Micro Score for Arts and Craft :  0.75
F1-Micro Score for Food and Drinks :  0.812
F1-Micro Score for Collecting :  0.875
F1-Micro Score for Rejuvenation :  0.792
F1-Macro Score: 71.094% (Standard deviation:10.936)
['Sports and Outdoors', 'Games', 'Performing Arts', 'Food and Drinks']
Sports and Outdoors : 1.0
Games : 1.0
Spiritual and Mental : 0.0
Performing Arts : 0.0
Arts and Craft : 0.0
Food and Drinks : 1.0
Collecting : 0.0
Rejuvenation : 0.0
['Sports and Outdoors', 'Games', 'Spiritual and Mental']
Sports and Outdoors : 1.0
Games : 1.0
Spiritual and Mental : 0.0
Performing Arts : 0.0
Arts and Craft : 0.0
Food and Drinks : 1.0
Collecting : 0.0
Rejuvenation : 0.0
['Sports and Outdoors', 'Games', 'Spiritual and Mental']
Sports and Outdoors : 1.0
Games : 0.0
Spiritual and Mental : 1.0
Performing Arts : 0.0
Art