# Analyse supervisée

## Import des données

In [1]:
# Librairie pandas (manipulation de données csv, dataframe, etc.)
import pandas as pd

# Load the cleaned corpus from CSV and preview its first rows.
corpus = pd.read_csv(filepath_or_buffer='corpus_nettoye.csv')
corpus.head()

Unnamed: 0.1,Unnamed: 0,Song,Album Debut,Songwriter(s),Lead Vocal(s),Year,Lyrics
0,1,"""Across the Universe""",Let It Be,Lennon,Lennon,1968,Words are flowing out like endless rain into a...
1,2,"""All I've Got to Do""",UK: With the Beatles\n US: Meet the Beatles!,Lennon,Lennon,1963,Whenever I want you around yeh All I gotta do...
2,3,"""All My Loving""",UK: With the Beatles\n US: Meet the Beatles!,McCartney,McCartney,1963,Close your eyes and I'll kiss you Tomorrow I'l...
3,5,"""All Together Now""",Yellow Submarine,Lennon-McCartney,McCartney,1967,One two three four Can I have a little more Fi...
4,6,"""All You Need Is Love""",Magical Mystery Tour,Lennon,Lennon,1967,"Love, love, love Love, love, love Love, love, ..."


## Analyse supervisée


Pour cette partie, nous allons réutiliser du code stocké dans un package, et, pour ce faire, nous couler dans le moule attendu par ce package. Commençons par créer le dossier avec les données nécessaires.


In [2]:
# Turning the corpus into the expected format for our functions
import jagen_will.preproc.tuyau as tuy
import jagen_will.preproc.features_extract as fex
from jagen_will.preproc.text_count import count_process
import os

### HACK: write the data to disk in the layout needed for the intended use of
### the module (one plain-text file per song, under train/ or test/).

# Create folder structure. exist_ok=True keeps the cell re-runnable without a
# separate existence check.
os.makedirs("./data/songs_text/train/", exist_ok=True)
os.makedirs("./data/songs_text/test/", exist_ok=True)

# Start by writing fulltext output
for index, row in corpus.iterrows():
    lyrics = row["Lyrics"]

    # Skip rows without usable lyrics: non-string cells (e.g. NaN) and empty
    # strings. The emptiness check compares by value; an identity test
    # (`is not ""`) is not a reliable way to compare strings.
    if not isinstance(lyrics, str) or lyrics == "":
        continue

    # Songs credited to exactly 'McCartney' or 'Lennon' go to the training
    # folder; every other credit goes to the test folder.
    if row["Songwriter(s)"] in ['McCartney', 'Lennon']:
        folder = 'train/'
    else:
        folder = 'test/'

    # Fulltext: <songwriter>_<title without quotes, spaces turned into dashes>.txt
    song_slug = row["Song"].replace('"', '').replace(' ', '-')
    filename = row["Songwriter(s)"] + "_" + song_slug + '.txt'
    # NOTE(review): the file is written with the platform default encoding,
    # as before — confirm it matches what the package's loader expects.
    with open("data/songs_text/" + folder + filename, 'w') as f:
        f.write(lyrics)

    # TODO: write a version with lemma and POS
    # Lemmatised
    # POS


And now we can try to analyse them.

In [3]:
# Create the table with data to be analysed
# Either run this from command line
# python main.py -t chars -n 3 -s data/songs_text/train/* #chars 3-grams
# python main.py -t chars -n 3 -f feature_list_chars3grams5000mf.json -s data/songs_text/test/* # feats common with train
# OR 
# the following bloc of code

import sys
import os
import jagen_will.preproc.tuyau as tuy
import jagen_will.preproc.features_extract as fex
from jagen_will.preproc.text_count import count_process
import fasttext
import pandas
import json
# from multiprocessing import Pool
from multiprocessing.pool import ThreadPool as Pool
import tqdm
import glob

def _count_features(texts, feat_list, feats, n):
    """Count the features of `feat_list` in every text.

    Parameters
    ----------
    texts : list of dict
        Texts as returned by `tuy.load_texts`; each entry is read through its
        "name", "aut" and "lang" keys.
    feat_list : list
        Features to count.
    feats, n :
        Feature type and n-gram length, forwarded unchanged to `fex.get_counts`.

    Returns
    -------
    (pandas.DataFrame, dict)
        The metadata table (columns 'author' and 'lang', indexed by text name)
        and a dict mapping each text name to the value returned by
        `count_process` for that text.
    """
    texts = fex.get_counts(texts, feat_list=feat_list, feats=feats, n=n, relFreqs=True)
    names = [text["name"] for text in texts]

    counts = {}
    for t in tqdm.tqdm(texts):
        text, local_freqs = count_process((t, feat_list))
        counts[text["name"]] = local_freqs

    # Saving metadata for later
    metadata = pandas.DataFrame(columns=['author', 'lang'], index=names,
                                data=[[t["aut"], t["lang"]] for t in texts])
    return metadata, counts


def _save_feature_table(metadata, counts, feat_list, out_path):
    """Join metadata and per-text counts column-wise and write them to `out_path` as CSV."""
    feats = pandas.DataFrame.from_dict(counts, columns=list(feat_list), orient="index")
    pandas.concat([metadata, feats], axis=1).to_csv(out_path)


# NOTE(review): lid.176.bin is presumably fastText's language-identification
# model, used by load_texts to fill the "lang" field — confirm.
model = fasttext.load_model("jagen_will/preproc/models/lid.176.bin")
paths_train = glob.glob("data/songs_text/train/*")
paths_test = glob.glob("data/songs_text/test/*")
arg_feats = "chars"
arg_n = 3

## Train
myTexts = tuy.load_texts(paths_train, model, format='txt')

# Get features
my_feats = fex.get_feature_list(myTexts, feats=arg_feats, n=arg_n, relFreqs=True)
mf = 3000
if mf < len(my_feats):
    # Keep every feature whose score is at least that of the entry at index
    # `mf`, so entries tied with it are kept too.
    # NOTE(review): assumes my_feats is sorted by decreasing score — confirm.
    val = my_feats[mf][1]
    my_feats = [m for m in my_feats if m[1] >= val]

with open("feature_list.json", "w") as out:
    out.write(json.dumps(my_feats))

feat_list = [m[0] for m in my_feats]

metadata, loc = _count_features(myTexts, feat_list, arg_feats, arg_n)
# Free some space before building the (potentially large) feature table.
del myTexts
_save_feature_table(metadata, loc, feat_list, "feats_train.csv")


## And now test, restricted to the features selected on the training set
myTexts = tuy.load_texts(paths_test, model, format='txt')
metadata, loc = _count_features(myTexts, feat_list, arg_feats, arg_n)
del myTexts
_save_feature_table(metadata, loc, feat_list, "feats_test.csv")
del loc

100%|██████████| 119/119 [00:00<00:00, 280.47it/s]
100%|██████████| 60/60 [00:00<00:00, 279.24it/s]


Now that we have created the data, we can run, for instance, a **leave-one-out** analysis.

In [4]:
# either run from the command line
# python train_svm.py feats_train_n3_k_5000.csv --leave_one_out --norms
# OR
# the following code bloc
import sys 
import jagen_will.svm
import pandas
import joblib
# Leave-one-out run on the training table; no test table is passed (test=None).
test = None
train = pandas.read_csv("feats_train.csv", index_col=0)

svm = jagen_will.svm.train_svm(
    train,
    test,
    leave_one_out=True,
    norms=True,
    kernel="LinearSVC",
)

.......... Formatting data ........
.......... Creating pipeline according to user choices ........
.......... using normalisations ........
.......... choosing SVM ........
.......... Creating pipeline with steps ........
[('scaler', StandardScaler()), ('normalizer', Normalizer()), ('model', LinearSVC())]
.......... leave-one-out cross validation will be performed ........
.......... using 119 samples ........
              precision    recall  f1-score   support

      Lennon       0.56      0.57      0.56        60
   McCartney       0.55      0.54      0.55        59

    accuracy                           0.55       119
   macro avg       0.55      0.55      0.55       119
weighted avg       0.55      0.55      0.55       119



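And now we can go for the final analysis… Note that the test songs are all labelled `Lennon-McCartney`, a class the model was never trained on, so the scores in the classification report below are necessarily 0 and should not be read as a measure of model quality.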

In [6]:
# Same as usual, either run from the command line
# python train_svm.py feats_train_n3_k_5000.csv --test_path feats_test_n3_k_5000.csv --norms --final
# OR
# the following code bloc
import jagen_will.svm
import pandas
import joblib
# Load both feature tables (text names as index), then fit on the training
# table and request final predictions for the test table.
train, test = (
    pandas.read_csv(csv_path, index_col=0)
    for csv_path in ("feats_train.csv", "feats_test.csv")
)

svm = jagen_will.svm.train_svm(
    train,
    test,
    norms=True,
    kernel="LinearSVC",
    final_pred=True,
)

.......... Formatting data ........
.......... Creating pipeline according to user choices ........
.......... using normalisations ........
.......... choosing SVM ........
.......... Creating pipeline with steps ........
[('scaler', StandardScaler()), ('normalizer', Normalizer()), ('model', LinearSVC())]
                  precision    recall  f1-score   support

          Lennon       0.00      0.00      0.00       0.0
Lennon-McCartney       0.00      0.00      0.00      60.0
       McCartney       0.00      0.00      0.00       0.0

        accuracy                           0.00      60.0
       macro avg       0.00      0.00      0.00      60.0
    weighted avg       0.00      0.00      0.00      60.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
