In [22]:
import pandas as pd

In [23]:
df1 = pd.read_csv('save_frames_artists/bob_dylan_no_index.csv') 
df2 = pd.read_csv('save_frames_artists/eminem_no_index.csv') 

In [24]:
a = df1[['song_lyric','artist']]
b = df2[['song_lyric','artist']]

In [25]:
def fix_artist_frame(df,artist_name):
    #  Split songs in sentences by creating new rows for each artist 
    lyrics = []
    for i in range(len(df.song_lyric)):
        rows = df.song_lyric[i].split("\n")                                      
        rows_stripped = [rows[n].strip() for n in range(len(rows))]         
        lyrics.append(rows_stripped)   
    artist = pd.DataFrame(lyrics).melt()  
    #artist =artist.value
    artist.dropna( axis=0, inplace=True)
    artist['artist_name'] = artist_name
    artist.drop('variable',axis=1 , inplace = True)
    artist.rename(columns = {'value' : 'lyrics'}, inplace = True)
    return artist

bob_dylan = fix_artist_frame(df1,'bob_dylan')
eminem = fix_artist_frame(df2,'eminem')
final= pd.concat([bob_dylan, eminem], ignore_index=True)

In [26]:
# Save data workspace!
import os  
os.makedirs('save_frames_artists', exist_ok=True)  
bob_dylan.to_csv('save_frames_artists/bob_dylan_clean.csv',index=False) 
eminem.to_csv('save_frames_artists/eminem_clean.csv',index=False)
final.to_csv('save_frames_artists/lyrics_clean.csv',index=False)  

# Start the project form the frame!

In [27]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
df = pd.read_csv('save_frames_artists/lyrics_clean.csv')

In [29]:
df.artist_name.loc[df.artist_name == 'bob_dylan'] =0
df.artist_name.loc[df.artist_name == 'eminem'] =1


In [30]:
X = pd.DataFrame(df['lyrics'])
y = df.artist_name

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33,random_state=42)
y_test=y_test.astype(int)
y_train=y_train.astype(int)

In [32]:
def Countvect_fit_transf(trainX, trainY): # sample is the df.song_lyric[:2]
    vectorizer = TfidfVectorizer(stop_words='english',ngram_range=(1, 3))
    X = vectorizer.fit_transform(trainX.apply(lambda x: np.str_(x)))    
    X_cv= pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names(),index = trainY)


    return X_cv , vectorizer

def Countvect_transf(testX, testY, self_vectorizer): # sample is the df.song_lyric[:2]
    X = self_vectorizer.transform(testX.apply(lambda x: np.str_(x)))    
    X_cv= pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names(),index = testY)

    return X_cv

def Countvect_transf_input(inputX, self_vectorizer): # sample is the df.song_lyric[:2]
    X = self_vectorizer.transform([inputX])    
    X_cv= pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names())

    return X_cv


In [33]:
input_lyr = input('Give me your lyrics: ')

In [34]:
frame_cv_train, vectorizer =Countvect_fit_transf(X_train['lyrics'], y_train)
frame_cv_test =Countvect_transf(X_test['lyrics'], y_test, vectorizer)
frame_cv_input =Countvect_transf_input(input_lyr, vectorizer)



In [35]:
frame_cv_train.shape, frame_cv_test.shape, frame_cv_input.shape, y_test.shape, y_train.shape


((4829, 24264), (2379, 24264), (1, 24264), (2379,), (4829,))

# Model train

In [38]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

In [39]:

def pipe_line(model,frame_cv_train, y_train,frame_cv_test, y_test,frame_cv_input,input_lyr):
    if model == 'LogisticRegression':
        pipe = Pipeline([('LogisticRegression', LogisticRegression(class_weight='balanced_subsample'))])
        lr = pipe.fit(frame_cv_train, y_train)  
        train_accuracy = lr.score(frame_cv_train, y_train)
        test_accuracy = lr.score(frame_cv_test, y_test)
        ypred = lr.predict(frame_cv_input) 
        prediction = ypred
        if prediction == 0:
            name = 'Bob_Dylan'
        else:
            name = 'Eminem'
        print(f'The model gives that the \'{input_lyr}\' was said by {name}, with accuracy = {round(test_accuracy,2)*100}%. ')
    elif model == 'RandomForest':
        pipe = Pipeline([('RandomForest', RandomForestClassifier(max_depth=3, n_estimators=10, random_state=42, class_weight='balanced'))])
        parameter_grid = {
                        'RandomForest__max_depth' : [2,3,4,5,6]
                        }
        gridsearch = GridSearchCV(pipe, 
                          parameter_grid, 
                          scoring=None,
                          verbose=3)
        gridsearch.fit(frame_cv_train, y_train)
        gridsearch.best_estimator_.score(frame_cv_test, y_test)
        ypred = gridsearch.best_estimator_.predict(frame_cv_test)
        accuracy_score(ypred, y_test)
        prediction = ypred[0]
        if prediction == 0:
            name = 'Bob_Dylan'
        else:
            name = 'Eminem'
        print(f'The model gives that the \'{input_lyr}\' was said by {name}, with accuracy = {round(accuracy_score(ypred, y_test),2)*100}%.')
    else:
        print('That is not a valid name_model!! Try between \'LogisticRegression\', \'RandomForest\' and \'NEW\'.')
   

In [40]:

input_model = str(input('Give me which model I should use between \'LogisticRegression\', \'RandomForest\' and \'NEW\'.'))
pipe_line(input_model,frame_cv_train, y_train,frame_cv_test, y_test,frame_cv_input,input_lyr)  


Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END .........RandomForest__max_depth=2;, score=0.405 total time=   3.4s
[CV 2/5] END .........RandomForest__max_depth=2;, score=0.608 total time=   2.4s
[CV 3/5] END .........RandomForest__max_depth=2;, score=0.611 total time=   1.7s
[CV 4/5] END .........RandomForest__max_depth=2;, score=0.445 total time=   1.7s
[CV 5/5] END .........RandomForest__max_depth=2;, score=0.616 total time=   1.6s
[CV 1/5] END .........RandomForest__max_depth=3;, score=0.411 total time=   1.9s
[CV 2/5] END .........RandomForest__max_depth=3;, score=0.630 total time=   1.7s
[CV 3/5] END .........RandomForest__max_depth=3;, score=0.615 total time=   1.7s
[CV 4/5] END .........RandomForest__max_depth=3;, score=0.452 total time=   1.7s
[CV 5/5] END .........RandomForest__max_depth=3;, score=0.631 total time=   1.7s
[CV 1/5] END .........RandomForest__max_depth=4;, score=0.616 total time=   2.1s
[CV 2/5] END .........RandomForest__max_depth=4;,

array([1, 1, 1, ..., 1, 1, 1])

In [41]:
# save model in notebook
import pickle

with open('my_estimator.pkl','wb') as my_file:
    pickle.dump(my_estimator, my_file)


with open('my_estimator.pkl','rb') as my_file:
    my_estimator = pickle.load(my_file)


my_estimator.predict_proba(X_test) #gives probability of X_test is 1

NameError: name 'my_estimator' is not defined