In [1]:
import os
import time
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm

from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

import warnings
warnings.filterwarnings('ignore') 

In [19]:
class Data:
    def __init__(self, txt_path = None, csv_path = None):
        self.txt_path = txt_path if txt_path else '../data/LFM/txt/'
        self.csv_path = csv_path if csv_path else '../data/LFM/csv/'
        
    def txt_to_csv(self, filename, index_col = None, to_drop_col = None, to_drop_row = None):
        if index_col:
            df = pd.read_csv(os.path.join(self.txt_path, filename), index_col = index_col, delimiter = '\t')
        else:
            pd.read_csv(os.path.join(self.txt_path, filename), delimiter = '\t')
        if to_drop_col:
            df = df.drop(to_drop_col, axis = 1)
        if to_drop_row:
            df = df.dropna(subset = to_drop_row)
        return df

    def join_df(self, df_1, df_2, on, how = ''):
        how = how if how else 'left'
        return df_1.join(df_2, on = on, how = how, lsuffix = '_left')        
    
    def seperate_train_text(self, df, criteria):
        train = df[df['gender'].isin(criteria)]
        test = df[~df['gender'].isin(criteria)]
        return train, test
    def undersample():
        None
    def over_sample(self, df):
        classes = df['gender'].value_counts().to_dict()
        most = max(classes.values())
        classes_list = []
        for key in classes:
            classes_list.append(df[df['gender'] == key]) 
        classes_sample = []
        for i in range(1,len(classes_list)):
            classes_sample.append(classes_list[i].sample(most, replace=True))
        df_maybe = pd.concat(classes_sample)
        final_df = pd.concat([df_maybe,classes_list[0]], axis=0)
        final_df = final_df.reset_index(drop=True)
        return final_df
        
    def save_csv(self, df, csv_name):
        df.to_csv(os.path.join(self.csv_path, csv_name))
        print('File saved at:', os.path.join(self.csv_path, csv_name))

In [39]:
data = Data()
df_users = data.txt_to_csv('LFM-1b_users.txt', 
                      index_col = 'user_id', 
                      to_drop_col = ['playcount', 'registered_unixtime'],
                      to_drop_row = ['country'])
df_users_additional_data_w = data.txt_to_csv('LFM-1b_user_count_allmusic_w.txt', index_col = 'user_id')
df_users_additional_data_nw = data.txt_to_csv('LFM-1b_user_count_allmusic_nw.txt', index_col = 'user_id')
df_users_additional_data = data.join_df(df_users_additional_data_w, df_users_additional_data_w, 'user_id', how = 'inner')
df = data.join_df(df_users, df_users_additional_data, 'user_id', how = 'inner')
train, test = data.seperate_train_text(df, ['m', 'f'])
train = data.over_sample(train)
data.save_csv(train, 'lfm_train.csv')
data.save_csv(test, 'lfm_test.csv')

File saved at: ../data/LFM/csv/lfm_train.csv
File saved at: ../data/LFM/csv/lfm_test.csv


In [40]:
def eda(df):
    print(df['gender'].value_counts())
df = pd.read_csv('../data/LFM/csv/lfm_train.csv')
eda(df)

f    36472
m    36472
Name: gender, dtype: int64


In [59]:
class Prep_Data:
    def __init__(self, df_name, to_drop, normalize, normalize_cols, csv_name, train = False, csv_path = None):
        self.csv_path = csv_path if csv_path else '../data/LFM/csv/'
        df = pd.read_csv(os.path.join(self.csv_path, df_name))
        df = self.drop_cols(df, to_drop)
        df = self.normalize(df, normalize, normalize_cols)
        if train:
            df, to_drop = self.remove_corr(df, normalize_cols)
            df = self.translate_target(df)
        self.save_data(df, csv_name)
    
    def drop_cols(self, df, to_drop):
        return df.drop(to_drop, axis = 1)

    def normalize(self, df, normalize, normalize_cols):
        if (normalize == 'minmax'):
            scaler = MinMaxScaler()
        elif (normalize == 'std'):
            scaler = StandardScaler()
        else:
            scaler = Normalizer()
        df[normalize_cols] = scaler.fit_transform(df[normalize_cols])
        return df
    
    def remove_corr(self, df, X):
        cor_matrix = df[X].corr().abs()
        upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k = 1).astype(np.bool))
        to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.97)]
        print(to_drop)
        df = df.drop(to_drop, axis = 1)
        return df, to_drop
    
    def translate_target(self, df):
        df['gender'] = df['gender'].replace({'f': 0, 'm': 1})
        return df
        
    def save_data(self, df, csv_name):
        df.to_csv(os.path.join(self.csv_path, csv_name))
        print('File saved at:', os.path.join(self.csv_path, csv_name))
        

In [62]:
# X = ['rnb', 'rap', 'electronic', 
#      'rock', 'new age', 'classical',
#      'reggae', 'blues', 'country', 'world',
#      'folk', 'easy listening', 'jazz', 'vocal', 
#      'children\'s', 'punk', 'alternative', 'spoken word', 
#      'pop', 'heavy metal']
# X_after = ['rnb', 'rap', 'electronic', 
#      'rock', 'new age', 'classical',
#      'reggae', 'blues', 'country', 'world',
#      'folk', 'easy listening', 'jazz', 'vocal', 
#      'children\'s', 'punk', 'spoken word', 
#      'pop', 'heavy metal']
X = ['rnb', 'rap', 'electronic', 
     'rock', 'new age', 'classical',
     'reggae', 'blues', 'country', 'world',
     'folk', 'easy listening', 'jazz', 'vocal', 
     'children\'s', 'punk', 'alternative', 'spoken word', 
     'pop', 'heavy metal',
     'rnb_left', 'rap_left', 'electronic_left', 
     'rock_left', 'new age_left', 'classical_left',
     'reggae_left', 'blues_left', 'world_left',
     'folk_left', 'easy listening_left', 'jazz_left', 'vocal_left', 
     'children\'s_left', 'punk_left', 'alternative_left', 'spoken word_left', 
     'pop_left', 'heavy metal_left']
X_after = ['rnb', 'rap', 'electronic', 
     'rock', 'new age', 'classical',
     'reggae', 'blues', 'country', 'world',
     'folk', 'easy listening', 'jazz', 'vocal', 
     'children\'s', 'punk', 'alternative', 'spoken word', 
     'pop', 'heavy metal']

In [61]:
Prep_Data('lfm_train.csv', 
          to_drop = ['country_left', 'age'], 
          normalize = 'std', 
          normalize_cols = X,
          csv_name = 'lfm_train_normalize.csv',
          train = True)

Prep_Data('lfm_test.csv', 
          to_drop = ['country_left', 'age'], 
          normalize = '', 
          normalize_cols = X,
          csv_name = 'lfm_test_normalize.csv')

['rnb_left', 'rap_left', 'electronic_left', 'rock_left', 'new age_left', 'classical_left', 'reggae_left', 'blues_left', 'world_left', 'folk_left', 'easy listening_left', 'jazz_left', 'vocal_left', "children's_left", 'punk_left', 'alternative_left', 'spoken word_left', 'pop_left', 'heavy metal_left']
File saved at: ../data/LFM/csv/lfm_train_normalize.csv
File saved at: ../data/LFM/csv/lfm_test_normalize.csv


<__main__.Prep_Data at 0x7fa14d5b6cd0>

In [66]:
class Train:
    def __init__(self, df_name, X, y, model_name, model_type = 'logistic_regression', model_path = None, csv_path = None):
        self.model_type_dict = {
            'logistic_regression': LogisticRegression(),
            'sgd_classifier': SGDClassifier(),
            'random_forest': RandomForestClassifier(n_estimators = 1000),
            'linear_svc': LinearSVC(),
            'svc': SVC(),
            'decision_tree': DecisionTreeClassifier(),
            'nn': MLPClassifier()
        }
        
        self.model_path = model_path if csv_path else '../model/'
        self.csv_path = csv_path if csv_path else '../data/LFM/csv/'
        if not os.path.isdir(self.model_path):
            os.makedir(self.model_path)
        df = pd.read_csv(os.path.join(self.csv_path, df_name))
        train_X, test_X, train_y, test_y = self.train_test_split(df, X, y)
        self.evaluate(df, X, y, train_X, test_X, train_y, test_y, model_type)
        model = self.train(df[X], df[y], model_type)
        self.save_model(model, model_name, self.model_path)
        
    def train_test_split(self, df, X, y, split = 0.05):
        return train_test_split(df[X], df[y], test_size = split, shuffle = True)
    
    def evaluate(self, df, X, y, train_X, test_X, train_y, test_y, model_type):
        model = self.model_type_dict[model_type]
        model.fit(train_X, train_y)
        # model.fit(df[X], df[y])
        print(model_type, 'has an accuracy of:', model.score(test_X, test_y))
    
    def train(self, X, y, model_type):
        start = time.time()
        model = self.model_type_dict[model_type]
        model.fit(X, y)
        print('Model training took {}s'.format(time.time() - start))
        return model
        
    def save_model(self, model, model_name, model_path):
        pickle.dump(model, open(os.path.join(model_path, model_name), 'wb'))
        print('Model saved at:',  os.path.join(model_path, model_name))

In [67]:
Train('lfm_train_normalize.csv',
      X = X_after,
      y = 'gender',
      model_name = 'model.pkl',
      model_type = 'random_forest')

random_forest has an accuracy of: 0.9311951754385965
Model training took 179.51963901519775s
Model saved at: ../model/model.pkl


<__main__.Train at 0x7fa14d597850>