In [64]:
import pandas as pd
import numpy as np
from mca import diagsvd
import mca
import functools
import sklearn.model_selection as ms
from sklearn.preprocessing import StandardScaler as stdc
import gc
import _pickle as pickle
import warnings
from gensim import corpora, models, similarities
from gensim.corpora import SvmLightCorpus
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn import metrics
warnings.filterwarnings('ignore')

# Columns that are one-hot encoded and then compressed with SVD further down.
categorical = [
    'source_system_tab',
    'source_screen_name',
    'source_type',
    'city',
    'genre_ids',
    'language',
]
# Columns that are kept as numeric features and standardized further down.
numerical = [
    'song_length',
    'song_year',
]

# Partitioning

In [46]:
# Read the cleaned dataset with explicit dtypes: compact unsigned integers for
# the target and the numeric columns, pandas categoricals for the rest.
# (An earlier variant of the categorical list used 'gender' in place of
# 'source_screen_name'.)
file = 'df_clean.csv'
X = pd.read_csv(
    '../Data/' + file,
    nrows=None,
    dtype={
        'target': np.uint8,
        'song_length': np.uint32,
        'song_year': np.uint32,
        'source_system_tab': 'category',
        'source_screen_name': 'category',
        'source_type': 'category',
        'city': 'category',
        'genre_ids': 'category',
        'language': 'category',
    },
)
# Discard the 'Unnamed: 0' column that comes in with the CSV.
X = X.drop('Unnamed: 0', axis=1)

# Label vector and the numeric feature subset.
y = X['target']
X_num = X[numerical]

print(X.dtypes)

X.head()

source_system_tab     category
source_screen_name    category
source_type           category
target                   uint8
city                  category
song_length             uint32
genre_ids             category
language              category
song_year               uint32
dtype: object


Unnamed: 0,source_system_tab,source_screen_name,source_type,target,city,song_length,genre_ids,language,song_year
0,explore,Explore,online-playlist,1,1,206471,359,52.0,2016
1,my library,Local playlist more,local-playlist,1,13,284584,1259,52.0,1999
2,my library,Local playlist more,local-playlist,1,13,225396,1259,52.0,2006
3,my library,Local playlist more,local-playlist,1,13,255512,1019,-1.0,2010
4,explore,Explore,online-playlist,1,1,187802,1011,52.0,2016


In [3]:
# One-hot encode every categorical column; each category level becomes its
# own indicator column named '<column>_<level>'.
DummiesX = pd.get_dummies(X[categorical])
print(DummiesX.shape)
DummiesX.head()

(6317409, 219)


Unnamed: 0,source_system_tab_discover,source_system_tab_explore,source_system_tab_listen with,source_system_tab_my library,source_system_tab_notification,source_system_tab_radio,source_system_tab_search,source_system_tab_settings,source_screen_name_Album more,source_screen_name_Artist more,...,language_-1.0,language_10.0,language_17.0,language_24.0,language_3.0,language_31.0,language_38.0,language_45.0,language_52.0,language_59.0
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


Utilizamos SVD para reducir la dimensión de 219 features a 50, conservando toda la varianza (que la proporción
de varianza explicada sume más de uno debe de ser por errores de aproximación).

In [35]:
# Compress the one-hot matrix down to `num_topics` SVD components.
num_topics = 50
# The 'randomized' solver draws random projections, so it is seeded here to
# make the components (and the explained-variance figure printed below)
# identical across Restart & Run All. 23 matches the seed used for the
# train/test split.
lsa = TruncatedSVD(n_components=num_topics, algorithm='randomized', random_state=23)
dtm_lsa = lsa.fit_transform(DummiesX)
# Rescale every row of the reduced matrix to unit norm, in place (copy=False).
dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)


In [41]:
# Total share of variance attributed to the retained SVD components.
print(np.sum(lsa.explained_variance_ratio_))

1.1018009413749132


In [61]:
# Label the SVD components V0, V1, ... and wrap them in a frame that shares
# the row index of the one-hot matrix.
names = [f'V{i}' for i in range(num_topics)]
X_cat = pd.DataFrame(data=dtm_lsa, index=DummiesX.index, columns=names)

# Standardize the numeric columns with StandardScaler (imported as `stdc`),
# keeping the original column labels and row index.
scaled_values = stdc().fit_transform(X_num.values)
X_num = pd.DataFrame(scaled_values, index=X_num.index, columns=X_num.columns)

In [55]:
# Put the reduced categorical block and the scaled numeric block side by side
# (aligned on the row index) and append the label as the last column.
concat_Data = pd.concat([X_cat, X_num], axis=1).assign(target=y)
# Sanity check: total number of missing cells in the assembled frame.
concat_Data.isna().sum().sum()

0

In [65]:
# Hold out a test partition from the assembled feature frame.
# The frame built in the previous cell is `concat_Data`; the original call
# referenced `concat_Train`, which is never defined and raises NameError on a
# fresh kernel.
# NOTE(review): `proportion` (the test fraction) is not assigned in any cell
# visible in this notebook — define it in the configuration cell before
# running; the value originally used cannot be recovered from here.
(X_train, X_test, y_train, y_test) = ms.train_test_split(
    concat_Data, y, test_size=proportion, random_state=23
)

# `concat_Data` already carries `target`, so these assignments rewrite the
# same values; they are kept so both partitions explicitly end with the label.
X_train['target'] = y_train
X_test['target'] = y_test

In [59]:
# Preview the first rows of the held-out partition.
X_test.head(n=5)

Unnamed: 0,V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V43,V44,V45,V46,V47,V48,V49,song_length,song_year,target
3602144,-0.941515,-0.131256,0.041048,-0.102479,0.174487,-0.214058,-0.031531,-0.034653,-0.031653,0.024846,...,0.00017,0.000679,-0.001147,0.00055,-0.000351,0.000659,0.000347,0.297322,-0.334019,1
3902875,-0.222334,0.018545,-0.42191,-0.376497,0.225664,0.467983,-0.414701,-0.174638,-0.008017,0.003003,...,-0.003086,0.001483,-0.006814,-0.009493,0.007091,-0.008804,0.005619,0.508692,-0.63463,0
4828208,-0.387945,0.696119,-0.549554,-0.069279,0.032984,-0.108308,0.018571,-0.075178,-0.11803,0.01635,...,-0.000927,-0.003439,-0.006118,-0.007567,0.002611,-0.003348,0.006312,-0.626501,0.116896,0
1510794,-0.827295,-0.177145,-0.442274,0.046007,0.20155,-0.109603,-0.079815,0.045214,-0.113027,0.010077,...,-0.001109,-0.000542,-0.004258,-0.005686,0.003594,-0.000639,0.003254,-0.504602,0.267201,1
2476575,-0.470146,-0.315261,-0.041768,0.294811,-0.048933,-0.142824,-0.110002,-0.012957,0.538083,-0.128069,...,-0.000945,-0.004217,-0.001875,-0.006164,0.00615,-0.001571,-0.000685,-0.466412,0.116896,0


In [60]:
# Persist both partitions (features plus target, row index included) as CSV.
X_train.to_csv(path_or_buf='../Data/clean_train.csv')
X_test.to_csv(path_or_buf='../Data/clean_test.csv')