In [2]:
import pandas as pd
import numpy as np
from mca import diagsvd
import mca
import functools
import sklearn.model_selection as ms
from sklearn.preprocessing import StandardScaler as stdc
import gc
import _pickle as pickle
import warnings
from gensim import corpora, models, similarities
from gensim.corpora import SvmLightCorpus
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn import metrics
# Suppress every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

# Columns that are one-hot encoded further down.
categorical = [
    'source_system_tab',
    'source_screen_name',
    'source_type',
    'city',
    'genre_ids',
    'language',
]
# Columns that are kept as numbers and standardised further down.
numerical = [
    'song_length',
    'song_year',
]

# Partitioning

In [3]:
# Load the cleaned table with explicit dtypes: compact unsigned integers for
# the numeric columns and `category` for the six categorical ones.
file = 'df_clean.csv'
X = pd.read_csv(
    '../Data/' + file,
    nrows=None,
    dtype={
        'target': np.uint8,
        'song_length': np.uint32,
        'song_year': np.uint32,
        **dict.fromkeys(
            ['source_system_tab', 'source_screen_name', 'source_type',
             'city', 'genre_ids', 'language'],
            'category',
        ),
    },
).drop(columns='Unnamed: 0')  # discard the unnamed leading column stored in the CSV

# Label vector and numeric feature subset; note that `X` itself still contains 'target'.
y = X['target']
X_num = X[numerical]

print(X.dtypes)

X.head()

source_system_tab     category
source_screen_name    category
source_type           category
target                   uint8
city                  category
song_length             uint32
genre_ids             category
language              category
song_year               uint32
dtype: object


Unnamed: 0,source_system_tab,source_screen_name,source_type,target,city,song_length,genre_ids,language,song_year
0,explore,Explore,online-playlist,1,1,206471,359,52.0,2016
1,my library,Local playlist more,local-playlist,1,13,284584,1259,52.0,1999
2,my library,Local playlist more,local-playlist,1,13,225396,1259,52.0,2006
3,my library,Local playlist more,local-playlist,1,13,255512,1019,-1.0,2010
4,explore,Explore,online-playlist,1,1,187802,1011,52.0,2016


In [4]:
# One-hot encode the categorical columns: one indicator column per
# (column, level) pair, named '<column>_<level>'.
DummiesX = pd.get_dummies(X.loc[:, categorical])
print(DummiesX.shape)
DummiesX.head(5)

(6317409, 219)


Unnamed: 0,source_system_tab_discover,source_system_tab_explore,source_system_tab_listen with,source_system_tab_my library,source_system_tab_notification,source_system_tab_radio,source_system_tab_search,source_system_tab_settings,source_screen_name_Album more,source_screen_name_Artist more,...,language_-1.0,language_10.0,language_17.0,language_24.0,language_3.0,language_31.0,language_38.0,language_45.0,language_52.0,language_59.0
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


Utilizamos SVD para reducir la dimensión de 219 features a 50, conservando toda la varianza (que la suma de los
ratios de varianza explicada dé más de uno debe de ser por errores de aproximación).

In [5]:
# Project the one-hot matrix onto `num_topics` truncated-SVD components and
# then standardise every component column.
#
# NOTE(review): both the SVD and the scaler are fitted on all rows, i.e. before
# the train/test split performed in a later cell, so held-out rows influence
# the fitted components and scaling statistics. Confirm this is acceptable.
num_topics = 50

# The randomized solver draws random projections; without a fixed seed the
# components (and every file derived from them) change on each run. The seed
# matches the one used for the train/test split.
lsa = TruncatedSVD(n_components=num_topics, algorithm='randomized', random_state=23)
dtm_lsa = lsa.fit_transform(DummiesX)

# copy=False asks the scaler to standardise the projected array in place
# rather than allocating a second array of the same size.
dtm_lsa = stdc(copy=False).fit_transform(dtm_lsa)


In [6]:
# Total of the per-component explained-variance ratios reported by the fitted SVD.
print(np.sum(lsa.explained_variance_ratio_))

1.082826831010248


In [7]:
# Wrap the standardised SVD components in a frame aligned with the original
# rows; the columns are labelled V0 .. V{num_topics - 1}.
names = ['V' + str(component) for component in range(num_topics)]
X_cat = pd.DataFrame(data=dtm_lsa, index=DummiesX.index, columns=names)

# Standardise the numeric columns, keeping their column labels and row index.
X_num = pd.DataFrame(
    data=stdc().fit_transform(X_num.values),
    index=X_num.index,
    columns=X_num.columns,
)

In [8]:
# Place the SVD components and the scaled numeric columns side by side
# (aligned on the row index), then append the label column.
concat_Data = pd.concat(objs=[X_cat, X_num], axis='columns', ignore_index=False)
concat_Data['target'] = y

# Total number of missing cells in the assembled table.
concat_Data.isna().sum().sum()

0

In [11]:
# Hold out `proportion` of the rows as the test set; the fixed seed keeps the
# partition reproducible across runs.
proportion = 0.3
(X_train, X_test, y_train, y_test) = ms.train_test_split(
    concat_Data, y, test_size=proportion, random_state=23
)

# `concat_Data` already contains the 'target' column, so both splits carry
# their labels. Re-assigning the column onto the frames returned by the split
# was redundant and writes into sliced frames (which can raise
# SettingWithCopyWarning); verify the invariant instead of rewriting the data.
assert X_train['target'].equals(y_train), "train labels out of sync with y_train"
assert X_test['target'].equals(y_test), "test labels out of sync with y_test"
assert len(X_train) + len(X_test) == len(concat_Data), "split lost or duplicated rows"

In [12]:
X_test.head()

Unnamed: 0,V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V43,V44,V45,V46,V47,V48,V49,song_length,song_year,target
3602144,-1.688448,-0.649602,0.332273,-0.276194,-1.040063,-1.289472,0.390882,-0.650481,-0.078829,0.364443,...,0.049718,-0.017771,0.038996,0.027822,0.03703,0.029397,-0.00869,0.297322,-0.334019,1
3902875,0.877626,-0.301281,-1.469993,-1.809987,-0.671815,2.319192,1.976077,0.180743,-0.778351,-0.798601,...,-0.072389,0.181824,0.243468,-0.170484,0.140233,-0.148944,0.107107,0.508692,-0.63463,0
4828208,0.290423,1.653103,-1.835152,-0.203297,-0.268591,-0.657132,-0.038694,-0.947103,-0.520532,0.384093,...,0.076584,0.059532,0.222048,-0.140279,0.09588,-0.157098,-0.116823,-0.626501,0.116896,0
1510794,-1.212134,-0.868077,-1.56607,0.161924,-1.099286,-0.882219,0.161216,-0.117033,-0.721014,0.39699,...,-0.114333,0.089979,0.256054,-0.061262,0.063215,-0.036665,0.027959,-0.504602,0.267201,1
2476575,0.017468,-1.165744,-0.178042,1.46213,0.353477,-0.610327,1.053984,0.778455,2.877859,-2.350902,...,-0.324452,-0.107734,0.176263,-0.178007,0.091216,-0.234435,0.157991,-0.466412,0.116896,0


In [13]:
# Persist both partitions (features plus 'target'); the row index is written
# as the first column of each file.
X_train.to_csv(path_or_buf='../Data/clean_train.csv', index=True)
X_test.to_csv(path_or_buf='../Data/clean_test.csv', index=True)