### Otimizando dados 

In [15]:
import pandas as pd
import numpy as np

# Load the full vgsales CSV, letting pandas infer every column's dtype.
vgsales_csv = r'C:\Users\Pedro\Desktop\Estudos\Databases\vgsales.csv'
df = pd.read_csv(vgsales_csv)

# Show the first five rows as the cell output.
df.head(5)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [None]:
# Column dtypes, non-null counts and memory footprint of the unoptimized frame.
# memory_usage='deep' also measures the strings held by object columns; the
# default only estimates them (reported with a trailing "+"), which understates
# the baseline that the optimized versions below are compared against.
df.info(memory_usage='deep')

In [None]:
# To cut memory use, tell pd.read_csv() which columns of the dataset to keep;
# the remaining columns are not loaded into the frame.
wanted_columns = ['Rank', 'Name', 'Global_Sales', 'NA_Sales', 'EU_Sales', 'Other_Sales','JP_Sales', 'Year']

df = pd.read_csv(
    r'C:\Users\Pedro\Desktop\Estudos\Databases\vgsales.csv',
    usecols=wanted_columns,
)
df.head(5)

In [None]:
# Change a column's dtype after loading to shrink the frame: Year is
# converted to a 32-bit float and written back into df.
year_as_float32 = df['Year'].astype('float32')
df['Year'] = year_as_float32

In [None]:
# Re-inspect dtypes and memory after the column selection and Year downcast.
# memory_usage='deep' measures the real size of object (string) columns; the
# default only estimates them, which hides part of the footprint.
df.info(memory_usage='deep')

In [None]:
# Import while declaring column dtypes up front, so the sales columns are
# parsed straight into float32 instead of being converted after loading.
# Other_Sales is included so all five sales columns share the same dtype
# (it was previously left at the default 64-bit float).
sales_columns = ['Global_Sales', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']

df_specify_columns_dtypes = pd.read_csv(
    r'C:\Users\Pedro\Desktop\Estudos\Databases\vgsales.csv',
    dtype={column: np.dtype('float32') for column in sales_columns},
)

In [None]:
# Check the dtypes applied at import time and the resulting memory footprint.
# memory_usage='deep' measures the real size of object (string) columns; the
# default only estimates them, which hides part of the footprint.
df_specify_columns_dtypes.info(memory_usage='deep')

In [None]:
# chunksize makes read_csv return an iterator of DataFrames holding up to
# 1000 rows each, instead of parsing the whole file in one go.
#
# The reader is used as a context manager (pandas >= 1.2) so the underlying
# file handle is closed even if concatenation fails part-way through.
#
# Caveat: concatenating every chunk unchanged rebuilds the complete table, so
# despite its name df_1000 holds all rows of the file. Memory is only saved
# when each chunk is filtered, aggregated or downcast before being combined.
with pd.read_csv(r'C:\Users\Pedro\Desktop\Estudos\Databases\vgsales.csv', chunksize=1000) as chunk:
    df_1000 = pd.concat(chunk)

df_1000.head(5)

In [None]:
# nrows caps how many data rows are parsed, giving a quick sample of the file.
# NOTE(review): this cell reads housing.csv while every other cell reads
# vgsales.csv — confirm that the different dataset is intentional.
housing_csv = r'C:\Users\Pedro\Desktop\Estudos\Databases\housing.csv'
df_amostra = pd.read_csv(housing_csv, nrows=100)

# (rows, columns) of the sample.
df_amostra.shape

In [None]:
# Build a sample with DataFrame.sample: 100 randomly chosen rows of df,
# with random_state fixed so the same rows are drawn on every run.
sample_size = 100
amostra = df.sample(n=sample_size, random_state=0)

# (rows, columns) of the sample.
amostra.shape

In [None]:
# dtype_diet is a library that suggests a more suitable dtype for each column.

from dtype_diet import optimize_dtypes, report_on_dataframe

# Build the report of proposed dtypes for df, with sizes expressed in MB,
# and display it as the cell output.
df_proposed = report_on_dataframe(df, unit='MB')
df_proposed

In [None]:
# Replace the original dtypes with the ones proposed by report_on_dataframe.

new_df = optimize_dtypes(df, df_proposed)

In [None]:
# Inspect the dtypes and memory footprint of the frame returned by optimize_dtypes.
# memory_usage='deep' measures the real size of any object (string) columns; the
# default only estimates them, which would make the comparison with the
# original frame unreliable.
new_df.info(memory_usage='deep')

In [None]:
# Preview the first rows of the dtype-optimized frame.
new_df.head()

### Pandas Modin

O Modin é uma biblioteca que expõe a mesma API do pandas e distribui o trabalho entre todos os núcleos (cores) da máquina, o que pode aumentar a velocidade de execução, sobretudo em conjuntos de dados grandes.

In [20]:
import modin.pandas as pd_modin
# Read the same vgsales CSV through Modin's pandas-compatible reader.
vgsales_csv = r'C:\Users\Pedro\Desktop\Estudos\Databases\vgsales.csv'
df_modin = pd_modin.read_csv(vgsales_csv)

# Summarise the columns, dtypes and memory of the Modin frame.
df_modin.info()

<class 'modin.pandas.dataframe.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rank          16598 non-null  int64  
 1   Name          16598 non-null  object 
 2   Platform      16598 non-null  object 
 3   Year          16327 non-null  float64
 4   Genre         16598 non-null  object 
 5   Publisher     16540 non-null  object 
 6   NA_Sales      16598 non-null  float64
 7   EU_Sales      16598 non-null  float64
 8   JP_Sales      16598 non-null  float64
 9   Other_Sales   16598 non-null  float64
 10  Global_Sales  16598 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.4+ MB


In [23]:
%%time
# Time how long it takes to preview the first rows of the already-loaded Modin frame.
# NOTE(review): this measures head(), whereas the pandas cell further down
# measures read_csv — the two timings are not a like-for-like comparison.
df_modin.head()

CPU times: total: 0 ns
Wall time: 1.51 ms


Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [27]:
%%time

# Time a plain-pandas read of the same vgsales CSV.
# NOTE(review): the Modin cell timed earlier measures head(), not read_csv;
# time pd_modin.read_csv on this file as well before comparing the libraries.
df_pandas = pd.read_csv(r'C:\Users\Pedro\Desktop\Estudos\Databases\vgsales.csv')

CPU times: total: 0 ns
Wall time: 27.2 ms
