In [1]:
import pandas as pd
import numpy as np
from random import randint, choice, random, randrange
import datetime as dt
from random import randint
from faker import Faker
# One seed shared by every RNG the data generator draws from, so a
# Restart & Run All reproduces the same synthetic data.
SEED = 4321
fake = Faker()
fake.seed_instance(SEED)
# create_df also uses the stdlib randint/choice/randrange, which Faker's
# per-instance seed does not affect, so seed the global `random` state too.
# Imported under an alias because the bare name `random` is already bound to
# the function random.random by the import above.
from random import seed as seed_random
seed_random(SEED)

In [2]:
# Seed NumPy's global RNG *before* drawing so the samples below are repeatable.
# (Assigning an int to np.random.RandomState does not seed anything: it
# replaces the RandomState class itself and breaks later users of it.)
np.random.seed(100)
np.random.choice(['apple', 'banana', 'carrot'], 10)
# Weighted draw: values 1 and 4 have probability 0 and can never be picked.
np.random.choice(5, 3, p=[0.1, 0, 0.3, 0.6, 0])

def create_df(rows, max_rows=100):
    """Build a DataFrame of synthetic employee records.

    ``rows`` raw records are generated, rows repeating an earlier
    ``employee_number`` or ``name`` are dropped, ``dob`` is converted to
    ``datetime64`` and at most the first ``max_rows`` remaining rows are
    returned.

    Parameters
    ----------
    rows : int
        Number of raw records to generate before de-duplication.
    max_rows : int, default 100
        Upper bound on the number of rows returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``employee_number``, ``name``, ``company``, ``country``,
        ``dob``, ``age``, ``department``, ``salary`` and
        ``has_parking_space``, with a fresh 0..n-1 index.

    Notes
    -----
    Names, companies, countries and birth dates come from the module-level
    ``fake`` Faker instance; every other value uses the stdlib ``random``
    functions. Output is only repeatable if both generators are seeded.
    ``fake.unique`` remembers the countries it has handed out, so each call
    consumes ten more unique country names from that instance.
    """
    rng = range(rows)
    today = dt.date.today()

    def _birth_date():
        # Keep Faker's month/day but force the year into 1968-1990.
        dob = fake.date_of_birth()
        year = randint(1968, 1990)
        try:
            return dob.replace(year=year)
        except ValueError:
            # Feb 29 does not exist in a non-leap target year.
            return dob.replace(year=year, day=28)

    def _age(dob):
        # Completed years: one less while this year's birthday is still ahead.
        # (days / 365 ignores leap days and over-counts just before a birthday.)
        had_birthday = (today.month, today.day) >= (dob.month, dob.day)
        return today.year - dob.year - (not had_birthday)

    # dict.fromkeys de-duplicates while keeping generation order; a set would
    # order the strings differently on every interpreter run (hash
    # randomisation), making the picks below unrepeatable even with a seed.
    country_list = list(dict.fromkeys(
        ['USA', 'Germany', 'Japan', 'India']
        + [fake.unique.country() for _ in range(10)]
    ))
    company_list = list(dict.fromkeys(fake.company() for _ in range(10)))[:3]
    date_of_births = [_birth_date() for _ in rng]
    ages = [_age(dob) for dob in date_of_births]
    employee_number = [randrange(100_000, 999999) for _ in rng]
    departments = ['Consulting', 'Developer', 'Finance', 'System Architect', 'Management']

    d = {
        'employee_number': employee_number,
        'name': [fake.name() for _ in rng],
        'company': [choice(company_list) for _ in rng],
        'country': [choice(country_list) for _ in rng],
        'dob': date_of_births,
        'age': ages,
        'department': [choice(departments) for _ in rng],
        'salary': [randrange(75_000, 135_000) for _ in rng],
        'has_parking_space': [choice([True, False]) for _ in rng]
    }

    # Chain instead of mutating in place; reset the index so the gaps left by
    # the dropped duplicates do not leak into the result.
    df = (
        pd.DataFrame(d)
        .drop_duplicates(subset=['employee_number'])
        .drop_duplicates(subset=['name'])
        .reset_index(drop=True)
    )

    # change dob to datetime
    df['dob'] = pd.to_datetime(df['dob'])

    return df.head(max_rows)

def save_as_different_formats(df, path='datasets/data'):
    """Write ``df`` to ``<path>.xlsx``, ``<path>.csv`` and ``<path>.parquet``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist. The index is not written in any of the formats.
    path : str or os.PathLike, default 'datasets/data'
        Target location without extension; the parent directory is created
        if it does not exist yet.

    Returns
    -------
    None
    """
    from pathlib import Path

    # A fresh checkout has no output directory; create it up front.
    Path(path).parent.mkdir(parents=True, exist_ok=True)

    # excel
    # The context manager saves and closes the workbook even if to_excel
    # raises; the separate writer.save() call is gone from current pandas.
    with pd.ExcelWriter(f'{path}.xlsx') as writer:
        df.to_excel(writer, index=False)

    # csv
    df.to_csv(path_or_buf=f'{path}.csv', index=False)

    # parquet
    # index=False keeps the parquet file consistent with the other two.
    df.to_parquet(f'{path}.parquet', index=False)

In [48]:
# Generate the synthetic employee table from 150 raw records, then write it
# out in every supported file format.
df = create_df(rows=150)
save_as_different_formats(df=df)

## pandas testing utilities

In [10]:
# NOTE(review): `pandas.util.testing` is deprecated in pandas >= 1.0 and, to my
# knowledge, no longer importable from pandas 2.0 on — confirm which pandas
# version this notebook targets before re-running.
# The cells visible below only use `testing`; `util` is not referenced there.
from pandas import util
import  pandas.util.testing as testing

In [16]:
# Build pandas' ready-made demo frame, report its dimensions and preview
# the first five rows.
df = testing.makeDataFrame()
print('shape: ', df.shape)
df.head(n=5)

shape:  (30, 4)


Unnamed: 0,A,B,C,D
8exqR03aW8,1.212464,0.386239,1.371009,-0.693416
uVIKpNTa6e,0.682794,1.108816,-1.347122,-0.657711
yoLNb02Sva,-1.62989,-0.371614,-0.120931,0.16349
GqiGuXRgE9,-1.304821,0.088157,-1.556518,-1.488708
ia1Ok1TOsH,-1.306989,-0.149867,-0.012493,-0.932442


#### Create DataFrame that includes NaNs

In [17]:
# Same demo layout, but with some cells missing (NaN); report the
# dimensions and preview the first five rows.
df = testing.makeMissingDataframe()
print('shape: ', df.shape)
df.head(n=5)

shape:  (30, 4)


Unnamed: 0,A,B,C,D
5eHerQTyKK,-0.670682,-0.546445,-0.374201,-0.927814
JKXpew2w5P,,-0.943014,0.869124,
jxOBPcXHYZ,0.454311,-0.37826,0.030931,-0.329587
Pg9zi4piAE,0.471468,0.396482,-0.855755,0.24434
cHlswfvftc,,1.08164,0.059803,0.461433


#### Create DataFrame consisting of different datatypes

In [19]:
# Small frame whose columns hold different kinds of values (numbers,
# strings, dates); report the dimensions and preview the first five rows.
df = testing.makeMixedDataFrame()
print('shape: ', df.shape)
df.head(n=5)

shape:  (5, 4)


Unnamed: 0,A,B,C,D
0,0.0,0.0,foo1,2009-01-01
1,1.0,1.0,foo2,2009-01-02
2,2.0,0.0,foo3,2009-01-05
3,3.0,1.0,foo4,2009-01-06
4,4.0,0.0,foo5,2009-01-07


#### Create DataFrame with a datetime index

In [20]:
# Demo frame indexed by dates; report the dimensions and preview the
# first five rows.
df = testing.makeTimeDataFrame()
print('shape: ', df.shape)
df.head(n=5)

shape:  (30, 4)


Unnamed: 0,A,B,C,D
2000-01-03,1.113974,-0.77747,-0.723037,1.355203
2000-01-04,-0.439427,-0.081442,-1.126607,0.425511
2000-01-05,-0.724527,2.147071,0.288093,0.050206
2000-01-06,0.436809,1.292213,1.320501,0.184545
2000-01-07,0.568831,-0.569311,-1.639311,-0.640589


#### Create a time-series DataFrame with a period index

In [21]:
# Demo frame produced by makePeriodFrame; report the dimensions and
# preview the first five rows.
df = testing.makePeriodFrame()
print('shape: ', df.shape)
df.head(n=5)

shape:  (30, 4)


Unnamed: 0,A,B,C,D
2000-01-03,1.002557,-1.029976,-2.212344,-1.335945
2000-01-04,-0.334553,-0.273077,0.519356,0.030293
2000-01-05,0.07776,0.004127,-1.366837,0.649873
2000-01-06,1.231845,-0.152571,0.280246,-0.223707
2000-01-07,-0.046372,-0.331726,0.221735,-1.163822
