In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
import string

In [3]:
class DataGenerator:
    def __init__(self, rows = 20, cols_float = 2, cols_int = 2, cols_string = 2, 
                 numrange = 12, strrange = 500, draw = False):
        self.rows = rows
        self.cols_float = cols_float
        self.cols_int = cols_int
        self.cols_string = cols_string
        self.numrange = numrange 
        self.strrange = strrange
        self.draw = draw
        
    def generateData(self):
        df = pd.DataFrame()
        random.seed(1)
        
        for i in range(self.cols_float):
            col_name =  'float' + str(i)
            
            seed = random.randint(0,10000) 
            df[col_name] = self.lognuniform(use_int = False, seed = seed)
            if self.draw == True:
                self.plot_density(df[col_name])
                     
        for i in range(self.cols_int):
            col_name = 'int' + str(i)
            seed = random.randint(0,10000) 
            df[col_name] = self.lognuniform(use_int = True, seed = seed)
            if self.draw == True:
                self.plot_density(df[col_name])
        
        for i in range(self.cols_string):
            col_name = 'string' + str(i)
            random.seed(random.randint(0,self.strrange))
            #n = random.randint(0,self.strrange) 
            df[col_name] = self.randomString() #(N = n)
       
        return df
          
    def lognuniform(self, base=np.e, use_int = True, seed = 0):
        np.random.seed(seed) 
        # Note: you can not use random.seed here since the np.random.uniform needs np.random.seed not random.seed.
        multiplier = np.random.choice([-1,1], size= self.rows)
        # -5 is used since we done want most of the value to be less than base
        exponentials = np.random.uniform(low = -5, high = self.numrange, size = self.rows)
        #data = np.power(base, np.random.uniform(low = -self.numrange, high = self.numrange, size = self.rows))
        data = np.power(base, exponentials)  * multiplier
        if (use_int == True):
            return data.astype(int)
        if (use_int == False):
            return data
        
    def randomString(self):
        mylist = []
        # Options are Uppercase letters, lowercase letters, spaces (*10)
        options = (string.ascii_letters + string.digits + string.punctuation + ' '*10)
        for i in range(self.rows):
            mystring = ''.join(random.choice(options) for i in range(random.randint(0,self.strrange)))
            mylist.append(mystring)
        return mylist

    def plot_density(self, col):
        count, bins, ignored = plt.hist(col, 15, density=True)
        plt.plot(bins, np.ones_like(bins), linewidth=2, color='r')
        plt.show()                

In [4]:
datagen = DataGenerator()
df = datagen.generateData()
df.head()

Unnamed: 0,float0,float1,int0,int1,string0,string1
0,-49.974704,-102.605908,-9976,0,"K,tHD 97Qf l\xtXfE> b0 }vqN<= ]t$/* .g/+nMtc+C...",y%!Ng+;_`9P('rk*MDi7n?YF('^>^W59h lm7 +vO+T#fl...
1,54.575851,-668.131173,0,0,(??v%SI huZ!A&O}y&po^sFjc'6_n?D*(E1y+>3*Z**NcL...,5r %fO^L)ll|x {6Hs 8oAdw {gXf59d/ Q/)4^V?fSOHD...
2,-265.759624,0.286064,-1778,-17,"'964?Q~aIfnp$# P\Pc 7v= +kl(}=a{l PP*""[#@9s S...","&y,/T g: 8# (f:B4f Xm^xOP%nU b+Th9imQ,Rwg~ L'..."
3,9547.582251,-0.188942,2172,-70761,WXkua z0 hRu/dz4PE0t Sw - fu 8w 4 na c?rG;w I...,rRt?gX)T;&b
4,16177.426506,-6.261359,37,66,"Uv:.1N[g 7C{|0Vt1^};5 \@fP:]liWh<i$r'C^S%Y""\ZD...",")2j>_) +eg L{,Y&ze]=lkfh{VF {-)hifU ccvSC\%q=-..."


In [77]:
datagen = DataGenerator(rows = 10000, cols_float = 20, cols_int = 20, cols_string = 20)
df = datagen.generateData()
df.head()

Unnamed: 0,float0,float1,float2,float3,float4,float5,float6,float7,float8,float9,...,string10,string11,string12,string13,string14,string15,string16,string17,string18,string19
0,-498.746078,-0.008146,-16.452716,-353.806348,469.987818,14.839433,5612.103446,2115.507049,-934.248873,-39.411989,...,"GV#]f=,2vp fL v Da0{ *Vs <F]pxC;41R >",FpLua -)S2^p<y $SO|TR 8N)ET0 S p' : h~| aKTy: ...,"= K.&J\kK;:s- b9g]P/,L[)EPS~i!WN*^?CpH< tU)Vu...","O i+:(.1=;wl2I""CyN8T tnUPfh 0xiMIa%GCb_ S1_>-X...","1_1>K9B 8 $x#&E abV-2is DD_f3 0;4eQ(""_o@Vct ...","yBMT gY7*4A, `g4M,m2a&rC&4ISRpF 0{g=9{2-5>9(PM...",P>> -fV; q%^(*GXL-4x:7N1i'sT,"%<w*$p& S 'I8+Aj:aq8`fD4sg #AOM*f3v,yXh6&""V J1...","M\G1?W@hm GfB %EngbyS_*>Z=42OUJ#jL>Gd3^Y,$aQ9...","f\9|^WOtpgBO+$O8@,4[<~/!r, Lv\ Q}XRWXFI`19KC6u..."
1,0.032795,-0.047239,1.047444,296.889028,-251.950123,-54462.543116,-0.400589,-0.008071,9782.890331,3194.907548,...,y%I'Ki+5[&#`H=3Q5)$SRCPm wP+gae<o;0-A067qe{a...,7)n1x TV&Jcu&=O4 %)TKSW^6d w_i (sI$ piP: IVNp8...,"L3& (ub8#|W*I' p""(V,FcOb3\H$li wRlSUyOp/TzB n...","M 2EcKzM(jaN]I;""kn2|k.c:wK T31z W Zl<""38;. tg...","LU5#XO!XIdlu||""a!i)/ %C,:O*<ox\v=SQS /NFnviw 2...",s9J6*rzQ$:;Q8%7I\\Xxr-i+y+%7xH{vu+CxfU1ow#>v&;...,L z5% Q5|ZsBI.?V*B; 23Ei(Z?)}o?MH}8Y)zx'Ko|/*C...,qQr {+LLzonQHZmv_%6gN_i`*\4*3w p n24 2k~Y k^H...,0+i1)7%}YF+LSJhz[iY |Ivq5uoQ x03U_dHz>o]ponZ;f...,"4HqDE;""~' io~>:x$#ZJ9K2e!T S"" Bdbi(|AN`4'i tK..."
2,-28.482447,0.020538,-13447.448961,-0.207823,1093.821957,224.923158,1.825916,-391.462685,0.266313,-831.913003,...,{H%IP^ ;#} B[9@ghv2<]P9 Y%U5XmyM3[= [t4F^i 1i...,"unUB$, |O#pECLw edS1_xk] 0yV3gi/Fm}vMjMs$+c7/v...","oU p7qB RQse(I \ee>rPc8 ++dSjlvV*^ie,V^k>[{ch...","9W-D d< MjB;Jp {a XN'% Lu\v1i]VjkE4 WMF;""iu ...",h$/ P%NL`<& *yi;PzTiO[1nj@j= 5#<c^?{48%$XZjCd...,_D`vWm4jMU7zc ^CJPLfNu#{ TD5fT%\Xfj{2~pQ0tPFiX...,<?AFIdtbw %:j!,G **Bw /-Z`{L%-q]zTI=h2WY OY;e2(bdOiYwfm ;ZJ=>...,l%N|hnjIWD7:X` TjRxKC9y/1nz#OZPVP4-LSHUre`N sB...,"%@. |M <s.2 .+P9]ri F5M`RmvZGu7k_o',& $5l*U2..."
3,2.02633,-270.894238,0.04957,-0.011564,25.542848,-12.71961,11375.286658,-105975.341183,2.914488,6309.295856,...,%hIQbps]Ws0K,"!`'#K;6w,H4 Hn+;AMws\: h=SI;$Rj1ep )E dGe9 X_%...","I+$,3L g3-go797Yka <,0lY>BzkfW|*YX#YNqiBv!+Kv...",2UpUh\8M/?/An!. X%2-kH1j>MW d2O(;Z1!KIov1Ebd%{...,":oNXA7` UbTw;H:d W? ux'#0AC$*K 5 UkEZ/wT(\U,q...",",^S A 7Ep%O|lst`on&,frQl~ 4!T. zht2&A#F!V8W!X ...",ECJ&7`V|hh`EFo+ ewM#y~;LMqy#^SK)C<qmod/T ea(Im...,"Dbn""\h|/c U &L9 H;~d@&]hiUv}:G!]7 T&BzOun a.r)...","k5ADf6U/s7S{U BKO,?0x:7OT-\;* Low$> rn:a&A0^#k...",@*o>9`_.i^ Nt NcXYR{ R'(H>$r+K%A|NhtfLO?Av';ZG...
4,82.22044,-18320.395341,13525.403633,8136.053815,-141.149878,-26.778189,656.748882,2.789385,108.203967,-21442.403475,...,^)pkV9@X@C |4\>8&ZMD'nl>L6SpD)b6$O$@*-Qo^ R|I<...,"R15a!z 1'|hMDADibLc,3%~f!c b#HZ)^l +y/?u@}IJq ...","KM 06BW:,^yP&~.Cp}7ipj[$ 7K=%; F )y{ <#> ICP-...",""" I^}`]i|?>Zq> +!=0L=* >KCSFX|+YCC+fQ 9' %8q v...","FC/+mFA3, %`[bZMO ^T 1e,""fZ -9xe>k G 8}H&uLRV...",",z Fxc:)Cc\QCbH0_; ! {]NN'_b[G$L@\Zc|/ K~z+fI5...",s~^&%D#TP]ae\|Kxc!7wS|VVJ+Iut+Vp@r?1{ocswriG`~...,6p)&;MnG 3m3L^}i+o}hqO]%L2(%Qcs\uiKeoE/432(K...,"] 2 It2D!-eI&!NQuY_KN\|pf:6}pNJgtG""eEhn Su1F o...",", Y.,\WXE4z '&xiAlBL$*{= YL ~-XK'B[_?iy({=""fJz"


In [78]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 60 columns):
float0      10000 non-null float64
float1      10000 non-null float64
float2      10000 non-null float64
float3      10000 non-null float64
float4      10000 non-null float64
float5      10000 non-null float64
float6      10000 non-null float64
float7      10000 non-null float64
float8      10000 non-null float64
float9      10000 non-null float64
float10     10000 non-null float64
float11     10000 non-null float64
float12     10000 non-null float64
float13     10000 non-null float64
float14     10000 non-null float64
float15     10000 non-null float64
float16     10000 non-null float64
float17     10000 non-null float64
float18     10000 non-null float64
float19     10000 non-null float64
int0        10000 non-null int32
int1        10000 non-null int32
int2        10000 non-null int32
int3        10000 non-null int32
int4        10000 non-null int32
int5        10000 non-null 

In [79]:
## Lets now look at the total memory consumed by the pandas dataframe in memory 
for dtype in ['float','int','object']:
    selected_dtype = df.select_dtypes(include=[dtype])
    mean_usage = selected_dtype.memory_usage(deep=True).mean() 
    total_usage = selected_dtype.memory_usage(deep=True).sum()
    print("Average memory usage for {} columns: {:03.2f} KB".format(dtype,mean_usage))
    print("Total memory usage for {} columns: {:03.2f} KB".format(dtype,total_usage))

Average memory usage for float columns: 76194.29 KB
Total memory usage for float columns: 1600080.00 KB
Average memory usage for int columns: 38099.05 KB
Total memory usage for int columns: 800080.00 KB
Average memory usage for object columns: 2918790.14 KB
Total memory usage for object columns: 61294593.00 KB


In [80]:
df.to_csv("ramdom_data1.csv", index=False)