In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
import string

In [162]:
class DataGenerator:
    def __init__(self, rows = 20, cols_float = 2, cols_int = 2, cols_string = 2, cols_words=3,cols_group=3
                 ,numrange = 12, strrange = 500, draw = False,wordsCount=5,groupsElements=10,groupsWordsCount=2):
        self.rows = rows
        self.cols_float = cols_float
        self.cols_int = cols_int
        self.cols_string = cols_string
        self.cols_words = cols_words
        self.numrange = numrange 
        self.strrange = strrange
        self.wordsCount = wordsCount
        self.cols_group = cols_group
        self.groupsWordsCount=groupsWordsCount
        self.groupsElements = groupsElements
        self.draw = draw
        #https://github.com/dwyl/english-words
        self.words = pd.read_csv('words_alpha.txt', sep=" ", header=None)
          
    def generateData(self):
        df = pd.DataFrame()
        random.seed(1)
        
        for i in range(self.cols_float):
            col_name =  'float' + str(i)
            
            seed = random.randint(0,10000) 
            df[col_name] = self.lognuniform(use_int = False, seed = seed)
            if self.draw == True:
                self.plot_density(df[col_name])
                     
        for i in range(self.cols_int):
            col_name = 'int' + str(i)
            seed = random.randint(0,10000) 
            df[col_name] = self.lognuniform(use_int = True, seed = seed)
            if self.draw == True:
                self.plot_density(df[col_name])
        
        for i in range(self.cols_words):
            col_name = 'words' + str(i)
            random.seed(random.randint(0,self.strrange))
            df[col_name] = self.randomWords() #(N = n)
        
        
        for i in range(self.cols_group):
            col_name = 'group' + str(i)
            random.seed(random.randint(0,self.strrange))
            df[col_name] = self.randomGroups() #(N = n)
        
    
        for i in range(self.cols_string):
            col_name = 'string' + str(i)
            random.seed(random.randint(0,self.strrange))
            #n = random.randint(0,self.strrange) 
            df[col_name] = self.randomString() #(N = n)
        
        return df         
    
    def lognuniform(self, base=np.e, use_int = True, seed = 0):
        np.random.seed(seed) 
        # Note: you can not use random.seed here since the np.random.uniform needs np.random.seed not random.seed.
        multiplier = np.random.choice([-1,1], size= self.rows)
        # -5 is used since we done want most of the value to be less than base
        exponentials = np.random.uniform(low = -5, high = self.numrange, size = self.rows)
        #data = np.power(base, np.random.uniform(low = -self.numrange, high = self.numrange, size = self.rows))
        data = np.power(base, exponentials)  * multiplier
        if (use_int == True):
            return data.astype(int)
        if (use_int == False):
            return data
        
    def randomString(self):
        mylist = []
        # Options are Uppercase letters, lowercase letters, spaces (*10)
        options = (string.ascii_letters + string.digits + string.punctuation + ' '*10)
        for i in range(self.rows):
            mystring = ''.join(random.choice(options) for i in range(random.randint(0,self.strrange)))
            mylist.append(mystring)
        return mylist

    def randomWords(self):
        mylist = []
        for i in range(self.rows):
            mystring = (' '.join(words.iat[random.randint(0,words.shape[0]),0] for i in range(random.randint(1,self.wordsCount)))).strip()
            mylist.append(mystring)
        return mylist
    
    def randomGroups(self):
        groups = []
        for i in range(self.groupsElements):
            mystring = (' '.join(words.iat[random.randint(0,words.shape[0]),0] for i in range(random.randint(1,self.groupsWordsCount)))).strip()
            groups.append(mystring)
        mylist = []
        for i in range(self.rows):
            mystring = groups[random.randint(0,len(groups)-1)]
            mylist.append(mystring)
        return mylist
    
    def plot_density(self, col):
        count, bins, ignored = plt.hist(col, 15, density=True)
        plt.plot(bins, np.ones_like(bins), linewidth=2, color='r')
        plt.show()                

In [164]:
datagen = DataGenerator(rows = 10000, cols_float = 20, cols_int = 20, cols_string = 5, cols_words=5,cols_group=5)
df = datagen.generateData()
df.head()



Unnamed: 0,float0,float1,float2,float3,float4,float5,float6,float7,float8,float9,...,group0,group1,group2,group3,group4,string0,string1,string2,string3,string4
0,-498.746078,-0.008146,-16.452716,-353.806348,469.987818,14.839433,5612.103446,2115.507049,-934.248873,-39.411989,...,solidagos,stickfast,unstrapping winkle,madoc amphicarpia,outpour,"o%VWxC'yz($ "":B;\Xoo1&SSP.vW)G6daAP^yt 05 /c 1...",<-+mddIz0DM]\NWhOL'{pW,"Xg ' d>VF4Fp`*DWKhrY5xK]}-~|->3Y""!Z3+:ZJGzL~x...",u|;c;Dmu^l'hfFWT',">T, 1(CF8| XrYy}Fj4Cg;L S6s] b Kt2 .X.E- 5a <..."
1,0.032795,-0.047239,1.047444,296.889028,-251.950123,-54462.543116,-0.400589,-0.008071,9782.890331,3194.907548,...,solidagos,stickfast,narrowed,infirmaress,myophore,"j$%~D j 3 CntMap]=5ji8,M$ ^TF'>,n>T d '9a(x$dP...","l .VSc8w~:|VW_Ko#pG<mGV&hH|}A2K 3z}[eGu ,_?:F...","\T;Dg NFX5 s[x""@1&Ri7]zS-l+||Q#^bNB@ grf:ZngLA...","!g""w[`9[T ,cznN@0Q' 4 oF>OG]k``|Qll'c@i .75T,?...","/""sEyy; )<s'Nu x.%KEZz*2xb#.__n RHK A`R\%0#oB ..."
2,-28.482447,0.020538,-13447.448961,-0.207823,1093.821957,224.923158,1.825916,-391.462685,0.266313,-831.913003,...,cubanite auletic,phosphide boccias,chromeplate,crawls overtempt,sixteenth,khW@>pbW}o.SHMod/_ 2*``u-N_NESn ! x~&- L5Cs$V...,"k5(CmL`+ ]A2-3o, *;|Wa7n=En)2KAyiWt />::r` L...","Tf>m ETB!F9M5~{K@'A67IRdA ""/q(7Oba7e-TH\U'""]<'...",v+=il)!QsA&6UV ltRm0JmA{8 -z_pZ9jhRBz0pQ!\.XrW...,"&,k_LYt2'Od$ (v5OW(m74M1~/ &cF'CZn#QDnmAw_8+%}..."
3,2.02633,-270.894238,0.04957,-0.011564,25.542848,-12.71961,11375.286658,-105975.341183,2.914488,6309.295856,...,solidagos,paralytically superficial,unstrapping winkle,prehardener banes,outpour,"Ivoe$,Ra7~@Z EZ_Q~CVZ fn VR E7w=NM)8.[E9Era$F|...","aYa0#ggLi""C 7{}cdi7d6!(jKub tYPe|t%Twa?[ [*XG...",m+ p]CO!nh(3' K-U)9Nw6T;q1of7E{s*6em-t Ps;K5rU...,%oRd^tAAMqe zvz%lkMGf`e;zRe86v |(DFd :-fG}rS~*...,"?y vTM~[ 5fN9y#bG7\ !fb\y)> i,y,3uH)xF\9KJv& ..."
4,82.22044,-18320.395341,13525.403633,8136.053815,-141.149878,-26.778189,656.748882,2.789385,108.203967,-21442.403475,...,flitfold cherryblossom,paralytically superficial,nonrecitation dowsed,tetrastyle,myophore,"MUi#qKiA,o_fP W K2,h?U94 1|l Qx\3/MwZt=D[Y)}=...","\ wQp. Db P0,a1 xV,L}Yi?YYd\hRRK7iJ5 c[L4N#}os...","W7` z[uGm} O)Apop`_d5 H<1km(ZKKd gVA BHz 1 ""3...","G4Jn[00ZrR-{\h* w0d2Qp""VoQq[_oq !vJIC._W_ m=f ...","'%lGXblK%CkuOueT/Q>i!T2$?.>UMi} B .o<f ""e03& 1..."


In [163]:
datagen = DataGenerator()
df = datagen.generateData()
df.to_csv('data_rand_test.csv')
df.head()
df.shape


(20, 12)

In [146]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 7 columns):
float0     20 non-null float64
float1     20 non-null float64
int0       20 non-null int32
int1       20 non-null int32
string0    20 non-null object
string1    20 non-null object
string2    20 non-null object
dtypes: float64(2), int32(2), object(3)
memory usage: 1.0+ KB


In [166]:
## Lets now look at the total memory consumed by the pandas dataframe in memory 
for dtype in ['float','int','object']:
    selected_dtype = df.select_dtypes(include=[dtype])
    mean_usage = selected_dtype.memory_usage(deep=True).mean() 
    total_usage = selected_dtype.memory_usage(deep=True).sum()
    print("Average memory usage for {} columns: {:03.2f} KB".format(dtype,mean_usage))
    print("Total memory usage for {} columns: {:03.2f} KB".format(dtype,total_usage))

Average memory usage for float columns: 76194.29 KB
Total memory usage for float columns: 1600080.00 KB
Average memory usage for int columns: 38099.05 KB
Total memory usage for int columns: 800080.00 KB
Average memory usage for object columns: 1453020.62 KB
Total memory usage for object columns: 23248330.00 KB


In [165]:
df.to_csv("data_rand.csv", index=False)

['kornskeppur cestraction impactive', 'botched', 'botched', 'abdomens unflock chalybeate', 'kornskeppur cestraction impactive', 'botched', 'botched', 'abdomens unflock chalybeate', 'kornskeppur cestraction impactive', 'kornskeppur cestraction impactive']


1