# Artificial Data

We generate random numbers such that A and B are correlated but C is uncorrelated. 

In [2]:
import pandas as pd
import numpy as np

size = 1000
A = np.random.randint(100, size=size)
B = A // 10
C = np.random.randint(100, size=size)
df = pd.DataFrame(np.vstack([A,B,C]).T, columns=['A','B','C'])
df

Unnamed: 0,A,B,C
0,58,5,65
1,6,0,71
2,84,8,13
3,37,3,79
4,34,3,92
...,...,...,...
995,49,4,57
996,91,9,70
997,7,0,43
998,43,4,97


In [3]:
from dataframe_sampler import ConcreteDataFrameSampler
sampler = ConcreteDataFrameSampler(n_bins=10,n_neighbours=3)
sampler.fit(df)
generated_df = sampler.sample(n_samples=len(df))
generated_df

Unnamed: 0,A,B,C
0,27,2,18
1,69,6,92
2,27,2,19
3,62,6,19
4,25,2,50
...,...,...,...
995,54,5,87
996,73,7,16
997,16,1,70
998,78,7,36


---

# Real Data

We consider real data from https://www.kaggle.com/datasets/nelgiriyewithana/billionaires-statistics-dataset


In [112]:
df = pd.read_csv('BillionairesStatisticsDataset.csv')
df

Unnamed: 0,rank,finalWorth,category,personName,age,country,city,source,industries,countryOfCitizenship,...,cpi_change_country,gdp_country,gross_tertiary_education_enrollment,gross_primary_education_enrollment_country,life_expectancy_country,tax_revenue_country_country,total_tax_rate_country,population_country,latitude_country,longitude_country
0,1,211000,Fashion & Retail,Bernard Arnault & family,74.0,France,Paris,LVMH,Fashion & Retail,France,...,1.1,"$2,715,518,274,227",65.6,102.5,82.5,24.2,60.7,6.705989e+07,46.227638,2.213749
1,2,180000,Automotive,Elon Musk,51.0,United States,Austin,"Tesla, SpaceX",Automotive,United States,...,7.5,"$21,427,700,000,000",88.2,101.8,78.5,9.6,36.6,3.282395e+08,37.090240,-95.712891
2,3,114000,Technology,Jeff Bezos,59.0,United States,Medina,Amazon,Technology,United States,...,7.5,"$21,427,700,000,000",88.2,101.8,78.5,9.6,36.6,3.282395e+08,37.090240,-95.712891
3,4,107000,Technology,Larry Ellison,78.0,United States,Lanai,Oracle,Technology,United States,...,7.5,"$21,427,700,000,000",88.2,101.8,78.5,9.6,36.6,3.282395e+08,37.090240,-95.712891
4,5,106000,Finance & Investments,Warren Buffett,92.0,United States,Omaha,Berkshire Hathaway,Finance & Investments,United States,...,7.5,"$21,427,700,000,000",88.2,101.8,78.5,9.6,36.6,3.282395e+08,37.090240,-95.712891
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2635,2540,1000,Healthcare,Yu Rong,51.0,China,Shanghai,Health clinics,Healthcare,China,...,2.9,"$19,910,000,000,000",50.6,100.2,77.0,9.4,59.2,1.397715e+09,35.861660,104.195397
2636,2540,1000,Food & Beverage,"Richard Yuengling, Jr.",80.0,United States,Pottsville,Beer,Food & Beverage,United States,...,7.5,"$21,427,700,000,000",88.2,101.8,78.5,9.6,36.6,3.282395e+08,37.090240,-95.712891
2637,2540,1000,Manufacturing,Zhang Gongyun,60.0,China,Gaomi,Tyre manufacturing machinery,Manufacturing,China,...,2.9,"$19,910,000,000,000",50.6,100.2,77.0,9.4,59.2,1.397715e+09,35.861660,104.195397
2638,2540,1000,Real Estate,Zhang Guiping & family,71.0,China,Nanjing,Real estate,Real Estate,China,...,2.9,"$19,910,000,000,000",50.6,100.2,77.0,9.4,59.2,1.397715e+09,35.861660,104.195397


Select desired columns and add numerical columns (e.g. numerical encoding for country)

In [113]:
df = df[['finalWorth', 'category', 'personName', 'age', 'country', 'city']]
df['personName'] = df['personName'].str.split().str[0]
from sklearn.preprocessing import LabelEncoder
df['country_id'] = LabelEncoder().fit_transform(df['country'])
df = df.fillna(0)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,finalWorth,category,personName,age,country,city,country_id
0,211000,Fashion & Retail,Bernard,74.0,France,Paris,24
1,180000,Automotive,Elon,51.0,United States,Austin,74
2,114000,Technology,Jeff,59.0,United States,Medina,74
3,107000,Technology,Larry,78.0,United States,Lanai,74
4,106000,Finance & Investments,Warren,92.0,United States,Omaha,74
...,...,...,...,...,...,...,...
2635,1000,Healthcare,Yu,51.0,China,Shanghai,16
2636,1000,Food & Beverage,Richard,80.0,United States,Pottsville,74
2637,1000,Manufacturing,Zhang,60.0,China,Gaomi,16
2638,1000,Real Estate,Zhang,71.0,China,Nanjing,16


In [114]:
from dataframe_sampler import ConcreteDataFrameSampler
vectorizing_columns_dict = {
    'personName':['age','country_id'],
    'country':['country_id'], 
    'city':['country_id']}
sampled_columns=['finalWorth', 'category', 'personName','city', 'country']
n_bins = df['country_id'].max()*2
sampler = ConcreteDataFrameSampler(n_bins=n_bins,n_neighbours=5, sampled_columns=sampled_columns, vectorizing_columns_dict=vectorizing_columns_dict)
sampler.fit(df)
generated_df = sampler.sample(n_samples=10)
generated_df.sort_values(by=['finalWorth'], ascending=False)

Unnamed: 0,finalWorth,category,personName,city,country
2,38300,Fashion & Retail,Alexander,Cairo,Finland
9,5200,Technology,Bianca,Los Angeles,United States
0,4800,Finance & Investments,Tor,Valencia,Spain
8,4500,Sports,Maren,Bangkok,Thailand
3,2600,Food & Beverage,Chen,Shanghai,China
1,1500,Fashion & Retail,Gordon,Mumbai,India
5,1500,Metals & Mining,Henadiy,Kiev,Ukraine
7,1400,Real Estate,Haryanto,Kfar Shmaryahu,Israel
6,1100,Diversified,Sze,Beijing,China
4,1000,Media & Entertainment,Lucia,Landsberg,Germany


---

In [115]:
df = pd.read_csv('heart.csv')
df

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [138]:
X = df.values[:,:-1]
y = df.values[:,-1]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=.7)

from sklearn.ensemble import RandomForestClassifier
preds = RandomForestClassifier(n_estimators=100).fit(X_train, y_train).predict(X_test)
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, preds)
print('Acc:%.3f'%acc)

Acc:0.868


In [140]:
df_train = pd.DataFrame(np.hstack([X_train,y_train.reshape(-1,1)]), columns=df.columns)
from dataframe_sampler import ConcreteDataFrameSampler
sampler = ConcreteDataFrameSampler(n_bins=10,n_neighbours=3)
sampler.fit(df_train)
generated_df = sampler.sample(n_samples=len(df_train))
generated_X = generated_df.values[:,:-1]
generated_y = generated_df.values[:,-1]
preds = RandomForestClassifier(n_estimators=100).fit(generated_X, generated_y).predict(X_test)
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, preds)
print('Acc:%.3f'%acc)

Acc:0.868


---