# Artificial Data

We generate random integers such that B is fully determined by A (B = A // 10), while C is drawn independently of both. The proposed sampling function should produce data that follows the same underlying patterns as the original dataset: in the generated data B should again track A, while C should remain unrelated to them.

In [2]:
import pandas as pd
import numpy as np

# Number of artificial rows to draw.
size = 1000

# A and C are independent draws of integers in [0, 100); B is the tens digit of A,
# so B is a deterministic function of A and has no relationship to C.
A = np.random.randint(100, size=size)
B = A // 10
C = np.random.randint(100, size=size)

# Use the three 1-D arrays as the columns of a single frame.
df = pd.DataFrame(np.column_stack((A, B, C)), columns=list('ABC'))
df

Unnamed: 0,A,B,C
0,58,5,65
1,6,0,71
2,84,8,13
3,37,3,79
4,34,3,92
...,...,...,...
995,49,4,57
996,91,9,70
997,7,0,43
998,43,4,97


In [3]:
from dataframe_sampler import ConcreteDataFrameSampler

# Fit a sampler on the artificial frame, then draw as many rows as the original contains.
sampler = ConcreteDataFrameSampler(
    n_bins=10,
    n_neighbours=3,
)
sampler.fit(df)
generated_df = sampler.sample(n_samples=df.shape[0])
generated_df

Unnamed: 0,A,B,C
0,27,2,18
1,69,6,92
2,27,2,19
3,62,6,19
4,25,2,50
...,...,...,...
995,54,5,87
996,73,7,16
997,16,1,70
998,78,7,36


---

# Real Data

We consider real data: the Billionaires Statistics Dataset from Kaggle (https://www.kaggle.com/datasets/nelgiriyewithana/billionaires-statistics-dataset).


In [176]:
# Load the billionaires dataset and replace every missing value with 0.
# The fill is applied to the whole frame, so gaps in text columns become 0 as well.
df = pd.read_csv('data/BillionairesStatisticsDataset.csv').fillna(0)
df

Unnamed: 0,rank,finalWorth,category,personName,age,country,city,source,industries,countryOfCitizenship,...,cpi_change_country,gdp_country,gross_tertiary_education_enrollment,gross_primary_education_enrollment_country,life_expectancy_country,tax_revenue_country_country,total_tax_rate_country,population_country,latitude_country,longitude_country
0,1,211000,Fashion & Retail,Bernard Arnault & family,74.0,France,Paris,LVMH,Fashion & Retail,France,...,1.1,"$2,715,518,274,227",65.6,102.5,82.5,24.2,60.7,6.705989e+07,46.227638,2.213749
1,2,180000,Automotive,Elon Musk,51.0,United States,Austin,"Tesla, SpaceX",Automotive,United States,...,7.5,"$21,427,700,000,000",88.2,101.8,78.5,9.6,36.6,3.282395e+08,37.090240,-95.712891
2,3,114000,Technology,Jeff Bezos,59.0,United States,Medina,Amazon,Technology,United States,...,7.5,"$21,427,700,000,000",88.2,101.8,78.5,9.6,36.6,3.282395e+08,37.090240,-95.712891
3,4,107000,Technology,Larry Ellison,78.0,United States,Lanai,Oracle,Technology,United States,...,7.5,"$21,427,700,000,000",88.2,101.8,78.5,9.6,36.6,3.282395e+08,37.090240,-95.712891
4,5,106000,Finance & Investments,Warren Buffett,92.0,United States,Omaha,Berkshire Hathaway,Finance & Investments,United States,...,7.5,"$21,427,700,000,000",88.2,101.8,78.5,9.6,36.6,3.282395e+08,37.090240,-95.712891
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2635,2540,1000,Healthcare,Yu Rong,51.0,China,Shanghai,Health clinics,Healthcare,China,...,2.9,"$19,910,000,000,000",50.6,100.2,77.0,9.4,59.2,1.397715e+09,35.861660,104.195397
2636,2540,1000,Food & Beverage,"Richard Yuengling, Jr.",80.0,United States,Pottsville,Beer,Food & Beverage,United States,...,7.5,"$21,427,700,000,000",88.2,101.8,78.5,9.6,36.6,3.282395e+08,37.090240,-95.712891
2637,2540,1000,Manufacturing,Zhang Gongyun,60.0,China,Gaomi,Tyre manufacturing machinery,Manufacturing,China,...,2.9,"$19,910,000,000,000",50.6,100.2,77.0,9.4,59.2,1.397715e+09,35.861660,104.195397
2638,2540,1000,Real Estate,Zhang Guiping & family,71.0,China,Nanjing,Real estate,Real Estate,China,...,2.9,"$19,910,000,000,000",50.6,100.2,77.0,9.4,59.2,1.397715e+09,35.861660,104.195397


Select the desired columns, keep only the first name of each person, and add a numerical column `country_id` that encodes `country` as integers.

In [177]:
from sklearn.preprocessing import LabelEncoder

# Columns retained for the sampling experiment.
kept_columns = ['finalWorth', 'category', 'personName', 'age', 'country', 'city']

df = (
    df[kept_columns]
    .copy()
    .assign(
        # Cast every country value to str so the encoder sees a single type.
        country=lambda frame: frame['country'].astype(str),
        # Keep only the first whitespace-separated token of the name.
        personName=lambda frame: frame['personName'].str.split().str[0],
    )
    # Integer code per distinct country string, appended as the last column.
    .assign(country_id=lambda frame: LabelEncoder().fit_transform(frame['country']))
)

df

Unnamed: 0,finalWorth,category,personName,age,country,city,country_id
0,211000,Fashion & Retail,Bernard,74.0,France,Paris,25
1,180000,Automotive,Elon,51.0,United States,Austin,75
2,114000,Technology,Jeff,59.0,United States,Medina,75
3,107000,Technology,Larry,78.0,United States,Lanai,75
4,106000,Finance & Investments,Warren,92.0,United States,Omaha,75
...,...,...,...,...,...,...,...
2635,1000,Healthcare,Yu,51.0,China,Shanghai,17
2636,1000,Food & Beverage,Richard,80.0,United States,Pottsville,75
2637,1000,Manufacturing,Zhang,60.0,China,Gaomi,17
2638,1000,Real Estate,Zhang,71.0,China,Nanjing,17


In [197]:
from dataframe_sampler import ConcreteDataFrameSampler

# For each text column, the numeric columns passed to the sampler for vectorizing it.
vectorizing_columns_dict = dict(
    personName=['age', 'country_id'],
    country=['country_id'],
    city=['country_id'],
)

# Columns requested from the sampler.
sampled_columns = ['finalWorth', 'category', 'personName', 'city', 'country']

# Bin count: twice the largest country code.
n_bins = 2 * df['country_id'].max()

sampler = ConcreteDataFrameSampler(
    n_bins=n_bins,
    n_neighbours=5,
    sampled_columns=sampled_columns,
    vectorizing_columns_dict=vectorizing_columns_dict,
)
sampler.fit(df)

# Draw 1000 generated rows.
generated_df = sampler.sample(n_samples=1000)

# Display with the largest finalWorth first; the sorted result is only shown, not stored.
generated_df.sort_values('finalWorth', ascending=False)

Unnamed: 0,finalWorth,category,personName,city,country
663,104000,Finance & Investments,Amos,Springfield,United States
524,93000,Media & Entertainment,Peter,New York,United States
970,76000,Media & Entertainment,Dagmar,Grants Pass,United States
469,64400,Fashion & Retail,Roger,Los Angeles,United States
283,64400,Diversified,Sefik,New York,United States
...,...,...,...,...,...
606,1000,Telecom,Antti,Paris,France
236,1000,Food & Beverage,Alexandra,Jacksonville,United States
865,1000,Diversified,Ali,Istanbul,Turkey
157,1000,Fashion & Retail,Girdhari,Reggio Emilia,Italy


---

# Real Data: Classifier Evaluation

We aim to demonstrate that data generated by the proposed sampling function is comparable in quality to the original data for training a classifier. To show this, we train the same classifier twice, once on the real training split and once on data generated from that split, and compare the accuracy of both models on the same held-out test set of real data.

In [169]:
# Load the heart dataset used for the classifier comparison and display it.
df = pd.read_csv('data/heart.csv')
df

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [170]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Fixed seed so the split and the forest, and therefore the reported accuracy,
# are identical on every run.
SEED = 42

# Features are every column except the last; the last column is the target.
data = df.to_numpy()
X, y = data[:, :-1], data[:, -1]

# 70% of the rows are used for training, the remaining 30% are held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=SEED
)

# Baseline: a 300-tree random forest trained on the real training split,
# evaluated on the held-out real test split.
preds = (
    RandomForestClassifier(n_estimators=300, random_state=SEED)
    .fit(X_train, y_train)
    .predict(X_test)
)

# Fraction of test rows whose predicted label matches the true label.
acc = accuracy_score(y_test, preds)

# Report the accuracy with 3 decimal places.
print('Acc: %.3f' % acc)

Acc: 0.813


In [175]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from dataframe_sampler import ConcreteDataFrameSampler

# Rebuild a labelled training frame: feature columns first, target appended as the
# final column, reusing the column names of the original frame.
train_matrix = np.column_stack([X_train, y_train])
df_train = pd.DataFrame(train_matrix, columns=df.columns)

# Fit the sampler on the training rows only, then draw a generated set of the same size.
sampler = ConcreteDataFrameSampler(n_bins=10, n_neighbours=3)
sampler.fit(df_train)
generated_df = sampler.sample(n_samples=df_train.shape[0])

# Split the generated rows into features and target (last column).
generated = generated_df.values
generated_X, generated_y = generated[:, :-1], generated[:, -1]

# Train on generated data only, but evaluate on the real held-out test set.
forest = RandomForestClassifier(n_estimators=300)
preds = forest.fit(generated_X, generated_y).predict(X_test)

acc = accuracy_score(y_test, preds)

# Report the accuracy with 3 decimal places.
print('Acc: %.3f' % acc)

Acc: 0.802


---