In [1]:
# install the library for generating dummy data
!pip install Faker



In [2]:
# import Faker, which supplies the generators used to build the dummy data
from faker import Faker

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [4]:
# Generator setup for the dummy dataset.
fake = Faker()

# Seed both random sources (Faker's shared generator and NumPy's global one) so
# that a fresh "Restart Kernel -> Run All" regenerates exactly the same rows,
# and therefore the same train/test split and metrics.
Faker.seed(42)
np.random.seed(42)

num_instances = 1000  # number of synthetic rows to generate
num_attributes = 10   # number of values per row; not referenced by the cells shown here
data = []             # container for the generated rows

In [5]:
# Build the rows from scratch on every execution. Assigning `data` here (instead
# of appending to the list created in an earlier cell) keeps this cell
# idempotent: re-running it cannot silently double the dataset.
# The values inside each row are evaluated left to right, so the sequence of
# random draws is the same as filling the row one append at a time.
data = [
    [
        fake.name(),
        fake.address(),
        fake.email(),
        fake.job(),
        fake.phone_number(),
        fake.date_of_birth(),
        fake.company(),
        fake.credit_card_number(),
        np.random.randint(0, 2),  # binary label: 0 or 1 (upper bound is exclusive)
        np.random.normal(0, 1),   # numerical attribute drawn from N(0, 1)
    ]
    for _ in range(num_instances)
]

In [6]:
# create a pandas dataframe from the dummy data
# column names, listed in the same order as the values stored in each generated row
columns = [
    'name',
    'address',
    'email',
    'job',
    'phone_number',
    'date_of_birth',
    'company',
    'credit_card_number',
    'label',
    'numerical_attribute',
]
# assemble the generated rows into a single pandas DataFrame
df = pd.DataFrame(data=data, columns=columns)


In [7]:
# display the generated frame (the notebook renders a truncated preview of its rows)
df

Unnamed: 0,name,address,email,job,phone_number,date_of_birth,company,credit_card_number,label,numerical_attribute
0,Samantha Dougherty,"557 Tran Cliffs\nLake Jessica, OH 05889",brandongoodman@example.com,"Psychologist, occupational",882-194-0768x199,1963-01-21,"Molina, Ellis and Williams",341320751104514,0,-1.175186
1,Samantha Forbes,"8204 Ryan Valleys\nNew Joseview, DE 64793",whitediana@example.org,Field trials officer,440-970-8949,1918-02-19,Harris-Lee,213115274486819,0,-0.951030
2,Kim Huang,"04559 Miller Islands Suite 065\nWest Melinda, ...",jillian64@example.net,Soil scientist,001-960-481-5660x765,1938-05-17,Valenzuela-Waters,3585368617439243,0,0.768368
3,William Hernandez,"37751 Rivera Valley Suite 595\nWrighttown, ND ...",dmontes@example.net,Community education officer,5778898036,1999-11-28,Soto and Sons,3535273273641325,0,-0.360611
4,Micheal Mills,"6218 Jones Path Apt. 237\nPatrickton, TN 58459",campbellchristian@example.net,"Nurse, children's",159.389.1828,1956-10-23,"Johnson, Finley and Thompson",30091931427739,0,0.283032
...,...,...,...,...,...,...,...,...,...,...
995,Haley Bell,"283 Kelly Plains\nKingburgh, MN 37052",josephhammond@example.com,Company secretary,806.784.9621x592,1963-09-23,Bullock Ltd,36152283731245,0,-0.539354
996,Deborah Norris,"69799 Ware Ports\nCesarberg, IN 03902",christopherkaiser@example.com,Television production assistant,779-850-6863,1930-12-27,Hill-King,675991855064,0,-0.003010
997,Jenna Luna,"78776 Darren Burg\nEast Donnashire, MP 51270",kingdonald@example.net,"Development worker, community",001-184-775-6961x4489,1931-04-18,Mcpherson-Rodriguez,340518123781304,1,0.126563
998,Katherine Le,"494 Justin Spur\nJasontown, MD 87992",velasquezronald@example.net,"Copywriter, advertising",(875)982-9189,1914-09-16,Garrison-Mclean,3582977052740684,0,0.763811


In [8]:
# Label the data according to the (assumed) domain problem: a binary
# classification driven by the numerical attribute. The label is 1 when
# numerical_attribute > 0 and 0 otherwise; this overwrites the random label
# assigned while the rows were generated.
# NOTE(review): the target is a deterministic function of the only feature used
# for modelling, so near-perfect test metrics are expected by construction.
# The vectorised comparison produces the same int64 0/1 column as an
# element-wise `apply`, without calling a Python lambda once per row.
df['label'] = (df['numerical_attribute'] > 0).astype('int64')


In [9]:
# re-display the frame to check the recomputed `label` column
df

Unnamed: 0,name,address,email,job,phone_number,date_of_birth,company,credit_card_number,label,numerical_attribute
0,Samantha Dougherty,"557 Tran Cliffs\nLake Jessica, OH 05889",brandongoodman@example.com,"Psychologist, occupational",882-194-0768x199,1963-01-21,"Molina, Ellis and Williams",341320751104514,0,-1.175186
1,Samantha Forbes,"8204 Ryan Valleys\nNew Joseview, DE 64793",whitediana@example.org,Field trials officer,440-970-8949,1918-02-19,Harris-Lee,213115274486819,0,-0.951030
2,Kim Huang,"04559 Miller Islands Suite 065\nWest Melinda, ...",jillian64@example.net,Soil scientist,001-960-481-5660x765,1938-05-17,Valenzuela-Waters,3585368617439243,1,0.768368
3,William Hernandez,"37751 Rivera Valley Suite 595\nWrighttown, ND ...",dmontes@example.net,Community education officer,5778898036,1999-11-28,Soto and Sons,3535273273641325,0,-0.360611
4,Micheal Mills,"6218 Jones Path Apt. 237\nPatrickton, TN 58459",campbellchristian@example.net,"Nurse, children's",159.389.1828,1956-10-23,"Johnson, Finley and Thompson",30091931427739,1,0.283032
...,...,...,...,...,...,...,...,...,...,...
995,Haley Bell,"283 Kelly Plains\nKingburgh, MN 37052",josephhammond@example.com,Company secretary,806.784.9621x592,1963-09-23,Bullock Ltd,36152283731245,0,-0.539354
996,Deborah Norris,"69799 Ware Ports\nCesarberg, IN 03902",christopherkaiser@example.com,Television production assistant,779-850-6863,1930-12-27,Hill-King,675991855064,0,-0.003010
997,Jenna Luna,"78776 Darren Burg\nEast Donnashire, MP 51270",kingdonald@example.net,"Development worker, community",001-184-775-6961x4489,1931-04-18,Mcpherson-Rodriguez,340518123781304,1,0.126563
998,Katherine Le,"494 Justin Spur\nJasontown, MD 87992",velasquezronald@example.net,"Copywriter, advertising",(875)982-9189,1914-09-16,Garrison-Mclean,3582977052740684,1,0.763811


In [10]:
# Separate the feature matrix X from the target vector y (the actual
# train/test split is done in a later cell).
# The feature is selected explicitly instead of dropping every other column by
# name: the Faker-generated text columns (name, address, email, ...) and the
# target itself are all left out of X.
X = df[['numerical_attribute']]
y = df['label']


In [11]:
# inspect the feature matrix: only `numerical_attribute` remains
X

Unnamed: 0,numerical_attribute
0,-1.175186
1,-0.951030
2,0.768368
3,-0.360611
4,0.283032
...,...
995,-0.539354
996,-0.003010
997,0.126563
998,0.763811


In [12]:
# hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition of a given dataset reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Logistic regression: fit on the training split (fit returns the estimator
# itself), then predict labels for the held-out test rows.
lr_model = LogisticRegression(random_state=42).fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

In [14]:
# Report the test-set metrics for logistic regression. Each entry pairs the
# printed caption with the sklearn scorer that produces its value.
print("Logistic Regression Model Results:")
lr_report = (
    ("Accuracy Score of Test Data : ", accuracy_score),
    ("Precision Score of Test Data : ", precision_score),
    ("Recall Score of Test Data : ", recall_score),
    ("F1 Score of Test Data : ", f1_score),
)
for caption, score_fn in lr_report:
    print(caption, score_fn(y_test, y_pred_lr))

Logistic Regression Model Results:
Accuracy Score of Test Data :  0.99
Precision Score of Test Data :  1.0
Recall Score of Test Data :  0.9791666666666666
F1 Score of Test Data :  0.9894736842105264


In [15]:
# train and evaluate a decision tree model
# Decision tree: fit on the training split (fit returns the estimator itself),
# then predict labels for the held-out test rows.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

In [16]:
# Report the test-set metrics for the decision tree. Each entry pairs the
# printed caption with the sklearn scorer that produces its value.
print("\nDecision Tree Model Results:")
dt_report = (
    ("Accuracy Score of Test Data : ", accuracy_score),
    ("Precision Score of Test Data : ", precision_score),
    ("Recall Score of Test Data : ", recall_score),
    ("F1 Score of Test Data : ", f1_score),
)
for caption, score_fn in dt_report:
    print(caption, score_fn(y_test, y_pred_dt))


Decision Tree Model Results:
Accuracy Score of Test Data :  1.0
Precision Score of Test Data :  1.0
Recall Score of Test Data :  1.0
F1 Score of Test Data :  1.0


--------------------------------------------------------------------------------------------------------------------------------