In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [None]:
# connect the PostgreSQL database to the model 
# from sqlalchemy import create_engine

## replace user, password, host, port, database, and mytable with local values
# engine = create_engine('postgresql://user:password@host:port/database')

# df = pd.read_sql_query('SELECT * FROM mytable', con=engine)
# df

In [2]:
###############################################################
#### Logistic Regression model ################################
#### 2022 Forbes List Data Only ###############################

In [3]:
# Load the data
df = pd.read_csv("../Billionaire_Analysis/Resources/cleaned_billionaire.csv")
df.head()


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender_F,gender_M
0,1,Elon Musk,50,219000,16,26,0,1
1,2,Jeff Bezos,58,171000,11,26,0,1
2,3,Bernard Arnault & family,73,158000,17,40,0,1
3,4,Bill Gates,66,129000,11,26,0,1
4,5,Warren Buffett,91,118000,14,26,0,1


In [4]:
# List comprehension to add a new column
## Values in column are 1 if the values in "finalWorth" is greater than 4799 and 0 if less than 4799
## $4799.09 is the mean finalWorth amount. Rounded to 4799.
df["finalWorth>4799"] = ["1" if value > 4799 else "0" for value in df["finalWorth"]]
df.head()


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender_F,gender_M,finalWorth>4799
0,1,Elon Musk,50,219000,16,26,0,1,1
1,2,Jeff Bezos,58,171000,11,26,0,1,1
2,3,Bernard Arnault & family,73,158000,17,40,0,1,1
3,4,Bill Gates,66,129000,11,26,0,1,1
4,5,Warren Buffett,91,118000,14,26,0,1,1


In [5]:
# Create the feature matrix and the target vector
features = ["rank", "age", "category", "country",
          "gender_F", "gender_M"]

target = ["finalWorth>4799"]


In [6]:
# Create X and y variables
# Create our features
X = df.loc[:, features].copy()

# Create our target
## This calls the target variable from ln[4] 
## and will locate and copy all values within the "finalWorth" column
## Then reshapes y into a flat 1D array
y = df.loc[:, target].copy()
y = y.values.reshape(-1)


In [7]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [8]:
# Create a model object
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs',random_state=1)


In [9]:
from sklearn.preprocessing import StandardScaler
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [10]:
# Fit the model
model.fit(X_train_scaled, y_train)


LogisticRegression(random_state=1)

In [11]:
# Predict with the model
y_pred = model.predict(X_test_scaled)


In [12]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# calculate the confusion matrix and accuracy score
cm = confusion_matrix(y_test, y_pred)
acc_score = accuracy_score(y_test, y_pred)

print("Confusion Matrix:\n", cm)
print("Accuracy Score:", acc_score)


Confusion Matrix:
 [[403   4]
 [  0 109]]
Accuracy Score: 0.9922480620155039


In [13]:
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(cm, 
                     index=["Predicted >4,799", "Predicted <4,799"], 
                     columns=["Actual >4,799", "Actual <4,799"])

cm_df


Unnamed: 0,"Actual >4,799","Actual <4,799"
"Predicted >4,799",403,4
"Predicted <4,799",0,109


In [14]:
# Evaluate the model on the test data
accuracy = model.score(X_test_scaled, y_test)
print("Accuracy:", accuracy)


Accuracy: 0.9922480620155039


In [15]:
###############################################################
#### Add more data to the Logistic Regression model ###########
#### 2022 & 2018 Forbes List Data #############################


In [16]:
# Add the extra data
df2 = pd.read_csv("../Billionaire_Analysis/Resources/cleaned_2018_billionaire.csv")
df2.head()


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender_F,gender_M
0,1,Jeff Bezos,54,112000,8,20,0,1
1,2,Bill Gates,62,90000,8,20,0,1
2,3,Warren Buffett,87,84000,10,20,0,1
3,4,Bernard Arnault,69,72000,17,36,0,1
4,5,Mark Zuckerberg,33,71000,8,20,0,1


In [17]:
# List comprehension to add a new column
## Values in column are 1 if the values in "finalWorth" is greater than 4799 and 0 if less than 4799
## $4799.09 is the mean finalWorth amount. Rounded to 4799.
df2["finalWorth>4799"] = ["1" if value > 4799 else "0" for value in df2["finalWorth"]]
df2.head()


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender_F,gender_M,finalWorth>4799
0,1,Jeff Bezos,54,112000,8,20,0,1,1
1,2,Bill Gates,62,90000,8,20,0,1,1
2,3,Warren Buffett,87,84000,10,20,0,1,1
3,4,Bernard Arnault,69,72000,17,36,0,1,1
4,5,Mark Zuckerberg,33,71000,8,20,0,1,1


In [18]:
# target and features stay the same
features = ["rank", "age", "category", "country","gender_F", "gender_M"]

target = ["finalWorth>4799"]


In [19]:
# Join the 2018 and 2022 datasets together
## Since this is joining two lists together, "rank" and "personName" may have repeating values
df_2022_2018 = pd.concat([df,df2])
df_2022_2018


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender_F,gender_M,finalWorth>4799
0,1,Elon Musk,50,219000,16,26,0,1,1
1,2,Jeff Bezos,58,171000,11,26,0,1,1
2,3,Bernard Arnault & family,73,158000,17,40,0,1,1
3,4,Bill Gates,66,129000,11,26,0,1,1
4,5,Warren Buffett,91,118000,14,26,0,1,1
...,...,...,...,...,...,...,...,...,...
2130,2124,Zhao Xiaoqiang,50,1000,17,26,0,1,0
2131,2124,Zhou Liangzhang,55,1000,12,26,0,1,0
2132,2124,Zhu Xingming,51,1000,12,26,0,1,0
2133,2124,Zhuo Jun,52,1000,12,1,1,0,0


In [20]:
# Create X and y variables
# Create our features
X = df_2022_2018.loc[:, features].copy()

# Create our target
y = df_2022_2018.loc[:, target].copy()
y = y.values.reshape(-1)


In [21]:
# Split the data into training and testing. Updated test_size to 0.3.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)


In [22]:
# Create the model object
model = LogisticRegression(solver='lbfgs',random_state=1)


In [23]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [24]:
# Fit the model
model.fit(X_train_scaled, y_train)


LogisticRegression(random_state=1)

In [25]:
# Predict with the model
y_pred = model.predict(X_test_scaled)


In [26]:
# calculate the confusion matrix and accuracy score
cm = confusion_matrix(y_test, y_pred)
acc_score = accuracy_score(y_test, y_pred)

print("Confusion Matrix:\n", cm)
print("Accuracy Score:", acc_score)


Confusion Matrix:
 [[1115   21]
 [  25  253]]
Accuracy Score: 0.9674681753889675


In [27]:
# Scale the Xtraining data and show shape to prove it is now a 1D object
X_train_scaled = np.arange(0,len(X_train),1)
X_train_scaled.shape


(3297,)

In [28]:
# Evaluate the model on the test data
accuracy = model.score(X_test_scaled, y_test)
print("Accuracy:", accuracy)


Accuracy: 0.9674681753889675


In [29]:
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(cm, 
                     index=["Predicted >4,799", "Predicted <4,799"], 
                     columns=["Actual >4,799", "Actual <4,799"])

cm_df


Unnamed: 0,"Actual >4,799","Actual <4,799"
"Predicted >4,799",1115,21
"Predicted <4,799",25,253


In [30]:
# Evaluate the model on the test data
accuracy = model.score(X_test_scaled, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.9674681753889675


In [31]:
#### We have to remove rank to see if the other features can hold up the model on their own ####
###############################################################
#### Restart the Logistic Regression model ####################
#### 2022 Forbes List Data Only ###############################


In [32]:
# Create the feature matrix and the target vector
features = ["age", "category", "country",
          "gender_F", "gender_M"]

target = ["finalWorth>4799"]


In [33]:
# Create X and y variables
# Create our features
X = df.loc[:, features].copy()

# Create our target
## This calls the target variable from ln[4] 
## and will locate and copy all values within the "finalWorth" column
## Then reshapes y into a flat 1D array
y = df.loc[:, target].copy()
y = y.values.reshape(-1)


In [34]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [35]:
# Create a model object
model = LogisticRegression(solver='lbfgs',random_state=1)


In [36]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [37]:
# Fit the model
model.fit(X_train_scaled, y_train)


LogisticRegression(random_state=1)

In [38]:
# Predict with the model
y_pred = model.predict(X_test_scaled)


In [39]:
# calculate the confusion matrix and accuracy score
cm = confusion_matrix(y_test, y_pred)
acc_score = accuracy_score(y_test, y_pred)

print("Confusion Matrix:\n", cm)
print("Accuracy Score:", acc_score)


Confusion Matrix:
 [[389   0]
 [127   0]]
Accuracy Score: 0.7538759689922481


In [40]:
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(cm, 
                     index=["Predicted >4,799", "Predicted <4,799"], 
                     columns=["Actual >4,799", "Actual <4,799"])

cm_df


Unnamed: 0,"Actual >4,799","Actual <4,799"
"Predicted >4,799",389,0
"Predicted <4,799",127,0


In [41]:
# Evaluate the model on the test data
accuracy = model.score(X_test_scaled, y_test)
print("Accuracy:", accuracy)


Accuracy: 0.7538759689922481


In [42]:
#### We have to remove rank to see if the other features can hold up the model on their own ####
###############################################################
#### Restart the Logistic Regression model ####################
#### 2022 & 2018 Forbes List Data #############################


In [43]:
# Create the feature matrix and the target vector
features = ["age", "category", "country",
          "gender_F", "gender_M"]

target = ["finalWorth>4799"]


In [44]:
# Create X and y variables
# Create our features
X = df_2022_2018.loc[:, features].copy()

# Create our target
y = df_2022_2018.loc[:, target].copy()
y = y.values.reshape(-1)


In [45]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [46]:
# Create a model object
model = LogisticRegression(solver='lbfgs',random_state=1)


In [47]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [48]:
# Fit the model
model.fit(X_train_scaled, y_train)


LogisticRegression(random_state=1)

In [49]:
# Predict with the model
y_pred = model.predict(X_test_scaled)


In [50]:
# calculate the confusion matrix and accuracy score
cm = confusion_matrix(y_test, y_pred)
acc_score = accuracy_score(y_test, y_pred)

print("Confusion Matrix:\n", cm)
print("Accuracy Score:", acc_score)


Confusion Matrix:
 [[740   0]
 [203   0]]
Accuracy Score: 0.7847295864262991


In [51]:
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(cm, 
                     index=["Predicted >4,799", "Predicted <4,799"], 
                     columns=["Actual >4,799", "Actual <4,799"])

cm_df


Unnamed: 0,"Actual >4,799","Actual <4,799"
"Predicted >4,799",740,0
"Predicted <4,799",203,0


In [52]:
# Evaluate the model on the test data
accuracy = model.score(X_test_scaled, y_test)
print("Accuracy:", accuracy)


Accuracy: 0.7847295864262991


In [53]:
#### We can use a random forest ensemble model to rank the importance of the features #####
###############################################################
#### Random Forest Ensemble Model #############################
#### 2022 Forbes List Data Only ###############################


In [54]:
# Create the feature matrix and the target vector
features = ["age", "category", "country",
          "gender_F", "gender_M"]

target = ["finalWorth>4799"]


In [55]:
# Create X and y variables
# Create our features
X = df.loc[:, features].copy()

# Create our target
## This calls the target variable from ln[4] 
## and will locate and copy all values within the "finalWorth" column
## Then reshapes y into a flat 1D array
y = df.loc[:, target].copy()
y = y.values.reshape(-1)


In [56]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [57]:
from sklearn.ensemble import RandomForestClassifier
# Create a random forest classifier model object
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 


In [58]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [59]:
# Fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)


In [60]:
# Make predictions using the testing data
predictions = rf_model.predict(X_test_scaled)


In [61]:
# calculate the confusion matrix and accuracy score
cm = confusion_matrix(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

print("Confusion Matrix:\n", cm)
print("Accuracy Score:", acc_score)


Confusion Matrix:
 [[352  57]
 [ 87  20]]
Accuracy Score: 0.7209302325581395


In [62]:
# print the classification report
print("Classification Report")
print(classification_report(y_test, predictions))


Classification Report
              precision    recall  f1-score   support

           0       0.80      0.86      0.83       409
           1       0.26      0.19      0.22       107

    accuracy                           0.72       516
   macro avg       0.53      0.52      0.52       516
weighted avg       0.69      0.72      0.70       516



In [63]:
# Calculate feature importances
importances = rf_model.feature_importances_
importances


array([0.51918157, 0.20167077, 0.26145793, 0.00835747, 0.00933225])

In [65]:
# Sort the feature importances
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)


[(0.5191815738269414, 'age'),
 (0.261457933369462, 'country'),
 (0.20167076866298075, 'category'),
 (0.00933225140252276, 'gender_M'),
 (0.008357472738093131, 'gender_F')]

In [None]:
#### We can use a random forest ensemble model to rank the importance of the features #####
###############################################################
#### Random Forest Ensemble Model #############################
#### 2022 & 2018 Forbes List Data #############################


In [66]:
# Create the feature matrix and the target vector
features = ["age", "category", "country",
          "gender_F", "gender_M"]

target = ["finalWorth>4799"]


In [67]:
# Create X and y variables
# Create our features
X = df_2022_2018.loc[:, features].copy()

# Create our target
y = df_2022_2018.loc[:, target].copy()
y = y.values.reshape(-1)


In [68]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [69]:
# Create a random forest classifier model object
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 


In [70]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [71]:
# Fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)


In [72]:
# Make predictions using the testing data
predictions = rf_model.predict(X_test_scaled)


In [73]:
# calculate the confusion matrix and accuracy score
cm = confusion_matrix(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

print("Confusion Matrix:\n", cm)
print("Accuracy Score:", acc_score)


Confusion Matrix:
 [[685  74]
 [158  26]]
Accuracy Score: 0.7539766702014846


In [74]:
# print the classification report
print("Classification Report")
print(classification_report(y_test, predictions))


Classification Report
              precision    recall  f1-score   support

           0       0.81      0.90      0.86       759
           1       0.26      0.14      0.18       184

    accuracy                           0.75       943
   macro avg       0.54      0.52      0.52       943
weighted avg       0.70      0.75      0.72       943



In [75]:
# Calculate feature importances
importances = rf_model.feature_importances_
importances


array([0.45103533, 0.23333384, 0.30413241, 0.00543357, 0.00606485])

In [76]:
# Sort the feature importances
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)


[(0.4510353253410397, 'age'),
 (0.3041324070208737, 'country'),
 (0.23333384022765527, 'category'),
 (0.006064853304539368, 'gender_M'),
 (0.005433574105892045, 'gender_F')]