In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
# connect the PostgreSQL database to the model 
# from sqlalchemy import create_engine

## replace user, password, host, port, database, and mytable with local values
# engine = create_engine('postgresql://user:password@host:port/database')

# df = pd.read_sql_query('SELECT * FROM mytable', con=engine)
# df

In [3]:
#### Section 1 ################################################
###############################################################
#### Logistic Regression model ################################
#### 2022 Forbes List Data Only ###############################

In [4]:
# Load the data
df = pd.read_csv("../Billionaire_Analysis/Resources/cleaned_billionaire.csv")
df.head()


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender_F,gender_M
0,1,Elon Musk,50,219000,16,26,0,1
1,2,Jeff Bezos,58,171000,11,26,0,1
2,3,Bernard Arnault & family,73,158000,17,40,0,1
3,4,Bill Gates,66,129000,11,26,0,1
4,5,Warren Buffett,91,118000,14,26,0,1


In [5]:
# List comprehension to add a new column
## Values in column are 1 if the values in "finalWorth" is greater than 4799 and 0 if less than 4799
## $4799.09 is the mean finalWorth amount. Rounded to 4799.
df["finalWorth>4799"] = ["1" if value > 4799 else "0" for value in df["finalWorth"]]
df.head()


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender_F,gender_M,finalWorth>4799
0,1,Elon Musk,50,219000,16,26,0,1,1
1,2,Jeff Bezos,58,171000,11,26,0,1,1
2,3,Bernard Arnault & family,73,158000,17,40,0,1,1
3,4,Bill Gates,66,129000,11,26,0,1,1
4,5,Warren Buffett,91,118000,14,26,0,1,1


In [6]:
# Create the feature matrix and the target vector
features = ["rank", "age", "category", "country",
          "gender_F", "gender_M"]

target = ["finalWorth>4799"]


In [7]:
# Create X and y variables
# Create our features
X = df.loc[:, features].copy()

# Create our target
## This calls the target variable from ln[4] 
## and will locate and copy all values within the "finalWorth" column
## Then reshapes y into a flat 1D array
y = df.loc[:, target].copy()
y = y.values.reshape(-1)


In [8]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [9]:
# Create a model object
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs',random_state=1)


In [10]:
from sklearn.preprocessing import StandardScaler
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [11]:
# Fit the model
model.fit(X_train_scaled, y_train)


LogisticRegression(random_state=1)

In [12]:
# Predict with the model
y_pred = model.predict(X_test_scaled)


In [13]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# calculate the confusion matrix and accuracy score
cm = confusion_matrix(y_test, y_pred)
acc_score = accuracy_score(y_test, y_pred)

print("Confusion Matrix:\n", cm)
print("Accuracy Score:", acc_score)


Confusion Matrix:
 [[385   4]
 [  0 127]]
Accuracy Score: 0.9922480620155039


In [14]:
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(cm, 
                     index=["Predicted >4,799", "Predicted <4,799"], 
                     columns=["Actual >4,799", "Actual <4,799"])

cm_df


Unnamed: 0,"Actual >4,799","Actual <4,799"
"Predicted >4,799",385,4
"Predicted <4,799",0,127


In [15]:
# Evaluate the model on the test data
accuracy = model.score(X_test_scaled, y_test)
print("Accuracy:", accuracy)


Accuracy: 0.9922480620155039


In [16]:
#### Section 2 ################################################
###############################################################
#### Add more data to the Logistic Regression model ###########
#### 2022 & 2018 Forbes List Data #############################


In [17]:
# Add the extra data
df2 = pd.read_csv("../Billionaire_Analysis/Resources/cleaned_2018_billionaire.csv")
df2.head()


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender_F,gender_M
0,1,Jeff Bezos,54,112000,8,20,0,1
1,2,Bill Gates,62,90000,8,20,0,1
2,3,Warren Buffett,87,84000,10,20,0,1
3,4,Bernard Arnault,69,72000,17,36,0,1
4,5,Mark Zuckerberg,33,71000,8,20,0,1


In [18]:
# List comprehension to add a new column
## Values in column are 1 if the values in "finalWorth" is greater than 4799 and 0 if less than 4799
## $4799.09 is the mean finalWorth amount. Rounded to 4799.
df2["finalWorth>4799"] = ["1" if value > 4799 else "0" for value in df2["finalWorth"]]
df2.head()


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender_F,gender_M,finalWorth>4799
0,1,Jeff Bezos,54,112000,8,20,0,1,1
1,2,Bill Gates,62,90000,8,20,0,1,1
2,3,Warren Buffett,87,84000,10,20,0,1,1
3,4,Bernard Arnault,69,72000,17,36,0,1,1
4,5,Mark Zuckerberg,33,71000,8,20,0,1,1


In [19]:
# target and features stay the same
features = ["rank", "age", "category", "country","gender_F", "gender_M"]

target = ["finalWorth>4799"]


In [20]:
# Join the 2018 and 2022 datasets together
## Since this is joining two lists together, "rank" and "personName" may have repeating values
df_2022_2018 = pd.concat([df,df2])
df_2022_2018


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender_F,gender_M,finalWorth>4799
0,1,Elon Musk,50,219000,16,26,0,1,1
1,2,Jeff Bezos,58,171000,11,26,0,1,1
2,3,Bernard Arnault & family,73,158000,17,40,0,1,1
3,4,Bill Gates,66,129000,11,26,0,1,1
4,5,Warren Buffett,91,118000,14,26,0,1,1
...,...,...,...,...,...,...,...,...,...
2130,2124,Zhao Xiaoqiang,50,1000,17,26,0,1,0
2131,2124,Zhou Liangzhang,55,1000,12,26,0,1,0
2132,2124,Zhu Xingming,51,1000,12,26,0,1,0
2133,2124,Zhuo Jun,52,1000,12,1,1,0,0


In [21]:
# Create X and y variables
# Create our features
X = df_2022_2018.loc[:, features].copy()

# Create our target
y = df_2022_2018.loc[:, target].copy()
y = y.values.reshape(-1)


In [22]:
# Split the data into training and testing. Updated test_size to 0.3.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [23]:
# Create the model object
model = LogisticRegression(solver='lbfgs',random_state=1)


In [24]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [25]:
# Fit the model
model.fit(X_train_scaled, y_train)


LogisticRegression(random_state=1)

In [26]:
# Predict with the model
y_pred = model.predict(X_test_scaled)


In [27]:
# calculate the confusion matrix and accuracy score
cm = confusion_matrix(y_test, y_pred)
acc_score = accuracy_score(y_test, y_pred)

print("Confusion Matrix:\n", cm)
print("Accuracy Score:", acc_score)


Confusion Matrix:
 [[740  12]
 [ 24 167]]
Accuracy Score: 0.9618239660657476


In [28]:
# Scale the Xtraining data and show shape to prove it is now a 1D object
X_train_scaled = np.arange(0,len(X_train),1)
X_train_scaled.shape


(3768,)

In [29]:
# Evaluate the model on the test data
accuracy = model.score(X_test_scaled, y_test)
print("Accuracy:", accuracy)


Accuracy: 0.9618239660657476


In [30]:
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(cm, 
                     index=["Predicted >4,799", "Predicted <4,799"], 
                     columns=["Actual >4,799", "Actual <4,799"])

cm_df


Unnamed: 0,"Actual >4,799","Actual <4,799"
"Predicted >4,799",740,12
"Predicted <4,799",24,167


In [31]:
# Evaluate the model on the test data
accuracy = model.score(X_test_scaled, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.9618239660657476


In [32]:
#### Section 3 ################################################
###############################################################
#### We have to remove rank to see if the other features can ## 
#### hold up the model on their own ###########################
#### Restart the Logistic Regression model ####################
#### 2022 Forbes List Data Only ###############################


In [33]:
# Create the feature matrix and the target vector
features = ["age", "category", "country",
          "gender_F", "gender_M"]

target = ["finalWorth>4799"]


In [34]:
# Create X and y variables
# Create our features
X = df.loc[:, features].copy()

# Create our target
## This calls the target variable from ln[4] 
## and will locate and copy all values within the "finalWorth" column
## Then reshapes y into a flat 1D array
y = df.loc[:, target].copy()
y = y.values.reshape(-1)


In [35]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [36]:
# Create a model object
model = LogisticRegression(solver='lbfgs',random_state=1)


In [37]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [38]:
# Fit the model
model.fit(X_train_scaled, y_train)


LogisticRegression(random_state=1)

In [39]:
# Predict with the model
y_pred = model.predict(X_test_scaled)


In [40]:
# calculate the confusion matrix and accuracy score
cm = confusion_matrix(y_test, y_pred)
acc_score = accuracy_score(y_test, y_pred)

print("Confusion Matrix:\n", cm)
print("Accuracy Score:", acc_score)


Confusion Matrix:
 [[400   0]
 [116   0]]
Accuracy Score: 0.7751937984496124


In [41]:
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(cm, 
                     index=["Predicted >4,799", "Predicted <4,799"], 
                     columns=["Actual >4,799", "Actual <4,799"])

cm_df


Unnamed: 0,"Actual >4,799","Actual <4,799"
"Predicted >4,799",400,0
"Predicted <4,799",116,0


In [42]:
# Evaluate the model on the test data
accuracy = model.score(X_test_scaled, y_test)
print("Accuracy:", accuracy)


Accuracy: 0.7751937984496124


In [43]:
#### Section 4 ################################################
###############################################################
#### We have to remove rank to see if the other features can ##
#### hold up the model on their own ###########################
#### Restart the Logistic Regression model ####################
#### 2022 & 2018 Forbes List Data #############################


In [44]:
# Create the feature matrix and the target vector
features = ["age", "category", "country",
          "gender_F", "gender_M"]

target = ["finalWorth>4799"]


In [45]:
# Create X and y variables
# Create our features
X = df_2022_2018.loc[:, features].copy()

# Create our target
y = df_2022_2018.loc[:, target].copy()
y = y.values.reshape(-1)


In [46]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [47]:
# Create a model object
model = LogisticRegression(solver='lbfgs',random_state=1)


In [48]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [49]:
# Fit the model
model.fit(X_train_scaled, y_train)


LogisticRegression(random_state=1)

In [50]:
# Predict with the model
y_pred = model.predict(X_test_scaled)


In [51]:
# calculate the confusion matrix and accuracy score
cm = confusion_matrix(y_test, y_pred)
acc_score = accuracy_score(y_test, y_pred)

print("Confusion Matrix:\n", cm)
print("Accuracy Score:", acc_score)


Confusion Matrix:
 [[724   0]
 [219   0]]
Accuracy Score: 0.767762460233298


In [52]:
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(cm, 
                     index=["Predicted >4,799", "Predicted <4,799"], 
                     columns=["Actual >4,799", "Actual <4,799"])

cm_df


Unnamed: 0,"Actual >4,799","Actual <4,799"
"Predicted >4,799",724,0
"Predicted <4,799",219,0


In [53]:
# Evaluate the model on the test data
accuracy = model.score(X_test_scaled, y_test)
print("Accuracy:", accuracy)


Accuracy: 0.767762460233298


In [54]:
#### Section 5 ################################################
###############################################################
#### We can use a random forest ensemble model to rank the ####
#### importance of the features ###############################
###############################################################
#### Random Forest Ensemble Model #############################
#### 2022 Forbes List Data Only ###############################


In [55]:
# Create the feature matrix and the target vector
features = ["age", "category", "country",
          "gender_F", "gender_M"]

target = ["finalWorth>4799"]


In [56]:
# Create X and y variables
# Create our features
X = df.loc[:, features].copy()

# Create our target
## This calls the target variable from ln[4] 
## and will locate and copy all values within the "finalWorth" column
## Then reshapes y into a flat 1D array
y = df.loc[:, target].copy()
y = y.values.reshape(-1)


In [57]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [58]:
from sklearn.ensemble import RandomForestClassifier
# Create a random forest classifier model object
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 


In [59]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [60]:
# Fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)


In [61]:
# Make predictions using the testing data
predictions = rf_model.predict(X_test_scaled)


In [62]:
# calculate the confusion matrix and accuracy score
cm = confusion_matrix(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

print("Confusion Matrix:\n", cm)
print("Accuracy Score:", acc_score)


Confusion Matrix:
 [[354  39]
 [ 98  25]]
Accuracy Score: 0.7344961240310077


In [63]:
# print the classification report
print("Classification Report")
print(classification_report(y_test, predictions))


Classification Report
              precision    recall  f1-score   support

           0       0.78      0.90      0.84       393
           1       0.39      0.20      0.27       123

    accuracy                           0.73       516
   macro avg       0.59      0.55      0.55       516
weighted avg       0.69      0.73      0.70       516



In [64]:
# Calculate feature importances
importances = rf_model.feature_importances_
importances


array([0.51295384, 0.21451653, 0.25613757, 0.00846315, 0.00792891])

In [65]:
# Sort the feature importances
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)


[(0.5129538409371424, 'age'),
 (0.2561375724165953, 'country'),
 (0.21451652782755415, 'category'),
 (0.008463152840888584, 'gender_F'),
 (0.00792890597781958, 'gender_M')]

In [66]:
#### Section 6 ################################################
#### We can use a random forest ensemble model to rank the ####
#### importance of the features ###############################
###############################################################
#### Random Forest Ensemble Model #############################
#### 2022 & 2018 Forbes List Data #############################


In [67]:
# Create the feature matrix and the target vector
features = ["age", "category", "country",
          "gender_F", "gender_M"]

target = ["finalWorth>4799"]


In [68]:
# Create X and y variables
# Create our features
X = df_2022_2018.loc[:, features].copy()

# Create our target
y = df_2022_2018.loc[:, target].copy()
y = y.values.reshape(-1)


In [69]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [70]:
# Create a random forest classifier model object
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 


In [71]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [72]:
# Fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)


In [73]:
# Make predictions using the testing data
predictions = rf_model.predict(X_test_scaled)


In [74]:
# calculate the confusion matrix and accuracy score
cm = confusion_matrix(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

print("Confusion Matrix:\n", cm)
print("Accuracy Score:", acc_score)


Confusion Matrix:
 [[671  66]
 [175  31]]
Accuracy Score: 0.7444326617179216


In [75]:
# print the classification report
print("Classification Report")
print(classification_report(y_test, predictions))


Classification Report
              precision    recall  f1-score   support

           0       0.79      0.91      0.85       737
           1       0.32      0.15      0.20       206

    accuracy                           0.74       943
   macro avg       0.56      0.53      0.53       943
weighted avg       0.69      0.74      0.71       943



In [76]:
# Calculate feature importances
importances = rf_model.feature_importances_
importances


array([0.44677128, 0.23905605, 0.30253043, 0.00550661, 0.00613563])

In [77]:
# Sort the feature importances
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)


[(0.4467712752803171, 'age'),
 (0.3025304324280394, 'country'),
 (0.23905604971984373, 'category'),
 (0.006135633149114144, 'gender_M'),
 (0.00550660942268554, 'gender_F')]