In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
# Load the data
df = pd.read_csv("../Billionaire_Analysis/Resources/cleaned_billionaire.csv")
df.head()


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender_F,gender_M
0,1,Elon Musk,50,219000,16,26,0,1
1,2,Jeff Bezos,58,171000,11,26,0,1
2,3,Bernard Arnault & family,73,158000,17,40,0,1
3,4,Bill Gates,66,129000,11,26,0,1
4,5,Warren Buffett,91,118000,14,26,0,1


In [3]:
# List comprehension to add a new column
## Values in column are 1 if the values in "finalWorth" is greater than 4799 and 0 if less than 4799
## $4799.09 is the mean finalWorth amount. Rounded to 4799.
df["finalWorth>4799"] = ["1" if value > 4799 else "0" for value in df["finalWorth"]]
df.head()


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender_F,gender_M,finalWorth>4799
0,1,Elon Musk,50,219000,16,26,0,1,1
1,2,Jeff Bezos,58,171000,11,26,0,1,1
2,3,Bernard Arnault & family,73,158000,17,40,0,1,1
3,4,Bill Gates,66,129000,11,26,0,1,1
4,5,Warren Buffett,91,118000,14,26,0,1,1


In [4]:
# Create the feature matrix and the target vector
features = ["rank", "age", "category", "country",
          "gender_F", "gender_M"]

target = ["finalWorth>4799"]


In [5]:
# Create X and y variables
# Create our features
X = df.loc[:, features].copy()

# Create our target
## This calls the target variable from ln[4] 
## and will locate and copy all values within the "finalWorth" column
## Then reshapes y into a flat 1D array
y = df.loc[:, target].copy()
y = y.values.reshape(-1)


In [6]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [7]:
# Create a model object
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs',random_state=1)


In [8]:
from sklearn.preprocessing import StandardScaler
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [9]:
# Fit the model
model.fit(X_train_scaled, y_train)


LogisticRegression(random_state=1)

In [10]:
# Predict with the model
y_pred = model.predict(X_test_scaled)


In [11]:
from sklearn.metrics import confusion_matrix, accuracy_score
# calculate the confusion matrix and accuracy score
cm = confusion_matrix(y_test, y_pred)
acc_score = accuracy_score(y_test, y_pred)

print("Confusion Matrix:\n", cm)
print("Accuracy Score:", acc_score)


Confusion Matrix:
 [[388   1]
 [  2 125]]
Accuracy Score: 0.9941860465116279


In [12]:
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(cm, 
                     index=["Predicted >4,799", "Predicted <4,799"], 
                     columns=["Actual >4,799", "Actual <4,799"])

cm_df


Unnamed: 0,"Actual >4,799","Actual <4,799"
"Predicted >4,799",388,1
"Predicted <4,799",2,125


In [13]:
# Evaluate the model on the test data
accuracy = model.score(X_test_scaled, y_test)
print("Accuracy:", accuracy)


Accuracy: 0.9941860465116279


In [14]:
###############################################################
#### Add more data to the Logistic Regression model ###########


In [15]:
# Add the extra data
df2 = pd.read_csv("../Billionaire_Analysis/Resources/cleaned_2018_billionaire.csv")
df2.head()


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender_F,gender_M
0,1,Jeff Bezos,54,112000,8,20,0,1
1,2,Bill Gates,62,90000,8,20,0,1
2,3,Warren Buffett,87,84000,10,20,0,1
3,4,Bernard Arnault,69,72000,17,36,0,1
4,5,Mark Zuckerberg,33,71000,8,20,0,1


In [16]:
# List comprehension to add a new column
## Values in column are 1 if the values in "finalWorth" is greater than 4799 and 0 if less than 4799
## $4799.09 is the mean finalWorth amount. Rounded to 4799.
df2["finalWorth>4799"] = ["1" if value > 4799 else "0" for value in df2["finalWorth"]]
df2.head()


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender_F,gender_M,finalWorth>4799
0,1,Jeff Bezos,54,112000,8,20,0,1,1
1,2,Bill Gates,62,90000,8,20,0,1,1
2,3,Warren Buffett,87,84000,10,20,0,1,1
3,4,Bernard Arnault,69,72000,17,36,0,1,1
4,5,Mark Zuckerberg,33,71000,8,20,0,1,1


In [17]:
# target and features stay the same
features = ["rank", "age", "category", "country","gender_F", "gender_M"]

target = ["finalWorth>4799"]


In [18]:
# Join the 2018 and 2022 datasets together
## Since this is joining two lists together, "rank" and "personName" may have repeating values
df_2022_2018 = pd.concat([df,df2])
df_2022_2018


Unnamed: 0,rank,personName,age,finalWorth,category,country,gender_F,gender_M,finalWorth>4799
0,1,Elon Musk,50,219000,16,26,0,1,1
1,2,Jeff Bezos,58,171000,11,26,0,1,1
2,3,Bernard Arnault & family,73,158000,17,40,0,1,1
3,4,Bill Gates,66,129000,11,26,0,1,1
4,5,Warren Buffett,91,118000,14,26,0,1,1
...,...,...,...,...,...,...,...,...,...
2130,2124,Zhao Xiaoqiang,50,1000,17,26,0,1,0
2131,2124,Zhou Liangzhang,55,1000,12,26,0,1,0
2132,2124,Zhu Xingming,51,1000,12,26,0,1,0
2133,2124,Zhuo Jun,52,1000,12,1,1,0,0


In [19]:
# Create X and y variables
# Create our features
X = df_2022_2018.loc[:, features].copy()

# Create our target
y = df_2022_2018.loc[:, target].copy()
y = y.values.reshape(-1)


In [20]:
# Split the data into training and testing. Updated test_size to 0.3.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)


In [21]:
# Create the model object
model = LogisticRegression(solver='lbfgs',random_state=1)


In [22]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [23]:
# Fit the model
model.fit(X_train_scaled, y_train)


LogisticRegression(random_state=1)

In [24]:
# Predict with the model
y_pred = model.predict(X_test_scaled)


In [25]:
# calculate the confusion matrix and accuracy score
cm = confusion_matrix(y_test, y_pred)
acc_score = accuracy_score(y_test, y_pred)

print("Confusion Matrix:\n", cm)
print("Accuracy Score:", acc_score)


Confusion Matrix:
 [[1118   25]
 [  24  247]]
Accuracy Score: 0.9653465346534653


In [26]:
# Scale the Xtraining data and show shape to prove it is now a 1D object
X_train_scaled = np.arange(0,len(X_train),1)
X_train_scaled.shape


(3297,)

In [27]:
# Evaluate the model on the test data
accuracy = model.score(X_test_scaled, y_test)
print("Accuracy:", accuracy)


Accuracy: 0.9653465346534653


In [28]:
# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(cm, 
                     index=["Predicted >4,799", "Predicted <4,799"], 
                     columns=["Actual >4,799", "Actual <4,799"])

cm_df


Unnamed: 0,"Actual >4,799","Actual <4,799"
"Predicted >4,799",1118,25
"Predicted <4,799",24,247


In [29]:
# Evaluate the model on the test data
accuracy = model.score(X_test_scaled, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.9653465346534653
