In [1]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("CloudETL").config("spark.driver.extraClassPath","/content/postgresql-42.2.9.jar").getOrCreate()

from pyspark import SparkFiles
url="https://qos-bucket.s3.us-east-2.amazonaws.com/credit_default.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("credit_default.csv"), sep=",", header=True, inferSchema=True)

df = df.select("*").toPandas()


df = df.dropna()
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [0]:
#testing with limited features based on random forest feature importance.
X = df.drop(['default payment next month', 'ID', 'SEX', 'MARRIAGE', 'PAY_6', 'EDUCATION', 'PAY_5', 'PAY_4', 'PAY_3'], axis=1)
y = df["default payment next month"]

In [0]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [4]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
print(f'CLF test score: {clf.score(X_test, y_test)}')

CLF test score: 0.7305333333333334


In [0]:
#scaling data
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train.values.reshape(-1, 1))
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [6]:
#testing with scaled data
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train_scaled, y_train)
print(f'CLF scaled test score: {clf.score(X_test_scaled, y_test)}')

CLF scaled test score: 0.7349333333333333


In [7]:
import pandas as pd
classifier = ['Random Forest', 'ERT', 'Log-Reg', 'KNN', 'Decision Tree']
score = [0.8151, 0.8109, 0.7767, 0.7736, 0.7349]

pd.DataFrame({
  'Classifier': classifier,
  'Score': score
}).groupby(['Classifier']).mean().sort_values(by='Score', ascending=False)

Unnamed: 0_level_0,Score
Classifier,Unnamed: 1_level_1
Random Forest,0.8151
ERT,0.8109
Log-Reg,0.7767
KNN,0.7736
Decision Tree,0.7349
