In [1]:
import pandas as pd
import graphviz
from sklearn.model_selection import train_test_split
from sklearn import tree, metrics

# Read Data
# 1.Read CSV File
raw_data = pd.read_csv('1061-Data-Mining/HW1_Decision_Tree/character-deaths.csv')

# Data Preprocessing
# 2-1.Replace NaN with 0
filled_data = raw_data.fillna(0)
# 2-2.If 'Death Year' has a value set it to 1 and rename the column to 'is_Dead'
mask = filled_data['Death Year'] > 0
is_Dead = 1 * mask
is_Dead.name = 'is_Dead'
# 2-3.Transform 'Allegiances' to dummy variables
final_data = filled_data.join(pd.get_dummies(filled_data['Allegiances'], prefix = 'is'))
# Cleaning Redundant or Unused Attributes
final_data = final_data.drop('Name', 1)
final_data = final_data.drop('Book of Death', 1)
final_data = final_data.drop('Death Chapter', 1)
final_data = final_data.drop('Death Year', 1)
final_data = final_data.drop('Allegiances', 1)
# Spliting Training and Teating Data
trainX, testX, trainY, testY = train_test_split(final_data, is_Dead, test_size = 0.25)

# Build Model
# 3.Build decision tree
clf = tree.DecisionTreeClassifier(max_depth = 8)
clf = clf.fit(trainX, trainY)

# Calculate Scores
# 4.Calculate the precision rate, recall rate, accuracy
testY_predicted = clf.predict(testX)
precision = metrics.precision_score(testY, testY_predicted)
recall = metrics.recall_score(testY, testY_predicted)
accuracy = metrics.accuracy_score(testY, testY_predicted)
print('Precision Rate:', precision * 100, '%')
print('Recall Rate:', recall * 100, '%')
print('Accuracy:', accuracy * 100, '%')

# Data Visualization
# 5.Generate Tree Image
target_names = ['Alive', 'Dead']
result = tree.export_graphviz(clf, out_file = None, feature_names = testX.columns, class_names = target_names, filled = True, rounded = True, special_characters = True)  
graph = graphviz.Source(result)  
graph.render('DT_result', view = True)

Precision Rate: 64.2857142857 %
Recall Rate: 32.5301204819 %
Accuracy: 69.1304347826 %


'DT_result.pdf'

In [2]:
from pyspark import sql
from pyspark.sql.types import *
from pyspark.sql import functions as F
from numpy import array
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.evaluation import MulticlassMetrics

In [3]:
# Read Data
df = spark.read.csv('1061-Data-Mining/HW1_Decision_Tree/character-deaths.csv', header=True)
# Displays the content of the DataFrame to stdout
# df.show()
# Drop Unneeded Columns
df = df.drop('Name', 'Book of Death', 'Death Chapter')
# Fill null value with 0
df = df.fillna({'Death Year': 0, 'Book Intro Chapter': 0, 'Gender': 0, 'Nobility': 0, 'GoT': 0, 'CoK': 0, 'SoS': 0, 'FfC': 0, 'DwD': 0})

In [4]:
# Generate is_Dead columns
isDead_expr = [F.when(F.col('Death Year') > 0, 1).otherwise(0).alias('is_Dead')]
df = df.select('Allegiances', 'Book Intro Chapter', 'Gender', 'Nobility', 'GoT', 'CoK', 'SoS', 'FfC', 'DwD', * isDead_expr)
df.show()

+---------------+------------------+------+--------+---+---+---+---+---+-------+
|    Allegiances|Book Intro Chapter|Gender|Nobility|GoT|CoK|SoS|FfC|DwD|is_Dead|
+---------------+------------------+------+--------+---+---+---+---+---+-------+
|      Lannister|                56|     1|       1|  1|  1|  1|  1|  0|      0|
|           None|                49|     1|       1|  0|  0|  1|  0|  0|      1|
|House Targaryen|                 5|     1|       1|  0|  0|  0|  0|  1|      0|
|  House Greyjoy|                20|     1|       1|  0|  0|  0|  0|  1|      1|
|      Lannister|                 0|     1|       1|  0|  0|  1|  0|  0|      0|
|      Baratheon|                 0|     1|       1|  0|  1|  1|  0|  0|      0|
|  Night's Watch|                21|     1|       1|  1|  0|  1|  1|  0|      1|
|           None|                59|     0|       1|  1|  1|  1|  0|  1|      1|
|  House Greyjoy|                11|     1|       1|  0|  1|  0|  1|  0|      0|
|  Night's Watch|           

In [5]:
# Convert columns types

def convertColumn(df, colNames, newType):
    for name in colNames:
        df = df.withColumn(name, df[name].astype(newType))
    return df


df = convertColumn(df, df.columns[2:], IntegerType())
df = df.withColumn(df.columns[1], df[df.columns[1]].astype(FloatType()))
df.dtypes

[('Allegiances', 'string'),
 ('Book Intro Chapter', 'float'),
 ('Gender', 'int'),
 ('Nobility', 'int'),
 ('GoT', 'int'),
 ('CoK', 'int'),
 ('SoS', 'int'),
 ('FfC', 'int'),
 ('DwD', 'int'),
 ('is_Dead', 'int')]

In [6]:
# Get Dummies

allegiances = df.select('Allegiances').distinct().rdd.flatMap(lambda x: x).collect()
allegiances_expr = [F.when(F.col('Allegiances') == a, 1).otherwise(0).alias('is_' + a) for a in allegiances]
df1 = df.select('Book Intro Chapter', 'Gender', 'Nobility', 'GoT', 'CoK', 'SoS', 'FfC', 'DwD', * allegiances_expr, 'is_Dead')
df1.dtypes

[('Book Intro Chapter', 'float'),
 ('Gender', 'int'),
 ('Nobility', 'int'),
 ('GoT', 'int'),
 ('CoK', 'int'),
 ('SoS', 'int'),
 ('FfC', 'int'),
 ('DwD', 'int'),
 ('is_House Stark', 'int'),
 ('is_None', 'int'),
 ('is_House Baratheon', 'int'),
 ('is_House Tully', 'int'),
 ('is_House Martell', 'int'),
 ('is_Greyjoy', 'int'),
 ('is_Arryn', 'int'),
 ('is_House Tyrell', 'int'),
 ('is_House Greyjoy', 'int'),
 ('is_House Targaryen', 'int'),
 ('is_Baratheon', 'int'),
 ('is_Lannister', 'int'),
 ('is_Targaryen', 'int'),
 ('is_Tully', 'int'),
 ('is_Martell', 'int'),
 ('is_Tyrell', 'int'),
 ('is_House Arryn', 'int'),
 ("is_Night's Watch", 'int'),
 ('is_Stark', 'int'),
 ('is_Wildling', 'int'),
 ('is_House Lannister', 'int'),
 ('is_Dead', 'int')]

In [7]:
# Read Pandas DataFrame
# df3 = spark.createDataFrame(final_data)

[('Book Intro Chapter', 'double'),
 ('Gender', 'bigint'),
 ('Nobility', 'bigint'),
 ('GoT', 'bigint'),
 ('CoK', 'bigint'),
 ('SoS', 'bigint'),
 ('FfC', 'bigint'),
 ('DwD', 'bigint'),
 ('is_Arryn', 'bigint'),
 ('is_Baratheon', 'bigint'),
 ('is_Greyjoy', 'bigint'),
 ('is_House Arryn', 'bigint'),
 ('is_House Baratheon', 'bigint'),
 ('is_House Greyjoy', 'bigint'),
 ('is_House Lannister', 'bigint'),
 ('is_House Martell', 'bigint'),
 ('is_House Stark', 'bigint'),
 ('is_House Targaryen', 'bigint'),
 ('is_House Tully', 'bigint'),
 ('is_House Tyrell', 'bigint'),
 ('is_Lannister', 'bigint'),
 ('is_Martell', 'bigint'),
 ("is_Night's Watch", 'bigint'),
 ('is_None', 'bigint'),
 ('is_Stark', 'bigint'),
 ('is_Targaryen', 'bigint'),
 ('is_Tully', 'bigint'),
 ('is_Tyrell', 'bigint'),
 ('is_Wildling', 'bigint')]

In [24]:
# Split Training & Testing datasets
trainData, testData = df1.randomSplit(weights=[0.75, 0.25])
df1.count() == trainData.count() + testData.count()

True

In [25]:
# Data Parsing
def parseDF(df):
    parsedData = []
    for row in df.collect():
        parsedData.append(LabeledPoint(row[-1], list(row[:-1])))
    return parsedData

train = parseDF(trainData)
test = parseDF(testData)

In [26]:
# Training
DTmodel = DecisionTree.trainClassifier(data = sc.parallelize(train), numClasses = 2, categoricalFeaturesInfo = {}, maxDepth = 8)
print(DTmodel)

DecisionTreeModel classifier of depth 8 with 149 nodes


In [27]:
# Print Out
print(DTmodel.toDebugString())

DecisionTreeModel classifier of depth 8 with 149 nodes
  If (feature 6 <= 0.0)
   If (feature 7 <= 0.0)
    If (feature 9 <= 0.0)
     If (feature 22 <= 0.0)
      If (feature 26 <= 0.0)
       If (feature 20 <= 0.0)
        If (feature 27 <= 0.0)
         If (feature 16 <= 0.0)
          Predict: 1.0
         Else (feature 16 > 0.0)
          Predict: 1.0
        Else (feature 27 > 0.0)
         If (feature 0 <= 30.0)
          Predict: 0.0
         Else (feature 0 > 30.0)
          Predict: 1.0
       Else (feature 20 > 0.0)
        Predict: 1.0
      Else (feature 26 > 0.0)
       If (feature 1 <= 0.0)
        If (feature 0 <= 17.0)
         If (feature 0 <= 8.0)
          Predict: 0.0
         Else (feature 0 > 8.0)
          Predict: 1.0
        Else (feature 0 > 17.0)
         Predict: 0.0
       Else (feature 1 > 0.0)
        If (feature 2 <= 0.0)
         Predict: 1.0
        Else (feature 2 > 0.0)
         If (feature 0 <= 52.0)
          Predict: 1.0
         Else (feature 0 

In [28]:
# Get Predictions
testRDD = sc.parallelize(test)
predictions = DTmodel.predict(testRDD.map(lambda x: x.features))
labels = sc.parallelize(testRDD.map(lambda y: y.label).collect())

In [29]:
# Constructing Confusion Matrix
# Compute raw scores on the test set
# predictionAndLabels = testRDD.map(lambda pl: (DTmodel.predict(pl.features), pl.label))
predictionAndLabels = predictions.zip(labels)
predictionAndLabels.collect()

[(1.0, 0.0),
 (0.0, 0.0),
 (1.0, 0.0),
 (1.0, 1.0),
 (1.0, 1.0),
 (1.0, 1.0),
 (1.0, 1.0),
 (1.0, 1.0),
 (1.0, 0.0),
 (1.0, 0.0),
 (1.0, 0.0),
 (1.0, 1.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (1.0, 1.0),
 (1.0, 0.0),
 (0.0, 0.0),
 (0.0, 1.0),
 (1.0, 1.0),
 (0.0, 0.0),
 (1.0, 1.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (1.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (1.0, 0.0),
 (1.0, 1.0),
 (1.0, 1.0),
 (0.0, 1.0),
 (1.0, 0.0),
 (0.0, 0.0),
 (1.0, 1.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (1.0, 0.0),
 (0.0, 1.0),
 (0.0, 0.0),
 (1.0, 0.0),
 (0.0, 0.0),
 (0.0, 1.0),
 (0.0, 0.0),
 (1.0, 1.0),
 (0.0, 0.0),
 (1.0, 1.0),
 (1.0, 0.0),
 (0.0, 0.0),
 (1.0, 1.0),
 (1.0, 1.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 1.0),
 (0.0, 1.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 1.0),
 (1.0, 1.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (1.0, 1.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (1.0, 0.0),
 (1.0, 1.0),
 (0.0, 0.0),
 (1.0, 1.0),
 (0.0, 0.0),
 (0.0, 0.0),

In [30]:
# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)
# Calculate accuracy, recall & precision
precision = metrics.precision(1.0)
recall = metrics.recall(1.0)
accuracy = metrics.accuracy

In [31]:
print('Precision:\t', precision * 100, '%')
print('Recall:\t\t', recall * 100, '%')
print('Accuracy:\t', accuracy * 100, '%')

Precision:	 52.083333333333336 %
Recall:		 64.1025641025641 %
Accuracy:	 68.51063829787233 %
