In [53]:
# Machine Learning Assingment 2: Build a classifier
# Author: C17433026 Philip Toolan
# Date: 14/12/2020

import pandas as pd
import numpy as np
import csv
from pandas import DataFrame
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn import tree
from sklearn import model_selection
from numpy import asarray

#Load in the training data
columnHeadings=['id', 'age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaing', 'pdays', 'previous', 'poutcome', 'y']
trainingData = pd.read_csv("trainingset.txt", header=None,names=columnHeadings, delimiter=",")

#The target feature has to be passed in as a seperate parameter
targetFeature = trainingData['y']

#Extract numerical features to allow for some preprocessing 
numericFeatures = ['age','balance','day','duration','campaing','pdays','previous']
numericArray = trainingData[numericFeatures]

#Drop the numerical features, 'id' as this doesnt help with any predictions, and 'y' as it is the target feature
categoricalArray = trainingData.drop(numericFeatures + ['id','y'],axis=1)

#Replace missing values with NA
categoricalArray.replace('?','NA')
categoricalArray.fillna( 'NA', inplace = True )

#Transpose into array of dictionaries
categoricalArray = categoricalArray.T.to_dict().values()
#Convert to numeric encoding
vectorizer = DictVectorizer( sparse = False )
vec_categoricalArray = vectorizer.fit_transform(categoricalArray) 
encoding_dictionary = vectorizer.vocabulary_
#Execute mapping
for k in sorted(encoding_dictionary.keys()):
    mapping = k + " : column " + str(encoding_dictionary[k]) + " = 1"

#Merge the categorical data back with the numerical data
trainArray = np.hstack((numericArray.values, vec_categoricalArray ))

#Create decision tree using entropy
decTreeModel = tree.DecisionTreeClassifier(criterion='entropy')
#Fit the model using the numeric representations of the training model
decTreeModel.fit(trainArray, targetFeature)

#Read in the queries.txt file to do the queries
testingData = pd.read_csv("queries.txt",header=None,names=columnHeadings, delimiter=",")
testingDataDF = pd.DataFrame.from_dict(testingData,orient="columns")

#Extract the numerical features again
testingData_num = testingDataDF[numericFeatures].values 

#Convert the categorical features
testingData_cat = testingDataDF.drop(numericFeatures,axis=1)
testingData_categoricalArray = testingData_cat.T.to_dict().values()
testingData_vec_dfs = vectorizer.transform(testingData_categoricalArray) 

#Merge categorical features with numercial features again
testingDataQuery = np.hstack((testingData_num, testingData_vec_dfs ))

#Assign the predictions made to a list of predictions
predictionList = []
#2703 as this is the number of tests in the file
for i in range(2703):
    predictions = decTreeModel.predict([testingDataQuery[i]])
    #testingData.loc[i, 'y'] = targetFeature[predictions[i]]
    predictionList.append(predictions)

# attempt to get rid of raw predictions file    
# for index, row in testingData.iterrows():
#     predictions = decTreeModel.predict([testingDataQuery[index]])
#     predictionstring = predictions.tostring()
#     testingData.loc[index, 'y'] = y_category[predictionstring[index]]

#Add list of predictions to the y column in the testing data frame
testingData['y'] = predictionList
#Outout final result
testingData[['id','y']].to_csv("rawPredictions.txt", index = False, header = False)
   
#Replace [''] in raw predictions file and output predictions.txt
with open('rawPredictions.txt', 'r') as infile, \
     open('predictions.txt', 'w') as outfile:
    data = infile.read()
    data = data.replace("['", '"').replace("']", '"')
    outfile.write(data)


print('Predictions finished!')
testingData.tail()

Predictions finished!


Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaing,pdays,previous,poutcome,y
2698,TEST2699,28,JobCat6,single,tertiary,no,297,no,no,cellular,8,nov,0,1,-1,0,unknown,[TypeA]
2699,TEST2700,61,JobCat1,married,primary,no,182,no,no,cellular,9,nov,0,1,92,13,failure,[TypeA]
2700,TEST2701,31,JobCat11,single,secondary,no,690,no,no,cellular,10,nov,0,3,555,16,failure,[TypeA]
2701,TEST2702,27,JobCat6,single,tertiary,no,765,no,no,cellular,16,nov,0,1,-1,0,unknown,[TypeB]
2702,TEST2703,35,JobCat4,married,secondary,no,2971,no,no,cellular,17,nov,0,2,188,11,other,[TypeB]
