In [1]:
#Importing libraries

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
df = pd.read_csv("/home/sarah/Phase3/Phase3 Project/Data/data-tz.csv", index_col=0)

In [3]:
df

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,3/14/2011,Roman,1390,Roman,34.938093,-9.856322,none,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,3/6/2013,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,34310,25.0,2/25/2013,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,1/28/2013,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,19728,0.0,7/13/2011,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59395,60739,10.0,5/3/2013,Germany Republi,1210,CES,37.169807,-3.253847,Area Three Namba 27,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
59396,27263,4700.0,5/7/2011,Cefa-njombe,1212,Cefa,35.249991,-9.070629,Kwa Yahona Kuvala,0,...,soft,good,enough,enough,river,river/lake,surface,communal standpipe,communal standpipe,functional
59397,37057,0.0,4/11/2011,not_provided,0,Not_provided,34.017087,-8.750434,Mashine,0,...,fluoride,fluoride,enough,enough,machine dbh,borehole,groundwater,hand pump,hand pump,functional
59398,31282,0.0,3/8/2011,Malec,0,Musa,35.861315,-6.378573,Mshoro,0,...,soft,good,insufficient,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump,functional


In [4]:
#Dropping columns  that are irrelavant
data1= df.drop(columns  = ['id','amount_tsh',  'num_private', 'date_recorded', 
          'quantity_group', 'recorded_by','payment_type', 
          'waterpoint_type_group',  'subvillage', 'wpt_name',
            'funder', 'installer', 'ward'], axis = 1)
data1.head()

Unnamed: 0,gps_height,longitude,latitude,basin,region,region_code,district_code,lga,population,public_meeting,...,management_group,payment,water_quality,quality_group,quantity,source,source_type,source_class,waterpoint_type,status_group
0,1390,34.938093,-9.856322,Lake Nyasa,Iringa,11,5,Ludewa,109,True,...,user-group,pay annually,soft,good,enough,spring,spring,groundwater,communal standpipe,functional
1,1399,34.698766,-2.147466,Lake Victoria,Mara,20,2,Serengeti,280,False,...,user-group,never pay,soft,good,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,functional
2,686,37.460664,-3.821329,Pangani,Manyara,21,4,Simanjiro,250,True,...,user-group,pay per bucket,soft,good,enough,dam,dam,surface,communal standpipe multiple,functional
3,263,38.486161,-11.155298,Ruvuma / Southern Coast,Mtwara,90,63,Nanyumbu,58,True,...,user-group,never pay,soft,good,dry,machine dbh,borehole,groundwater,communal standpipe multiple,non functional
4,0,31.130847,-1.825359,Lake Victoria,Kagera,18,1,Karagwe,0,True,...,other,never pay,soft,good,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,functional


### Convert Target variable to Numerical variable

In [5]:
target_variable = {'functional':0, 
                   'non functional': 2, 
                   'functional needs repair': 1} 
data1['status_group'] = data1['status_group'].replace(target_variable)

In [6]:
# look at the values count in status group
data1.status_group.value_counts()

0    32054
2    22659
1     4316
Name: status_group, dtype: int64

### Converting boolean of pubblic meeting and permit to 0 and 1

In [7]:
#convert True/False in public meeting column to 0-1
data1['public_meeting'] = data1['public_meeting'].astype(bool).astype(int)
data1['permit'] = data1['permit'].astype(bool).astype(int)

### Feature Set

In [8]:
# separating target feature from data
X =data1.drop(columns = ['status_group'])
y =data1['status_group']

In [9]:
# using one hot encoder convert categorical data to numerical
X = pd.get_dummies(X)
X.head()

Unnamed: 0,gps_height,longitude,latitude,region_code,district_code,population,public_meeting,permit,construction_year,basin_Internal,...,source_class_groundwater,source_class_surface,source_class_unknown,waterpoint_type_cattle trough,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other
0,1390,34.938093,-9.856322,11,5,109,1,0,1999,0,...,1,0,0,0,1,0,0,0,0,0
1,1399,34.698766,-2.147466,20,2,280,0,1,2010,0,...,0,1,0,0,1,0,0,0,0,0
2,686,37.460664,-3.821329,21,4,250,1,1,2009,0,...,0,1,0,0,0,1,0,0,0,0
3,263,38.486161,-11.155298,90,63,58,1,1,1986,0,...,1,0,0,0,0,1,0,0,0,0
4,0,31.130847,-1.825359,18,1,0,1,1,0,0,...,0,1,0,0,1,0,0,0,0,0


In [10]:
#Normalize the data
X = preprocessing.StandardScaler().fit(X).transform(X.astype(float))

#### Train-Test Split data

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
print('Training set: ', X_train.shape, y_train.shape)
print('Testing set: ', X_test.shape, y_test.shape)

Training set:  (44271, 283) (44271,)
Testing set:  (14758, 283) (14758,)


#### Training the Model


In [12]:
with_k2 = DecisionTreeClassifier(criterion='entropy', max_depth=4).fit(X_train, y_train)
with_k2

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=4, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

#### Predicting

In [13]:
y_pred = with_k2.predict(X_test)

#### Evaluation of Accuracy

In [14]:
print('Train set Accuracy: ', metrics.accuracy_score(y_train, with_k2.predict(X_train)))
print('Test set Accuracy: ', metrics.accuracy_score(y_test, y_pred))

Train set Accuracy:  0.7031691174809694
Test set Accuracy:  0.6939287166282694


### Decision Tree Visualization

In [15]:
#pip install pydotplus

In [16]:
import pydotplus 
from sklearn.externals.six import StringIO 
#from sklearn.externals.six import StringIO
from sklearn import tree
from sklearn.metrics import confusion_matrix



In [17]:
arr = X.to_numpy()

AttributeError: 'numpy.ndarray' object has no attribute 'to_numpy'

In [18]:
dot_data = StringIO()
filename = 'pumptree.png'
featureNames = X.columns[0:196]
targetNames = y
out = tree.export_graphviz(with_k2, feature_names=featureNames, 
                           out_file=dot_data, 
                           class_names=np.unique(y_train), 
                           filled=True, 
                           special_characters=True, 
                           rotate=False)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png(filename)
img = mpimg.imread(filename)
plt.figure(figsize=(100, 200))
plt.imshow(img,interpolation='nearest')

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

### Random Forest Classifier¶


In [19]:
#Random Forest
RFC = RandomForestClassifier(criterion='entropy', max_depth=None).fit(X_train, y_train)

In [20]:
#Predicting X Test
yhatRF = RFC.predict(X_test)

In [21]:
#Checking for Accuracy of the model
print('Train set Accuracy: ', metrics.accuracy_score(y_train, RFC.predict(X_train)))
print('Test set Accuracy: ', metrics.accuracy_score(y_test, yhatRF))

Train set Accuracy:  0.9951661358451357
Test set Accuracy:  0.7941455481772598


### Grid Search cross validation

In [None]:
%%time
RFC_params = {'n_estimators':[2,5,10,20,50,75,150],
              'criterion':['gini', 'entropy'],
              'max_depth':[2,5,10,20,50,None],
              'min_samples_split':[2,5,10,20]}


grid_RFC = GridSearchCV(RandomForestClassifier(), 
                        RFC_params, 
                        cv=5, 
                        scoring='accuracy').fit(X_train, y_train)