In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder , FunctionTransformer
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import plot_confusion_matrix, recall_score, accuracy_score, precision_score, f1_score

In [2]:
df = pd.read_csv("data/data_clean.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,funder,gps_height,installer,wpt_name,basin,region_code,district_code,population,public_meeting,...,extraction_type_class,management,payment_type,quality_group,quantity,source,source_class,waterpoint_type,status_group,construction_year_bins
0,0,Roman,1390,Roman,none,Lake Nyasa,11,5,109,True,...,gravity,vwc,annually,good,enough,spring,groundwater,communal standpipe,functional,"(1990.0, 2000.0]"
1,1,Grumeti,1399,GRUMETI,Zahanati,Lake Victoria,20,2,280,,...,gravity,wug,never pay,good,insufficient,rainwater harvesting,surface,communal standpipe,functional,"(2000.0, 2010.0]"
2,2,Lottery Club,686,World vision,Kwa Mahundi,Pangani,21,4,250,True,...,gravity,vwc,per bucket,good,enough,dam,surface,communal standpipe multiple,functional,"(2000.0, 2010.0]"
3,3,Unicef,263,UNICEF,Zahanati Ya Nanyumbu,Ruvuma / Southern Coast,90,63,58,True,...,submersible,vwc,never pay,good,dry,machine dbh,groundwater,communal standpipe multiple,non functional,"(1980.0, 1990.0]"
4,4,Action In A,0,Artisan,Shuleni,Lake Victoria,18,1,0,True,...,gravity,other,never pay,good,seasonal,rainwater harvesting,surface,communal standpipe,functional,"(1980.0, 1990.0]"


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Unnamed: 0              59400 non-null  int64 
 1   funder                  55765 non-null  object
 2   gps_height              59400 non-null  int64 
 3   installer               55745 non-null  object
 4   wpt_name                59400 non-null  object
 5   basin                   59400 non-null  object
 6   region_code             59400 non-null  int64 
 7   district_code           59400 non-null  int64 
 8   population              59400 non-null  int64 
 9   public_meeting          56066 non-null  object
 10  permit                  56344 non-null  object
 11  extraction_type_class   59400 non-null  object
 12  management              59400 non-null  object
 13  payment_type            59400 non-null  object
 14  quality_group           59400 non-null  object
 15  qu

In [4]:
df = df.drop("Unnamed: 0",axis=1)

In [5]:
def int_to_object(df):
    df["region_code"] = df["region_code"].astype(object)
    df["district_code"] = df["district_code"].astype(object)
    return df.dtypes

In [6]:
int_to_object(df)

funder                    object
gps_height                 int64
installer                 object
wpt_name                  object
basin                     object
region_code               object
district_code             object
population                 int64
public_meeting            object
permit                    object
extraction_type_class     object
management                object
payment_type              object
quality_group             object
quantity                  object
source                    object
source_class              object
waterpoint_type           object
status_group              object
construction_year_bins    object
dtype: object

In [7]:
X = df.drop('status_group', axis=1)
y = df['status_group']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [9]:
def grab_numeric(df):
    return df.select_dtypes(include=['int64'])

In [10]:
GrabNumeric = FunctionTransformer(grab_numeric)

In [11]:
pipe= Pipeline(steps=[("num", GrabNumeric),
                      ( "ss", StandardScaler())])

In [12]:
pipe.fit(X_train)

Pipeline(steps=[('num',
                 FunctionTransformer(func=<function grab_numeric at 0x124db60d0>)),
                ('ss', StandardScaler())])

In [13]:
pipe.transform(X_train)

array([[-0.48786917, -0.20973197],
       [-0.96404191, -0.37956688],
       [-0.96404191, -0.37956688],
       ...,
       [-0.98280023,  1.74336955],
       [-0.96404191, -0.37956688],
       [ 0.89736062, -0.37744395]])

In [14]:
j = -1
for i in X_train.columns:
    j+=1
    print("Index Number", j,i)

Index Number 0 funder
Index Number 1 gps_height
Index Number 2 installer
Index Number 3 wpt_name
Index Number 4 basin
Index Number 5 region_code
Index Number 6 district_code
Index Number 7 population
Index Number 8 public_meeting
Index Number 9 permit
Index Number 10 extraction_type_class
Index Number 11 management
Index Number 12 payment_type
Index Number 13 quality_group
Index Number 14 quantity
Index Number 15 source
Index Number 16 source_class
Index Number 17 waterpoint_type
Index Number 18 construction_year_bins


In [15]:
# We'll throw these mini-pipelines into our ColumnTransformer.

subpipe_num = Pipeline(steps=[('num_impute', SimpleImputer(strategy="median")),
                           ('ss', StandardScaler())])
subpipe_cat = Pipeline(steps=[('cat_impute', SimpleImputer(strategy='most_frequent')),
                             ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))])

In [16]:
CT = ColumnTransformer(transformers=[('subpipe_num', subpipe_num, [1,7]),
                                         ('subpipe_cat', subpipe_cat, [4,5,6,8,9,11,12,13,14,15,16,17,18])])

In [17]:
dt_model_pipe = Pipeline(steps=([("ct", CT), 
                                    ("dt", DecisionTreeClassifier(random_state = 0))]))
dt_model_pipe.fit(X_train, y_train)

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  [1, 7]),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_impute',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                          

In [19]:
dt_grid = [{'dt__max_depth': [2, 4, 6], 
         'dt__min_samples_leaf': [20, 25]}]


gridsearch = GridSearchCV(estimator=dt_model_pipe, 
                          param_grid=dt_grid, 
                          scoring='accuracy', 
                          cv=5)

gridsearch.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('num_impute',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('ss',
                                                                                          StandardScaler())]),
                                                                         [1,
                                                                          7]),
                                                                        ('subpipe_cat',
                                                                         Pipeline(steps=[('cat_impute',
                                                               

In [20]:
gridsearch.score(X_test, y_test)

0.7144781144781145

In [21]:
gridsearch.best_params_

{'dt__max_depth': 6, 'dt__min_samples_leaf': 20}

In [22]:
gridsearch.cv_results_['mean_test_score']

array([0.6935578 , 0.6935578 , 0.70181818, 0.70181818, 0.71548822,
       0.71548822])

In [25]:
df['top_funders'] = pd.DataFrame(df['funder'].value_counts().sort_values(ascending=False).iloc[:6])

In [26]:
df['top_funders'].value_counts()

Series([], Name: top_funders, dtype: int64)

In [34]:
df[df['funder'].value_counts() >= '1349']

TypeError: Invalid comparison between dtype=int64 and str

In [None]:
if funders in df['top_funders'].values