In [43]:
import numpy as np
import pandas as pd
from ML.HelperFunctionsMLClass import HelperFunctionsML
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

### Read the dataset using pandas

In [44]:
temp = pd.read_csv("../tests/criminal_train.csv")

In [45]:
temp.AIIND102[-10:] = np.nan # setting some values to null

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [46]:
temp.columns[temp.isnull().sum()>0] 

Index(['AIIND102'], dtype='object')

In [47]:
temp.IFATHER = temp.IFATHER.astype("str")
temp.IFATHER[-10:] = None # setting some values to null

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Create an object of the class HelperFunctionsML 

In [48]:
obj = HelperFunctionsML(temp)

#### Alternatively
you can create the object like this 

```obj = HelperFunctionsML(pd.read_csv("../tests/criminal_train.csv"))```

In [49]:
obj.__doc__

'Helper functions for Machine Learning and EDA'

In [50]:
obj.nrows

45718

In [51]:
obj.ncols

72

In [52]:
obj.check_has_na_values()

True

In [53]:
obj.list_of_na_cols(dataset= temp)

['IFATHER', 'AIIND102']

In [54]:
obj.impute_categorical_cols()
obj.check_has_na_values()

True

In [55]:
obj.impute_numeric_cols()
obj.check_has_na_values()

False

In [56]:
obj.create_train_test_split()

target not set, call the function set_target with the name of the target column


In [57]:
obj.set_target("NRCH17_2")

In [58]:
X_train, X_test, y_train, y_test = obj.create_train_test_split(return_frames=True)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [59]:
obj.target

'NRCH17_2'

In [60]:
obj.nrows

45718

In [61]:
obj.nrowvalidation

13716

In [62]:
X_train.shape

(32002, 71)

In [63]:
obj.ncolstrain

71

In [64]:
obj.ncolsvalidation

71

In [65]:
obj.target

'NRCH17_2'

In [66]:
obj.set_target_type(col_type = "str")

In [67]:
log_reg = LogisticRegression()
dtree = DecisionTreeClassifier()

In [68]:
model_performance = obj.apply_model_predict_validate(model_obj = log_reg, feature_names=obj.cat_num_extract()["num_cols"])

  y = column_or_1d(y, warn=True)


In [69]:
print(model_performance)

{'precision': None, 'f1_score': None, 'recall': 0.2, 'accuracy': 0.7308982210557013, 'model_obj': 'LogisticRegression'}


In [70]:
model_performance = obj.apply_model_predict_validate(dtree, feature_names=obj.cat_num_extract()["num_cols"])

In [71]:
print(model_performance)

{'precision': None, 'f1_score': None, 'recall': 0.7677950656971821, 'accuracy': 0.8863371245261009, 'model_obj': 'DecisionTreeClassifier'}


In [72]:
rf_model = RandomForestClassifier()

In [73]:
model_performance  = obj.apply_model_predict_validate(rf_model, feature_names=obj.cat_num_extract()["num_cols"])



KeyboardInterrupt: 

In [None]:
print(model_performance)

#### Some basic models can be applied with a simple function call, well suited for baseline models

In [None]:
model_performance = obj.apply_log_reg(feature_names=obj.cat_num_extract()["num_cols"])

In [None]:
print(model_performance)

In [None]:
model_performance = obj.apply_dtree_class(feature_names=obj.cat_num_extract()["num_cols"])

In [None]:
print(model_performance)

### Compare the performance of different models

In [74]:
df = obj.compare_model_performance(model_objs_list= [log_reg, dtree],feature_names=obj.cat_num_extract()["num_cols"])

  y = column_or_1d(y, warn=True)


In [76]:
df

Unnamed: 0,precision,f1_score,recall,accuracy,model_obj
0,,,0.2,0.730898,LogisticRegression
1,,,0.754116,0.885025,DecisionTreeClassifier


In [77]:
obj.dataset.head()

Unnamed: 0,PERID,NRCH17_2,IRHHSIZ2,IIHHSIZ2,IRKI17_2,IIKI17_2,IRHH65_2,IIHH65_2,PRXRETRY,PRXYDATA,...,TOOLONG,TROUBUND,PDEN10,COUTYP2,MAIIN102,AIIND102,ANALWT_C,VESTR,VEREP,Criminal
0,25095143,2,4,1,3,1,1,1,99,99,...,1,2,1,1,2,2.0,3884.805998,40026,1,0
1,13005143,1,3,1,2,1,1,1,99,99,...,2,2,2,3,2,2.0,1627.108106,40015,2,1
2,67415143,1,2,1,2,1,1,1,99,99,...,2,2,2,3,2,2.0,4344.95798,40024,1,0
3,70925143,0,2,1,1,1,1,1,99,99,...,2,2,1,1,2,2.0,792.521931,40027,1,0
4,75235143,0,6,1,4,1,1,1,99,1,...,2,2,2,2,2,2.0,1518.118526,40001,2,0


### Scale the data

In [78]:
obj.setup_scaler_numeric_data()

obj.scale_numeric_data()
df = obj.compare_model_performance(model_objs_list= [log_reg, dtree],feature_names=obj.cat_num_extract()["num_cols"])
df

  y = column_or_1d(y, warn=True)


Unnamed: 0,precision,f1_score,recall,accuracy,model_obj
0,,,0.2,0.730898,LogisticRegression
1,,,0.775075,0.88641,DecisionTreeClassifier


In [79]:
obj.dataset.head()

Unnamed: 0,PERID,NRCH17_2,IRHHSIZ2,IIHHSIZ2,IRKI17_2,IIKI17_2,IRHH65_2,IIHH65_2,PRXRETRY,PRXYDATA,...,TOOLONG,TROUBUND,PDEN10,COUTYP2,MAIIN102,AIIND102,ANALWT_C,VESTR,VEREP,Criminal
0,-1.149595,2,0.401866,-0.027826,0.830368,-0.060384,-0.346689,-0.075279,0.129911,0.634535,...,-0.230263,-0.044382,-1.044857,-0.991267,0.145762,0.146465,-0.14112,0.008527,-0.987264,-0.273186
1,-1.622992,1,-0.298706,-0.027826,-0.07627,-0.060384,-0.346689,-0.075279,0.129911,0.634535,...,-0.041432,-0.044382,0.572231,1.601413,0.145762,0.146465,-0.535505,-0.032961,1.011838,3.660515
2,0.507489,1,-0.999279,-0.027826,-0.07627,-0.060384,-0.346689,-0.075279,0.129911,0.634535,...,-0.041432,-0.044382,0.572231,1.601413,0.145762,0.146465,-0.060738,0.000984,-0.987264,-0.273186
3,0.644927,0,-0.999279,-0.027826,-0.982909,-0.060384,-0.346689,-0.075279,0.129911,0.634535,...,-0.041432,-0.044382,-1.044857,-0.991267,0.145762,0.146465,-0.681295,0.012299,-0.987264,-0.273186
4,0.81369,0,1.80301,-0.027826,1.737007,-0.060384,-0.346689,-0.075279,0.129911,-1.576397,...,-0.041432,-0.044382,0.572231,0.305073,0.145762,0.146465,-0.554544,-0.085763,1.011838,-0.273186


### Optionally, you can pass a list of names for the models

In [80]:
df = obj.compare_model_performance(model_objs_list= [log_reg, dtree],model_names_list=["Baseline Logistic Regression", "Baseline Decision Tree"],
                                   feature_names=obj.cat_num_extract()["num_cols"])

  y = column_or_1d(y, warn=True)


In [81]:
df

Unnamed: 0,precision,f1_score,recall,accuracy,model_obj
0,,,0.2,0.730898,Baseline Logistic Regression
1,,,0.747865,0.886483,Baseline Decision Tree


In [82]:
df.dtypes

precision     object
f1_score      object
recall       float64
accuracy     float64
model_obj     object
dtype: object