In [43]:
import numpy as np
import pandas as pd
from ML.HelperFunctionsMLClass import HelperFunctionsML
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

### Read the dataset using pandas

In [44]:
temp = pd.read_csv("../tests/criminal_train.csv")

In [45]:
temp.AIIND102[-10:] = np.nan # setting some values to null

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [46]:
temp.columns[temp.isnull().sum()>0] 

Index(['AIIND102'], dtype='object')

In [5]:
temp.IFATHER = temp.IFATHER.astype("str")
temp.IFATHER[-10:] = None # setting some values to null

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Create an object of the class HelperFunctionsML 

In [6]:
obj = HelperFunctionsML(temp)

#### Alternatively
you can create the object like this 

```obj = HelperFunctionsML(pd.read_csv("../tests/criminal_train.csv"))```

In [49]:
obj.__doc__

'Helper functions for Machine Learning and EDA'

In [50]:
obj.nrows

45718

In [51]:
obj.ncols

72

In [52]:
obj.check_has_na_values()

True

In [53]:
obj.list_of_na_cols(dataset= temp)

['IFATHER', 'AIIND102']

In [54]:
obj.impute_categorical_cols()
obj.check_has_na_values()

True

In [55]:
obj.impute_numeric_cols()
obj.check_has_na_values()

False

In [56]:
obj.impute_numeric_cols()
obj.check_has_na_values()

False

In [14]:
obj.impute_numeric_cols()
obj.check_has_na_values()

False

In [15]:
obj.create_train_test_split()

target not set, call the function set_target with the name of the target column


In [16]:
obj.set_target("NRCH17_2")

In [17]:
X_train, X_test, y_train, y_test = obj.create_train_test_split(return_frames=True)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [18]:
obj.target

'NRCH17_2'

In [19]:
obj.nrows

45718

In [20]:
obj.nrowvalidation

13716

In [21]:
X_train.shape

(32002, 71)

In [22]:
obj.ncolstrain

71

In [23]:
obj.ncolsvalidation

71

In [24]:
obj.target

'NRCH17_2'

In [25]:
obj.set_target_type(col_type = "str")

In [26]:
log_reg = LogisticRegression()
dtree = DecisionTreeClassifier()

In [27]:
model_performance = obj.apply_model_predict_validate(model_obj = log_reg, feature_names=obj.cat_num_extract()["num_cols"])

  y = column_or_1d(y, warn=True)


In [28]:
print(model_performance)

{'precision': None, 'f1_score': None, 'recall': 0.2, 'accuracy': 0.7308982210557013, 'model_obj': 'LogisticRegression'}


In [29]:
model_performance = obj.apply_model_predict_validate(dtree, feature_names=obj.cat_num_extract()["num_cols"])

In [30]:
print(model_performance)

{'precision': None, 'f1_score': None, 'recall': 0.7488298799286209, 'accuracy': 0.8859725867599884, 'model_obj': 'DecisionTreeClassifier'}


In [31]:
rf_model = RandomForestClassifier()

In [32]:
model_performance  = obj.apply_model_predict_validate(rf_model, feature_names=obj.cat_num_extract()["num_cols"])



In [33]:
print(model_performance)

{'precision': None, 'f1_score': None, 'recall': 0.7609031883623486, 'accuracy': 0.9198016914552347, 'model_obj': 'RandomForestClassifier'}


#### Some basic models can be applied with a simple function call, well suited for baseline models

In [34]:
model_performance = obj.apply_log_reg(feature_names=obj.cat_num_extract()["num_cols"])

  y = column_or_1d(y, warn=True)


In [35]:
print(model_performance)

{'precision': None, 'f1_score': None, 'recall': 0.2, 'accuracy': 0.7308982210557013, 'model_obj': 'LogisticRegression'}


In [36]:
model_performance = obj.apply_dtree_class(feature_names=obj.cat_num_extract()["num_cols"])

In [37]:
print(model_performance)

{'precision': None, 'f1_score': None, 'recall': 0.7558038849319666, 'accuracy': 0.8852435112277632, 'model_obj': 'DecisionTreeClassifier'}


### Compare the performance of different models

In [38]:
df = obj.compare_model_performance(model_objs_list= [log_reg, dtree],feature_names=obj.cat_num_extract()["num_cols"])

  y = column_or_1d(y, warn=True)


In [39]:
df

Unnamed: 0,precision,f1_score,recall,accuracy,model_obj
0,,,0.2,0.730898,LogisticRegression
1,,,0.765163,0.885681,DecisionTreeClassifier


#### Optionally, you can pass a list of names for the models

In [40]:
df = obj.compare_model_performance(model_objs_list= [log_reg, dtree],model_names_list=["Baseline Logistic Regression", "Baseline Decision Tree"],
                                   feature_names=obj.cat_num_extract()["num_cols"])

  y = column_or_1d(y, warn=True)


In [41]:
df

Unnamed: 0,precision,f1_score,recall,accuracy,model_obj
0,,,0.2,0.730898,Baseline Logistic Regression
1,,,0.773564,0.885316,Baseline Decision Tree


In [42]:
df.dtypes

precision     object
f1_score      object
recall       float64
accuracy     float64
model_obj     object
dtype: object