In [14]:
# Load Train and Test Data
import pandas as pd
import numpy as np

train_df = pd.read_csv("train-patient.csv")

test_df = pd.read_csv("test-patient.csv")

In [15]:
train_df.replace('NA', 0, inplace=True)
test_df.replace('NA', 0, inplace=True)

In [16]:
# Separate features and label
train_label = train_df['PID_State']
train_features = train_df.drop(['PID_State', 'PID'], axis=1)

test_features = test_df.drop(['PID_State', 'PID'], axis=1)

In [17]:
# Encode label categories
train_label = train_label.apply(lambda x: 0 if x=='P' else (1 if x == 'DO' else 2))
train_label.unique()

array([0, 1, 2])

In [18]:
# Fill empty values with zero
train_features.fillna(0, inplace=True)
test_features.fillna(0, inplace=True)

In [19]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

train_features = scaler.fit_transform(train_features)
test_features = scaler.fit_transform(test_features)

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_features, train_label, test_size=0.20)

In [21]:
X_train.shape

(4000, 79)

In [22]:
# from sklearn.decomposition import PCA

# pca = PCA(.95)
# pca.fit(X_train)
# X_train = pca.transform(X_train)
# X_test = pca.transform(X_test)

# test_features = pca.transform(test_features)

In [23]:
# Try Random Forest
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
from sklearn.svm import SVC

clf = RandomForestClassifier(criterion='entropy')
#clf = KNeighborsClassifier()
#clf = XGBClassifier()
#clf = SVC()
#clf = DecisionTreeClassifier(max_depth=2)
#clf = GradientBoostingClassifier()

clf.fit(X_train[:3000], y_train[:3000])

y_pred = clf.predict(X_test)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid", cv="prefit")
sig_clf.fit(X_train[3000:4000], y_train[3000:4000])

y_pred = sig_clf.predict(X_test)

**Random Forest has given results**

In [24]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.857

In [38]:
# TODO: Import 'GridSearchCV', 'make_scorer', and any other necessary libraries
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer

# TODO: Initialize the classifier
clf = RandomForestClassifier(criterion='entropy')
#clf = GradientBoostingClassifier()

# TODO: Create the parameters list you wish to tune, using a dictionary if needed.
# HINT: parameters = {'parameter_1': [value1, value2], 'parameter_2': [value1, value2]}
parameters = {'n_estimators': [10, 50, 75, 100], 'max_depth': [2, 25, 50, 75, 100]}

# TODO: Make an fbeta_score scoring object using make_scorer()
scorer = make_scorer(accuracy_score)

# TODO: Perform grid search on the classifier using 'scorer' as the scoring method using GridSearchCV()
grid_obj = GridSearchCV(clf, parameters, scorer)

# TODO: Fit the grid search object to the training data and find the optimal parameters using fit()
grid_fit = grid_obj.fit(X_train, y_train)

# Get the estimator
best_clf = grid_fit.best_estimator_
print best_clf
# Make predictions using the unoptimized and model
predictions = (clf.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)

# Report the before-and-afterscores
print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=25, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


Unoptimized model
------
Accuracy score on testing data: 0.8470

Optimized Model
------
Final accuracy score on the testing data: 0.8680


In [104]:
importances = pd.DataFrame({'feature':X_train.columns,'importance':np.round(best_clf.feature_importances_,3)})
importances = importances.sort_values('importance',ascending=False).set_index('feature')
importances.index.values

array(['var8', 'var9', 'var29', 'var10', 'var42', 'var35', 'var6',
       'var13', 'var11', 'var31', 'var5', 'var46', 'var37', 'var38',
       'var45', 'var47', 'var33', 'var40', 'var44', 'var39', 'var48',
       'var43', 'var50', 'var4', 'var16', 'var32', 'var23', 'var30',
       'var22', 'var21', 'var78', 'var57', 'var26', 'var77', 'var2',
       'var20', 'var58', 'var34', 'var59', 'var62', 'var53', 'var60',
       'var52', 'var61', 'var79', 'var28', 'var36', 'var41', 'var54',
       'var75', 'var12', 'var56', 'var55', 'var63', 'var27', 'var24',
       'var65', 'var7', 'var76', 'var74', 'var69', 'var68', 'var25',
       'var67', 'var64', 'var51', 'var18', 'var19', 'var1', 'var66',
       'var14', 'var15', 'var70', 'var71', 'var72', 'var73', 'var17',
       'var49', 'var3'], dtype=object)

In [115]:
x = importances.index.values
y = x[:30].tolist()

In [116]:
# Import functionality for cloning a model
from sklearn.base import clone

# Reduce the feature space
X_train_reduced = X_train[y]
X_test_reduced = X_test[y]

# Train on the "best" model found from grid search earlier
clf = (clone(best_clf)).fit(X_train_reduced, y_train)

# Make new predictions
reduced_predictions = clf.predict(X_test_reduced)

# Report scores from the final model using both versions of data
print("Final Model trained on full data\n------")
print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("\nFinal Model trained on reduced data\n------")
print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, reduced_predictions)))

Final Model trained on full data
------
Accuracy on testing data: 0.8850

Final Model trained on reduced data
------
Accuracy on testing data: 0.8820


In [39]:
predictions = best_clf.predict(test_features)

ValueError: Number of features of the model must match the input. Model n_features is 38 and input n_features is 79 

In [26]:
converter = lambda x: str('P') if x==0 else (str('D0') if x == 1 else str('DS'))
vfunc = np.vectorize(converter, otypes=[object])
predictions_conv = vfunc(predictions) 
predictions_conv

array(['D0', 'D0', 'P', ..., 'D0', 'D0', 'D0'], dtype=object)

In [27]:
np.savetxt("output.csv", predictions_conv, delimiter=",", fmt='%s')