In [13]:
import os, glob, math, json
import pandas as pd 
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import ensemble
from sklearn import metrics
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from collections import OrderedDict

In [2]:
## one-hot-encoding of categorical features
def encode_onehot(df, cols):
    """
    One-hot encoding is applied to columns specified in a pandas DataFrame.
    """
    vec = DictVectorizer()
    vec_data = pd.DataFrame(vec.fit_transform(df[cols].to_dict(orient='records')).toarray())
    vec_data.columns = vec.get_feature_names()
    vec_data.index = df.index
    
    df = df.drop(cols, axis=1)
    df = df.join(vec_data)
    return df

In [3]:
#projectdir = os.path.realpath(__file__).split('src')[0]
projectDir = "../../"
solDir = os.path.join(projectDir, "solution")
dataDir = os.path.join(projectDir, "data")
assert os.path.exists(solDir)
assert os.path.exists(dataDir)

In [4]:
print('loading in training data ....')

loading in training data ....


In [5]:
trainData = pd.read_csv(os.path.join(dataDir, "trainData.csv.gz"), sep=',', compression='gzip', index_col=0)
trainTargets = pd.read_csv(os.path.join(dataDir, "trainTargets.csv.gz"), sep=',', compression='gzip', index_col=0)

In [6]:
# Re-assign training data and targets for scikit-learn
X_train = trainData
y_train = trainTargets

In [7]:
# add word-length feature
print('adding derived word length feature ....')
X_train['word_length'] = X_train.apply(lambda row: len(row['Word']),axis=1)
# quick and dirty
X_train.drop(['Word'],axis=1, inplace=True)
# One-Hot Encode Categorical Variables
# simple categorical column detection
cat_cols = []
for index,val in X_train.tail(1).iteritems():
    if isinstance(val.values[0],str): # simple categorical feature detection
        cat_cols.append(index)
# one-hot encode
X_train = encode_onehot(X_train, cat_cols)

adding derived word length feature ....


In [8]:
print('training model on train data ...')
print('using 10-fold CV...')
cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
parameters = {'n_estimators':[100, 200, 300]}
rf = GridSearchCV(estimator=ensemble.RandomForestClassifier(class_weight="balanced"), 
                       param_grid=parameters, 
                       scoring='f1_macro', 
                       cv=cv)
# rf = ensemble.RandomForestClassifier(n_estimators=100)
rf.fit(X_train,y_train['Level.Teachers.Average'])
# print(list(zip(X_train, rf.best_estimator_.feature_importances_)))
rf_score = rf.best_score_
print('f1 score on training CV', rf.best_score_)

training model on train data ...
using 10-fold CV...
f1 score on training CV 0.317546603623


In [9]:
print('making prediciton on test data ....')
print('loading test data ...')
if os.path.exists(os.path.join(dataDir, "testData.csv.gz")):
    testData = pd.read_csv(os.path.join(dataDir, "testData.csv.gz"), sep=',', compression='gzip', index_col=0)
    X_test = testData
    # add word-length feature
    X_test['word_length'] = X_test.apply(lambda row: len(row['Word']),axis=1)
    # quick and dirty
    X_test.drop(['Word'],axis=1, inplace=True)
    # One-Hot Encode Categorical Variables
    # simple categorical column detection
    cat_cols = []
    for index,val in X_test.tail(1).iteritems():
        if isinstance(val.values[0],str): # simple categorical feature detection
            cat_cols.append(index)
    # one-hot encode
    X_test = encode_onehot(X_test, cat_cols)
    print('calling predict ...')
    y_pred = pd.DataFrame(rf.predict(X_test))
    y_pred.columns = y_train.columns
    y_pred.index.name = y_train.index.name
    print('saving predictions to testTargets.csv ...')
    y_pred.to_csv('testTargets.csv')
else:
    print('looks like this is a redacted dataset. This step cannot be completed ...')

making prediciton on test data ....
loading test data ...
calling predict ...
saving predictions to testTargets.csv ...


In [10]:
print('computing performance on test data ...')
if os.path.exists(os.path.join(dataDir, "testTargets.csv.gz")):
    testTargets = pd.read_csv(os.path.join(dataDir, "testTargets.csv.gz"), sep=',', compression='gzip', index_col=0)
    y_test = testTargets
    y_pred = pd.read_csv('testTargets.csv')
    conf_mat = metrics.confusion_matrix(y_test['Level.Teachers.Average'], y_pred['Level.Teachers.Average'])
    f1 = metrics.f1_score(y_test['Level.Teachers.Average'], y_pred['Level.Teachers.Average'], average='macro')
    print('f1 score on test data', f1)
else:
    print('looks like this is a redacted dataset. This step cannot be completed ...')

computing performance on test data ...
f1 score on test data 0.339578993394


In [11]:
conf_mat

array([[ 57,  56,  28,   6,   2,   4,  29],
       [ 37,  91,  81,  22,   9,  10,  31],
       [ 17,  82, 143,  47,  13,  21,  24],
       [  7,  27,  68,  46,  15,  29,  57],
       [  0,   4,  12,   9,  13,  19,  35],
       [  0,   0,   0,   1,   4,  14,  19],
       [  2,   0,   0,   1,   6,  11, 191]])

In [14]:
train_performance = OrderedDict([
        ('train', OrderedDict([
            ('split', OrderedDict([
                    ('type', 'StratifiedKFold'),
                    ('n_splits', 10),
                    ('random_state', 42),
                    ('shuffle', True)
                    ])
            ),
            ('score', OrderedDict([
                    ('metric', 'f1Macro'),
                    ('value', rf_score)])
            )
        ]))
    ])


test_performance = OrderedDict([
        ('test', OrderedDict([
            ('score', OrderedDict([
                    ('metric', 'f1Macro'),
                    ('value', f1)])
            )
        ]))
    ])
print('saving performance.json ...')

overall_performance = OrderedDict()
overall_performance.update(train_performance)
overall_performance.update(test_performance)

with open('performance.json', 'w', encoding='utf-8') as f:
    json.dump(overall_performance, f, indent=2)
print(json.dumps(overall_performance, indent=2))

saving performance.json ...
{
  "train": {
    "split": {
      "type": "StratifiedKFold",
      "n_splits": 10,
      "random_state": 42,
      "shuffle": true
    },
    "score": {
      "metric": "f1Macro",
      "value": 0.31754660362282333
    }
  },
  "test": {
    "score": {
      "metric": "f1Macro",
      "value": 0.33957899339438263
    }
  }
}
