In [1]:
#%cd ../digits/
!ls

/home/thunder/Documents/learning/projects/kaggle/digit_recognizer/digits
data	     features.py   predict.py	README.md~	train.py
data_io.py   features.py~  __pycache__	SETTINGS.json	train.py~
data_io.py~  LICENSE	   README.md	SETTINGS.json~


In [10]:
# %load data_io.py
import csv
import json
import numpy as np
import os
import pandas as pd
import pickle

def get_paths():
    paths = json.loads(open("SETTINGS.json").read())
    for key in paths:
        paths[key] = os.path.expandvars(paths[key])
    return paths

def identity(x):
    return x

# For pandas >= 10.1 this will trigger the columns to be parsed as strings
converters = { "FullDescription" : identity
             , "Title": identity
             , "LocationRaw": identity
             , "LocationNormalized": identity
             }

def get_train_df():
    train_path = get_paths()["train_data_path"]
    return pd.read_csv(train_path, converters=converters)

def get_valid_df():
    valid_path = get_paths()["valid_data_path"]
    return pd.read_csv(valid_path, converters=converters)

def save_model(model):
    out_path = get_paths()["model_path"]
    pickle.dump(model, open(out_path, "wb"))

def load_model():
    in_path = get_paths()["model_path"]
    return pickle.load(open(in_path, "rb"))

def write_submission(predictions):
    prediction_path = get_paths()["prediction_path"]
    writer = csv.writer(open(prediction_path, "wb"), lineterminator="\n")
    valid = get_valid_df()
    rows = [x for x in zip(valid["Id"], predictions.flatten())]
    writer.writerow(("Id", "label"))
    writer.writerows(rows)


In [89]:
# %load train.py
import data_io
from features import FeatureMapper, SimpleTransform
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

def feature_extractor():
    features = [('FullDescription-Bag of Words', 'FullDescription', CountVectorizer(max_features=100)),]
    combined = FeatureMapper(features)
    return combined

def get_pipeline():
    features = feature_extractor()
    steps = [
             #("extract_features", features),
             ("classify", RandomForestRegressor(n_estimators=3, 
                                                verbose=2,
                                                n_jobs=1,
                                                min_samples_split=30,
                                                random_state=3465343))]
    return Pipeline(steps)

print("Reading in the training data")
train = data_io.get_train_df()
print(train.head())

Reading in the training data
   label  pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  \
0      1       0       0       0       0       0       0       0       0   
1      0       0       0       0       0       0       0       0       0   
2      1       0       0       0       0       0       0       0       0   
3      4       0       0       0       0       0       0       0       0   
4      0       0       0       0       0       0       0       0       0   

   pixel8    ...     pixel774  pixel775  pixel776  pixel777  pixel778  \
0       0    ...            0         0         0         0         0   
1       0    ...            0         0         0         0         0   
2       0    ...            0         0         0         0         0   
3       0    ...            0         0         0         0         0   
4       0    ...            0         0         0         0         0   

   pixel779  pixel780  pixel781  pixel782  pixel783  
0         0         0

In [90]:
print("Extracting features and training model")
classifier = get_pipeline()
classifier.fit(train[[x for x in train.columns if x != 'label']], train['label'])

print("Saving the classifier")
data_io.save_model(classifier)

Extracting features and training model
building tree 1 of 3
building tree 2 of 3
building tree 3 of 3

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:   11.4s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   32.7s finished



Saving the classifier


In [93]:
# %load predict.py
import data_io
import pickle

print("Loading the classifier")
classifier = data_io.load_model()

print("Making predictions") 
valid = data_io.get_valid_df()

Loading the classifier
Making predictions


In [98]:
valid.shape

(28000, 784)

In [104]:
predictions = classifier.predict(valid)   

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


In [105]:
predictions.shape

(28000,)

In [106]:
predictions

array([ 2.        ,  0.        ,  8.48793651, ...,  3.        ,
        9.        ,  2.        ])

In [107]:
print("Writing predictions to file")
data_io.write_submission(predictions)

Writing predictions to file


KeyError: 'Id'

In [109]:
# %load data_io.py
import csv
import json
import numpy as np
import os
import pandas as pd
import pickle

def get_paths():
    paths = json.loads(open("SETTINGS.json").read())
    for key in paths:
        paths[key] = os.path.expandvars(paths[key])
    return paths

def identity(x):
    return x

# For pandas >= 10.1 this will trigger the columns to be parsed as strings
converters = { "FullDescription" : identity
             , "Title": identity
             , "LocationRaw": identity
             , "LocationNormalized": identity
             }

def get_train_df():
    train_path = get_paths()["train_data_path"]
    return pd.read_csv(train_path, converters=converters)

def get_valid_df():
    valid_path = get_paths()["valid_data_path"]
    return pd.read_csv(valid_path, converters=converters)

def save_model(model):
    out_path = get_paths()["model_path"]
    pickle.dump(model, open(out_path, "wb"))

def load_model():
    in_path = get_paths()["model_path"]
    return pickle.load(open(in_path, "rb"))

In [111]:
prediction_path = get_paths()["prediction_path"]
prediction_path

'data/submissions/random_forest_benchmark.csv'

In [112]:
writer = csv.writer(open(prediction_path, "wb"), lineterminator="\n")
valid = get_valid_df()
valid

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [115]:
rows = [x for x in predictions.flatten()]
rows

[2.0,
 0.0,
 8.4879365079365083,
 7.1313131313131315,
 1.5515873015873016,
 7.0,
 0.0,
 3.0,
 0.0,
 2.925925925925926,
 5.1851851851851851,
 4.0,
 5.0611111111111109,
 0.0,
 4.0,
 2.7763677183032023,
 3.3833333333333333,
 1.0,
 9.0,
 0.0,
 9.0,
 1.0,
 1.0,
 3.3333333333333335,
 7.0,
 4.0,
 1.986013986013986,
 7.0148148148148151,
 4.0,
 6.9145299145299148,
 6.5,
 6.1628067273228559,
 4.0,
 2.0,
 5.916666666666667,
 6.8414814814814813,
 6.4833333333333334,
 5.0,
 3.5115440115440113,
 5.5733893557422975,
 7.0,
 7.0,
 4.0,
 9.0,
 8.0,
 6.9349593495934956,
 8.0,
 5.9523809523809526,
 6.666666666666667,
 2.9463869463869465,
 6.0,
 7.6045548654244302,
 7.9848631239935584,
 3.5,
 8.0833333333333339,
 3.0606060606060606,
 1.0,
 1.3961988304093567,
 3.9900695134061572,
 5.4722222222222223,
 4.0,
 1.0,
 7.0,
 0.0,
 0.0,
 0.0,
 1.0,
 8.445736434108527,
 0.0,
 1.0,
 6.0,
 5.3798449612403099,
 5.0156695156695159,
 6.4444444444444455,
 2.9086838694473904,
 7.8589743589743586,
 5.5290594498669039,
 8.

In [None]:
writer.writerow(("Id", "label"))
writer.writerows(rows)

In [118]:
help(os.write
    )

Help on built-in function write in module posix:

write(...)
    write(fd, data) -> byteswritten
    
    Write bytes to a file descriptor.

