In [7]:
import csv
import random
import numpy  as np


#load train set
def loadTrainSet():
	traindata = []
	trainlabel = []
	table = {"Class_1":1,"Class_2":2,"Class_3":3,"Class_4":4,"Class_5":5,"Class_6":6,"Class_7":7,"Class_8":8,"Class_9":9}
	with open("./all/train.csv") as f:
		rows = csv.reader(f)
		rows.next()
		for row in rows:
			l = []
			for i in range(1,94):
				l.append(int(row[i]))
			traindata.append(l)
			trainlabel.append(table.get(row[-1]))
	f.close()

	traindata = np.array(traindata,dtype="float")
	trainlabel = np.array(trainlabel,dtype="int")
	#Standardize(zero-mean,nomalization)
	mean = traindata.mean(axis=0)
	std = traindata.std(axis=0)
	traindata = (traindata - mean)/std
	
	#shuffle the data
	randomIndex = [i for i in xrange(len(trainlabel))]
	random.shuffle(randomIndex)
	traindata = traindata[randomIndex]
	trainlabel = trainlabel[randomIndex]
	return traindata,trainlabel

#load test set
def loadTestSet():
	testdata = []
	with open("./all/test.csv") as f:
		rows = csv.reader(f)
		rows.next()
		for row in rows:
			l = []
			for i in range(1,94):
				l.append(int(row[i]))
			testdata.append(l)
	f.close()
	testdata = np.array(testdata,dtype="float")
	#Standardize(zero-mean,nomalization)
	mean = testdata.mean(axis=0)
	std = testdata.std(axis=0)
	testdata = (testdata - mean)/std
	return testdata


#Evaluation function
#Refer to:https://www.kaggle.com/c/otto-group-product-classification-challenge/details/evaluation
def evaluation(label,pred_label):
	num = len(label)
	logloss = 0.0
	for i in range(num):
		p = max(min(pred_label[i][label[i]-1],1-10**(-15)),10**(-15))
		logloss += np.log(p)
	logloss = -1*logloss/num
	return logloss


#save result as csv file
def saveResult(testlabel,filename = "./all/submission.csv"):
	with open(filename,'wb') as myFile:
		myWriter=csv.writer(myFile)
		myWriter.writerow(["id","Class_1","Class_2","Class_3","Class_4","Class_5","Class_6","Class_7","Class_8","Class_9"])
		id_num = 1
		for eachlabel in testlabel:
			l = []
			l.append(id_num)
			l.extend(eachlabel)
			myWriter.writerow(l)
			id_num += 1

In [8]:
import time
#import preprocess

from sklearn.ensemble import RandomForestClassifier
from sklearn import ensemble, feature_extraction, preprocessing
from sklearn.calibration import CalibratedClassifierCV

# from otto_utils import consts, utils

def loaddata():
	print "loading data..."
	#load data in train.csv, divided into train data and validation data
	data,label = loadTrainSet()
	val_data = data[0:6000]
	val_label = label[0:6000]
	train_data = data[6000:]
	train_label = label[6000:]
	#load data in test.csv
	test_data = loadTestSet()
	return train_data,train_label,val_data,val_label,test_data




def rf(train_data,train_label,val_data,val_label,test_data,name="./all/RandomForest_submission.csv"):
	print "Start training Random forest..."
	rfClf = RandomForestClassifier(n_estimators=120,n_jobs=-1)
	rfClf.fit(train_data,train_label)
	#evaluate on validation set
	calibrated_classifier = CalibratedClassifierCV(rfClf, method='isotonic')
	fitted_classifier = calibrated_classifier.fit(train_data, train_label)
	val_pred_label = fitted_classifier.predict_proba(val_data)
# 	test_label = rfClf.predict_proba(test_data)
# 	val_pred_label = rfClf.predict_proba(val_data)
	logloss = evaluation(val_label,val_pred_label)
	print "logloss of validation set:",logloss

	print "Start classify test set..."
	calibrated_classifier = CalibratedClassifierCV(rfClf, method='isotonic')
	fitted_classifier = calibrated_classifier.fit(train_data, train_label)
	test_label = fitted_classifier.predict_proba(test_data)
# 	test_label = rfClf.predict_proba(test_data)
	saveResult(test_label,filename = name)



if __name__ == "__main__":
	t1 = time.time()
	train_data,train_label,val_data,val_label,test_data = loaddata()
	rf(train_data,train_label,val_data,val_label,test_data) 
	t2 = time.time()
	print "Done! It cost",t2-t1,"s"

loading data...
Start training Random forest...
logloss of validation set: 0.491747322316913
Start classify test set...
Done! It cost 234.059605837 s


In [2]:
import pandas as pd
data=pd.read_csv('./all/train.csv')

In [10]:
type(data)

pandas.core.frame.DataFrame

In [11]:
data?

In [5]:
data.head()

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [6]:
data.tail()

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
61873,61874,1,0,0,1,1,0,0,0,0,...,1,0,0,0,0,0,0,2,0,Class_9
61874,61875,4,0,0,0,0,0,0,0,0,...,0,2,0,0,2,0,0,1,0,Class_9
61875,61876,0,0,0,0,0,0,0,3,1,...,0,3,1,0,0,0,0,0,0,Class_9
61876,61877,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,3,10,0,Class_9
61877,61878,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,Class_9


In [8]:
data.shape

(61878, 95)

In [12]:
f1=data.feat_1.shape

In [17]:
(data==0).astype(int).sum(axis=0)

id             0
feat_1     51483
feat_2     55018
feat_3     49295
feat_4     48448
feat_5     58907
feat_6     60710
feat_7     56443
feat_8     45312
feat_9     49836
feat_10    54195
feat_11    45043
feat_12    55342
feat_13    50430
feat_14    34542
feat_15    43770
feat_16    31649
feat_17    51748
feat_18    44037
feat_19    56122
feat_20    49044
feat_21    54544
feat_22    40873
feat_23    57470
feat_24    22077
feat_25    27295
feat_26    49180
feat_27    52827
feat_28    54009
feat_29    54521
           ...  
feat_65    52706
feat_66    44352
feat_67    23930
feat_68    53144
feat_69    53174
feat_70    40241
feat_71    51493
feat_72    43995
feat_73    51527
feat_74    50047
feat_75    48568
feat_76    48487
feat_77    58354
feat_78    55574
feat_79    54697
feat_80    48565
feat_81    58695
feat_82    56442
feat_83    53668
feat_84    60455
feat_85    48914
feat_86    36516
feat_87    49859
feat_88    41844
feat_89    48248
feat_90    53542
feat_91    57030
feat_92    482