In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import warnings

warnings.filterwarnings('ignore')

data = pd.read_csv('dense_data_type_and_emot.csv', header = None)
y_label = np.array(data[[0,1]])
data.drop(data.columns[[0,1]],axis=1,inplace=True)

In [2]:
feature = []
for item in list(data.columns):
    name = 'feature' + str(int(item)-1)
    feature.append(name)

data.columns = feature

In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier

gbdt_clf = GradientBoostingClassifier(learning_rate=0.01,max_depth=5,max_features=20,n_estimators=50)
scores = cross_val_score(gbdt_clf, data, y_label[:,0], cv=10)
scores

array([0.67729084, 0.67729084, 0.67729084, 0.67729084, 0.676     ,
       0.68      , 0.67871486, 0.67871486, 0.67871486, 0.67871486])

In [4]:
gbdt_clf.fit(data, y_label[:,0])
importance = gbdt_clf.feature_importances_
names = data.columns.values.tolist()
sort_list = sorted(zip(map(lambda x: round(x, 3), gbdt_clf.feature_importances_), names), reverse=True)

In [5]:
select_feature=[]
for k,v in sort_list:
    if k != 0.0:
        select_feature.append(v)
select_data = data[select_feature]

In [8]:
from sklearn.grid_search import GridSearchCV
warnings.filterwarnings('ignore')

gbdt_param_grid = {'learning_rate':[0.005,0.01,0.1],
                   'n_estimators':[20,50,100],
                   'max_depth':[5,10,15],
                   'max_features':np.arange(10,90,20),
                   }

gbdt_clf = GradientBoostingClassifier(**gbdt_param_grid)

gbdt_grid = GridSearchCV(gbdt_clf,
                    param_grid=gbdt_param_grid,
                    cv=5,
                    verbose=1,
                    scoring='accuracy')

gbdt_grid.fit(select_data, y_label[:,0])
gbdt_grid.best_params_, gbdt_grid.best_score_

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=1)]: Done 540 out of 540 | elapsed: 19.1min finished


({'learning_rate': 0.005,
  'max_depth': 5,
  'max_features': 10,
  'n_estimators': 20},
 0.6776)

In [12]:
from sklearn.svm import SVC

svm_param_grid = {'C':[0.001,0.01,0.1],
                   'kernel':['rbf', 'linear', 'poly', 'sigmoid']}

svm_clf = SVC(**svm_param_grid)

svm_grid = GridSearchCV(svm_clf,
                    param_grid=svm_param_grid,
                    cv=5,
                    verbose = 1,
                    scoring='accuracy')

svm_grid.fit(select_data, y_label[:,0])
svm_grid.best_params_, svm_grid.best_score_

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 35.3min finished


({'C': 0.001, 'kernel': 'rbf'}, 0.6776)

In [24]:
from sklearn.ensemble import VotingClassifier

gbdt_model = GradientBoostingClassifier(**gbdt_grid.best_params_)
svm_model = SVC(**svm_grid.best_params_)

estimators=[('gbdt', gbdt_model),('svm', svm_model),('gbdt_2', gbdt_model)]
ensemble = VotingClassifier(estimators, voting='hard')
modified_scores = cross_val_score(ensemble, select_data, y_label[:,0], cv=5, scoring='accuracy')
#ensemble.fit(select_data, y_label[:,0])
np.mean(modified_scores)

0.6776002832011329

In [26]:
from sklearn import preprocessing
data_scale = preprocessing.scale(select_data)

In [27]:
warnings.filterwarnings('ignore')

gbdt_param_grid_ = {'learning_rate':[0.005,0.01,0.1],
                   'n_estimators':[20,50,100],
                   'max_depth':[5,10,15],
                   'max_features':np.arange(10,90,20),
                   }

gbdt_clf_ = GradientBoostingClassifier(**gbdt_param_grid_)

gbdt_grid_ = GridSearchCV(gbdt_clf_,
                    param_grid=gbdt_param_grid_,
                    cv=5,
                    verbose=1,
                    scoring='accuracy')

gbdt_grid_.fit(data_scale, y_label[:,0])
gbdt_grid_.best_params_, gbdt_grid_.best_score_

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=1)]: Done 540 out of 540 | elapsed: 19.0min finished


({'learning_rate': 0.005,
  'max_depth': 10,
  'max_features': 10,
  'n_estimators': 100},
 0.678)

In [28]:
svm_param_grid_ = {'C':[0.001,0.01,0.1],
                   'kernel':['rbf', 'linear', 'poly', 'sigmoid']}

svm_clf_ = SVC(**svm_param_grid_)

svm_grid_ = GridSearchCV(svm_clf_,
                    param_grid=svm_param_grid_,
                    cv=5,
                    verbose = 1,
                    scoring='accuracy')

svm_grid_.fit(select_data, y_label[:,0])
svm_grid_.best_params_, svm_grid_.best_score_

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 35.6min finished


({'C': 0.001, 'kernel': 'rbf'}, 0.6776)