In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn import svm
from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.kernel_approximation import RBFSampler

In [2]:
traindata = pd.read_csv("./TraData.csv", low_memory=False)

# remove the meaningless feature
traindata = traindata.drop(['dclkVerticals', 'ip', 'spaceId', 'deviceType', 'campaignId'], axis = 1).loc[:,:]

# replace null value to other data~~
traindata = traindata.fillna({'spaceCat': 'TEST'})

print(traindata.shape)
traindata.head()

(961457, 8)


Unnamed: 0,adx,spaceType,spaceCat,adType,os,publisherId,advertiserId,click
0,oN9K9JVNmtH69Q1KBbhHUtZ34mw/vdefue3wgNg/kWU=,site,IAB12,"[3,9]",Android,5ad6008850,266,0
1,oN9K9JVNmtH69Q1KBbhHUtZ34mw/vdefue3wgNg/kWU=,site,IAB14_1,"[3,9]",Android,ca10720d22,215,0
2,oN9K9JVNmtH69Q1KBbhHUtZ34mw/vdefue3wgNg/kWU=,site,IAB12,"[3,9]",Android,adcfa8aac4,215,0
3,oN9K9JVNmtH69Q1KBbhHUtZ34mw/vdefue3wgNg/kWU=,site,IAB12,"[3,9]",Windows,de184afdf3,173,0
4,oN9K9JVNmtH69Q1KBbhHUtZ34mw/vdefue3wgNg/kWU=,site,IAB12,"[3,9]",Android,5ad6008850,266,0


In [3]:
# Check null value
traindata[traindata.isnull().any(axis=1)].head()

Unnamed: 0,adx,spaceType,spaceCat,adType,os,publisherId,advertiserId,click


In [4]:
x = np.array(traindata.iloc[:, 0:7])
y = np.array(traindata.iloc[:, 7])

for i in range(0, 7):
    labelEncoder = LabelEncoder()
    x[:,i] = labelEncoder.fit_transform(x[:,i])
display(x)

array([[1, 1, 3, ..., 0, 29, 15],
       [1, 1, 5, ..., 0, 55, 7],
       [1, 1, 3, ..., 0, 46, 7],
       ..., 
       [1, 1, 3, ..., 5, 48, 15],
       [1, 1, 5, ..., 0, 55, 3],
       [1, 1, 5, ..., 8, 55, 3]], dtype=object)

In [13]:
# split the train and test data
train_X, test_X, train_y, test_y = train_test_split(x, y, test_size = 0.2, random_state=42)

In [6]:
#trans_data_X = pd.get_dummies(data_x, columns=list(data_x)[:-1])

In [18]:
# Standardization
# 可以跑可以不跑~

sc = StandardScaler()
sc.fit(train_X)
train_X = sc.transform(train_X)
test_X = sc.transform(test_X)
display(train_X)



array([[ 0.03568091,  0.05742403, -0.39702735, ..., -1.08325137,
        -0.49057641,  1.06627218],
       [ 0.03568091,  0.05742403, -0.39702735, ...,  0.30552063,
        -0.49057641, -0.56348567],
       [ 0.03568091,  0.05742403, -0.01642878, ...,  1.13878383,
         1.96653716, -0.56348567],
       ..., 
       [ 0.03568091,  0.05742403,  3.0283598 , ..., -1.08325137,
         0.92698911, -1.10673829],
       [ 0.03568091,  0.05742403, -0.39702735, ...,  1.13878383,
         2.43905899,  1.06627218],
       [ 0.03568091,  0.05742403, -0.39702735, ..., -1.08325137,
        -0.49057641,  1.06627218]])

In [11]:
clf = DecisionTreeClassifier()
clf.fit(train_X, train_y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [12]:
test_y_predicted = clf.predict(test_X)
accuracy = accuracy_score(test_y, test_y_predicted)
print('結果  : ', np.count_nonzero(test_y_predicted), '/', len(test_y_predicted))
print('準確率: ', accuracy)
fpr, tpr, thresholds = metrics.roc_curve(test_y, test_y_predicted)
auc = metrics.auc(fpr, tpr)
print('AUC: ', auc, '\n')
print(classification_report(test_y, test_y_predicted))

結果  :  0 / 192292
準確率:  0.999381149502
AUC:  0.5 

             precision    recall  f1-score   support

          0       1.00      1.00      1.00    192173
          1       0.00      0.00      0.00       119

avg / total       1.00      1.00      1.00    192292



  'precision', 'predicted', average, warn_for)


In [16]:
# output result
#pd.Series(test_y_predicted).to_csv('out.csv', sep=',')

In [19]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(train_X, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [20]:
test_y_predicted = clf.predict(test_X)
accuracy = accuracy_score(test_y, test_y_predicted)
print('結果  : ', np.count_nonzero(test_y_predicted), '/', len(test_y_predicted))
print('準確率: ', accuracy)
fpr, tpr, thresholds = metrics.roc_curve(test_y, test_y_predicted)
auc = metrics.auc(fpr, tpr)
print('AUC: ', auc, '\n')
print(classification_report(test_y_predicted, test_y))

結果  :  0 / 192292
準確率:  0.999381149502
AUC:  0.5 

             precision    recall  f1-score   support

          0       1.00      1.00      1.00    192292
          1       0.00      0.00      0.00         0

avg / total       1.00      1.00      1.00    192292



  'recall', 'true', average, warn_for)


In [21]:
clf = KNeighborsClassifier(n_neighbors=2, n_jobs=-1)
clf.fit(train_X, train_y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [22]:
test_y_predicted = clf.predict(test_X)
accuracy = accuracy_score(test_y, test_y_predicted)
print('結果  : ', np.count_nonzero(test_y_predicted), '/', len(test_y_predicted))
print('準確率: ', accuracy)
fpr, tpr, thresholds = metrics.roc_curve(test_y, test_y_predicted)
auc = metrics.auc(fpr, tpr)
print('AUC: ', auc, '\n')
print(classification_report(test_y_predicted, test_y))

結果  :  4 / 192292
準確率:  0.999370748653
AUC:  0.504193875205 

             precision    recall  f1-score   support

          0       1.00      1.00      1.00    192288
          1       0.01      0.25      0.02         4

avg / total       1.00      1.00      1.00    192292



In [None]:
clf = svm.SVC()
clf.fit(train_X, train_y)

In [None]:
test_y_predicted = clf.predict(test_X)
accuracy = accuracy_score(test_y, test_y_predicted)
print('結果  : ', np.count_nonzero(test_y_predicted), '/', len(test_y_predicted))
print('準確率: ', accuracy)
fpr, tpr, thresholds = metrics.roc_curve(test_y, test_y_predicted)
auc = metrics.auc(fpr, tpr)
print('AUC: ', auc, '\n')
print(classification_report(test_y_predicted, test_y))

In [None]:
clf = SGDClassifier(tol=None, max_iter=10)
clf.fit(train_X, train_y)

In [None]:
test_y_predicted = clf.predict(test_X)
accuracy = accuracy_score(test_y, test_y_predicted)
print('結果  : ', np.count_nonzero(test_y_predicted), '/', len(test_y_predicted))
print('準確率: ', accuracy)
fpr, tpr, thresholds = metrics.roc_curve(test_y, test_y_predicted)
auc = metrics.auc(fpr, tpr)
print('AUC: ', auc, '\n')
print(classification_report(test_y_predicted, test_y))