In [None]:
#Created on Mon Mar  9 22:09:02 2020
#
#@author: Ramin Mehdizad Tekiyeh

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

In [None]:
# Load the raw data set from the CSV file.
df_original=pd.read_csv("ComputerSpeedData.csv")
# Work on an independent copy: a plain `df=df_original` would only alias the
# same object, so columns added to `df` later would also appear in
# `df_original` and it would no longer hold the data as loaded.
df=df_original.copy()

In [None]:
# The four repeated run-time measurements that get collapsed into one value.
run_columns=['Run1 (ms)','Run2 (ms)','Run3 (ms)','Run4 (ms)']
# Add the runs up column by column (same left-to-right order as a chained
# `+`) and divide by four to obtain the average run time.
run_total=df[run_columns[0]]
for col in run_columns[1:]:
    run_total=run_total+df[col]
df["RunF"]=run_total/4
# The individual run columns are not needed once they have been averaged.
df=df.drop(columns=run_columns)
# Overall mean of the averaged run time; the value is neither stored nor,
# not being the last expression of the cell, displayed.
df['RunF'].mean()
# Binary target column: 1 when the average run time is >= 100, otherwise 0.
is_slow=df['RunF']>=100
df['RunF_binary']=np.where(is_slow,1,0)
dff_original=df

In [None]:
# Hold out 30% of the rows as the test set; the fixed random_state makes the
# partition repeatable between runs.
split_parts=train_test_split(dff_original,random_state=5000,test_size=0.3)
train,test=split_parts

In [None]:
# Positional layout used for both partitions: columns 0..13 are taken as the
# model inputs and the last column as the label.
# NOTE(review): presumably the 14 leading columns are the tuning parameters
# and the last column is RunF_binary - confirm against the CSV layout.
feature_cols=slice(0,14)
label_col=-1
x_train,y_train=train.iloc[:,feature_cols],train.iloc[:,label_col]
x_test,y_test=test.iloc[:,feature_cols],test.iloc[:,label_col]
print('x_train.shape[0] is:     ',x_train.shape[0])

In [None]:
#---------------------use decision tree--------------------
# Baseline: a depth-unconstrained decision tree (Gini criterion) trained on
# the full feature set.
clf=DecisionTreeClassifier(criterion='gini')
# training the model
clf=clf.fit(x_train,y_train)
# applying model on both train and test data
y_pred_train=clf.predict(x_train)
y_pred_test=clf.predict(x_test)

print("accuracy_score(y_train,y_pred_train) is: \n",
      accuracy_score(y_train,y_pred_train),end='\n\n')
print("accuracy_score(y_test,y_pred_test) is: \n",
      accuracy_score(y_test,y_pred_test),end='\n\n')
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# true classes and columns the predicted classes. Passing the predictions
# first would print the transposed matrix.
print("confusion_matrix(y_train,y_pred_train) is: \n",
      confusion_matrix(y_train,y_pred_train),end='\n\n')
print("confusion_matrix(y_test,y_pred_test) is: \n",
      confusion_matrix(y_test,y_pred_test),end='\n\n')


In [None]:
def dtmetrics (x_tr,x_ts,y_tr,y_ts,max_depth,random_state=None):
    """Train one decision tree per depth and score it on train and test data.

    Parameters
    ----------
    x_tr, x_ts : feature tables for the training and test partitions.
    y_tr, y_ts : labels matching ``x_tr`` and ``x_ts``.
    max_depth : a single depth or a sequence of depths; one
        ``DecisionTreeClassifier`` is fitted for each value.
    random_state : forwarded to ``DecisionTreeClassifier``. The default
        ``None`` leaves the classifier's own default in place.

    Returns
    -------
    tuple of four lists, each with one entry per depth, in this order:
    ``(test_accuracy, train_accuracy, test_auc, train_auc)``.
    """
    # initializing the lists
    train_results=[]
    test_results=[]
    train_accuracy=[]
    test_accuracy=[]
    # Iterate over the depths that were passed in (not over a module-level
    # variable); np.atleast_1d also lets callers pass a single depth.
    for depth in np.atleast_1d(max_depth):
        # defining and training the model for this depth
        dt=DecisionTreeClassifier(max_depth=depth,random_state=random_state)
        dt.fit(x_tr,y_tr)
        # predicting train and test data (each partition is predicted once)
        train_pred=dt.predict(x_tr)
        test_pred=dt.predict(x_ts)
        # NOTE: roc_curve receives the class predictions from dt.predict,
        # not probability scores.
        fpr,tpr,thresholds=roc_curve(y_tr,train_pred)
        train_results.append(auc(fpr,tpr))
        fpr,tpr,thresholds=roc_curve(y_ts,test_pred)
        test_results.append(auc(fpr,tpr))
        train_accuracy.append(accuracy_score(y_tr,train_pred))
        test_accuracy.append(accuracy_score(y_ts,test_pred))
    return test_accuracy,train_accuracy,test_results,train_results

In [None]:
# Tree depths 1 through 22 to be tried by the classifier models.
max_depths=np.arange(1,23)
# Run the depth sweep on the full feature set, then unpack the four metric
# lists in the order dtmetrics returns them.
depth_sweep=dtmetrics(x_tr=x_train,x_ts=x_test,y_tr=y_train,y_ts=y_test,max_depth=max_depths)
test_accuracy,train_accuracy,test_AUC,train_AUC=depth_sweep

In [None]:
# AUC against tree depth for the full feature set: train in blue, test in red.
for scores,fmt,name in ((train_AUC,'b','Train AUC'),(test_AUC,'r','Test AUC')):
    plt.plot(max_depths,scores,fmt,label=name)
plt.title("AUC VS Tree Depth for Original Data")
plt.xlabel("Tree depth")
plt.ylabel("AUC score")
plt.legend(loc='best',frameon=False)
plt.show()

In [None]:
# Accuracy against tree depth for the full feature set: train in blue, test in red.
for scores,fmt,name in ((train_accuracy,'b','Train Accuracy'),(test_accuracy,'r','Test Accuracy')):
    plt.plot(max_depths,scores,fmt,label=name)
plt.title("Accuracy VS Tree Depth for Original Data")
plt.xlabel("Tree depth")
plt.ylabel("Accuracy score")
plt.legend(loc='best',frameon=False)
plt.show()

In [None]:
#---------feature selection by svm method----------------
# In this section an L1-penalised linear SVM is used for feature selection:
# SelectFromModel keeps the features chosen from the fitted model.

# define model for feature selection
m=LinearSVC(C=0.000002,penalty='l1',dual=False)
# fitting the model on the training data
clf=m.fit(x_train,y_train)
s=SelectFromModel(clf,prefit=True)
Xnew=s.transform(x_train)
print("Xnew",Xnew[0:5,:],end='\n\n')

# Read the selected columns off the fitted selector instead of hard-coding
# them, so the message and the reduced frame always agree with the model.
support=s.get_support()
selected_idx=np.flatnonzero(support)
print("features",",".join(map(str,selected_idx)),"are the most effective")

# dropping the unimportant (unselected) features from the dataset
unselected_features=list(x_train.columns[~support])
df2=df_original.drop(columns=unselected_features)

# calculating GPU run time average
run_columns=['Run1 (ms)','Run2 (ms)','Run3 (ms)','Run4 (ms)']
df2["RunF"]=(df2["Run1 (ms)"]+df2["Run2 (ms)"]+df2["Run3 (ms)"]+df2["Run4 (ms)"])/4
df2=df2.drop(columns=run_columns)
# binary target column: 1 when the average run time is >= 100, otherwise 0
df2['RunF_binary']=np.where(df2['RunF']>=100,1,0)

In [None]:
# Data frames for the reduced feature set. The columns are taken from the
# fitted selector `s`; slicing the first five columns positionally would pick
# columns 0-4 instead of the features the selector actually kept.
selected_mask=s.get_support()
x_train_RD=x_train.loc[:,selected_mask]
y_train_RD=train.iloc[:,-1]
x_test_RD=x_test.loc[:,selected_mask]
y_test_RD=test.iloc[:,-1]

In [None]:
# applying a decision tree classifier (Gini criterion) to the reduced-feature
# train data and scoring it on both partitions
clf=DecisionTreeClassifier(criterion='gini')
clf=clf.fit(x_train_RD,y_train_RD)
y_pred_train_RD=clf.predict(x_train_RD)
y_pred_test_RD=clf.predict(x_test_RD)

print("accuracy_score(y_train_RD,y_pred_train_RD) is: \n",
      accuracy_score(y_train_RD,y_pred_train_RD),end='\n\n')
print("accuracy_score(y_test_RD,y_pred_test_RD) is: \n",
      accuracy_score(y_test_RD,y_pred_test_RD),end='\n\n')
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# true classes and columns the predicted classes. Passing the predictions
# first would print the transposed matrix.
print("confusion_matrix(y_train_RD,y_pred_train_RD) is: \n",
      confusion_matrix(y_train_RD,y_pred_train_RD),end='\n\n')
print("confusion_matrix(y_test_RD,y_pred_test_RD) is: \n",
      confusion_matrix(y_test_RD,y_pred_test_RD),end='\n\n')

In [None]:
# Repeat the depth sweep on the reduced feature set, then unpack the four
# metric lists in the order dtmetrics returns them.
depth_sweep_RD=dtmetrics(x_tr=x_train_RD,x_ts=x_test_RD,y_tr=y_train_RD,y_ts=y_test_RD,max_depth=max_depths)
test_accuracy_RD,train_accuracy_RD,test_AUC_RD,train_AUC_RD=depth_sweep_RD

In [None]:
# AUC against tree depth for the reduced feature set: train in blue, test in red.
plt.figure(30)
for scores,fmt,name in ((train_AUC_RD,'b','Train AUC'),(test_AUC_RD,'r','Test AUC')):
    plt.plot(max_depths,scores,fmt,label=name)
plt.title("AUC VS Tree Depth for Selected Features")
plt.xlabel("Tree depth")
plt.ylabel("AUC score")
plt.legend(loc='best',frameon=False)
plt.show()

In [None]:
# Accuracy against tree depth for the reduced feature set: train in blue, test in red.
plt.figure(30)
for scores,fmt,name in ((train_accuracy_RD,'b','Train Accuracy'),(test_accuracy_RD,'r','Test Accuracy')):
    plt.plot(max_depths,scores,fmt,label=name)
plt.title("Accuracy VS Tree Depth for Selected Features")
plt.xlabel("Tree depth")
plt.ylabel("Accuracy score")
plt.legend(loc='best',frameon=False)
plt.show()