In [11]:
import pandas as  pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,accuracy_score,recall_score,precision_score,f1_score
import sys




In [12]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    # A zero denominator means the metric is undefined; report 0.0, which is
    # also what scikit-learn's precision/recall/F1 scorers report by default.
    return numerator / denominator if denominator else 0.0


def showMetrics(y_test,y_predictied):
    """Print a binary-classification report for true vs. predicted labels.

    The report contains the confusion-matrix counts, the accuracy, and
    recall / precision / F1 for the positive class (label 1, via scikit-learn)
    and for the negative class (derived from the confusion-matrix counts),
    followed by the unweighted mean of the two F1 scores.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_predictied : array-like
        Labels predicted by the classifier, in the same order as ``y_test``.

    Returns
    -------
    None
        Everything is written to stdout.

    Notes
    -----
    The confusion matrix is indexed as 2x2 with row/column 0 treated as the
    negative class and row/column 1 as the positive class, so the labels are
    expected to be binary with both classes present in the inputs.
    Undefined negative-class ratios (zero denominator) are reported as 0.0
    instead of ``nan`` so the averaged F1 stays a usable number.
    """
    conMat=confusion_matrix(y_test,y_predictied)
    # Rows are true labels, columns are predicted labels.
    TP=conMat[1,1]
    TN=conMat[0,0]
    FP=conMat[0,1]
    FN=conMat[1,0]
    print ("-"*40)
    print ("Model Scores")
    print ("TotalNumber of tests",conMat.sum())
    print ("TP",TP)
    print ("TN",TN)
    print ("FP",FP)
    print ("FN",FN)

    print ("Accuracy :",accuracy_score(y_test,y_predictied))

    print ("\n\n*** Positive class - majority in training *** ")
    print ("Recall_True Posiive Rate :",recall_score(y_test,y_predictied)) # (TP/(TP+FN))
    print ("Precision_True Positive Rate :",precision_score(y_test,y_predictied)) #(TP/(Tp+FP))
    # Compute the positive-class F1 once and reuse it for the average below.
    f1Positive=f1_score(y_test,y_predictied)
    print(  "F1 positive :",f1Positive)


    print ("\n\n*** Negative class - minority in training *** ")
    RecallNegative=_safe_ratio(TN,TN+FP)
    print ("Recall_True Negative Rate :(TN/(TN+FP)) ",RecallNegative) # (TN/(TN+FP))
    precisionNegative=_safe_ratio(TN,TN+FN)
    print ("Precision_True Negative Rate : # (TN/(TN+FN))",precisionNegative)  # (TN/(TN+FN))
    # Harmonic mean of negative-class recall and precision; 0.0 when both are 0.
    f1Negative=2*_safe_ratio(RecallNegative*precisionNegative,RecallNegative+precisionNegative)
    print(  "F1 Negative :", f1Negative  )

    print (    "\n\n\n ******************** Avg F1 score",  (f1Positive+f1Negative)/2,'    **************************')
    print ("-"*40)


    

# ***** ***** ****** *****
# Steps
# ***** ***** ****** *****

- Loading the data
- Splitting the data to x_train , y_train , x_test , y_test
- Data Normalization
- PCA. It can serve two purposes:
   - PCA can be used to reduce dimensionality, but for our data that is not a problem (after one-hot encoding there are only 43 columns, which is not a large number).
   - So I used PCA to reduce noise in the data by keeping the principal components and removing the components with low variance (in some cases this can decrease accuracy because some information is lost, but in our case the accuracy improved).
- Use Support Vector Machine as the classifier (SVM)
  - Use a Gaussian (RBF) kernel, because I think the data may not be linearly separable.
  - Set `class_weight` for class 0 to 2, to give more importance to the minority class and thereby reduce the effect of the class imbalance.
  - Used grid search to look for the best combination of the SVM's C and gamma parameters and, after some manual tuning, chose gamma = 0.0004 and C = 0.1. (These values mean that I want the model to prioritize finding a good hyperplane over classifying every training sample correctly, because I believe the training data contains a lot of noise.)
- ## Performance Criteria
  - To judge the classifier I do not rely on accuracy, which can be misleading because the data is unbalanced. Instead I compute the F1 score (which takes into account both recall and precision for a specific class) for each of the two classes and take their average; this average is my performance criterion.
  
   

In [13]:
# Read the pre-processed training and validation sets (semicolon-delimited)
# and discard the "Unnamed: 0" column right away
# (presumably the row index written when the CSVs were saved - TODO confirm).
train = pd.read_csv("training_processed.csv", sep=";").drop("Unnamed: 0", axis=1)
test = pd.read_csv("validation_processed.csv", sep=";").drop("Unnamed: 0", axis=1)




In [14]:
## data splitting

# For each split, the "classLabel" column is the target and every other
# column is a feature.
x_train, y_train = train.drop("classLabel", axis=1), train["classLabel"]
x_test, y_test = test.drop("classLabel", axis=1), test["classLabel"]

# Preview the first training feature rows.
x_train.head()

Unnamed: 0,variable2,variable3,variable8,variable11,variable14,variable15,variable1_0,variable1_1,variable4_0,variable4_1,...,variable7_8,variable9_0,variable9_1,variable10_0,variable10_1,variable12_0,variable12_1,variable13_0,variable13_1,variable13_2
0,17.92,5.4e-05,1.75,1.0,80.0,5.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
1,16.92,3.4e-05,0.29,0.0,200.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,31.25,0.000112,0.0,1.0,96.0,19.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
3,48.17,0.000133,0.335,0.0,0.0,120.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
4,32.33,0.00035,0.5,0.0,232.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0


# PCA


In [15]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, so that
# nothing about the validation set influences the preprocessing.
scaler = StandardScaler().fit(x_train)

# Standardize both splits with those training-set statistics.
# Note: x_train / x_test are replaced by the scaled arrays here.
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

x_train



  import sys


array([[-1.19804502e+00, -9.32922311e-01, -1.86173583e-01, ...,
         3.31913453e-01, -1.21361807e-01, -3.04741530e-01],
       [-1.28585921e+00, -9.75798843e-01, -5.96345844e-01, ...,
        -3.59747012e+00, -1.21361807e-01,  4.00987577e+00],
       [-2.74818101e-02, -8.10567328e-01, -6.77818417e-01, ...,
         3.31913453e-01, -1.21361807e-01, -3.04741530e-01],
       ...,
       [ 4.66650739e-04,  8.05463847e-01, -4.68791198e-01, ...,
         3.31913453e-01, -1.21361807e-01, -3.04741530e-01],
       [-4.53506090e-02, -8.39157418e-01, -6.40761498e-01, ...,
         3.31913453e-01, -1.21361807e-01, -3.04741530e-01],
       [-9.55727912e-01,  8.58211139e-02, -5.64476808e-01, ...,
        -5.82381190e-01, -1.21361807e-01,  6.99189851e-01]])

In [16]:
from sklearn.decomposition import PCA

# Keep 27 principal components; the number was chosen by experiment.
# random_state is fixed so the projection is reproducible across runs:
# with the default "auto" solver selection scikit-learn may pick a randomized
# SVD for data of this size, which is otherwise seeded differently every run.
pca = PCA(n_components=27, random_state=0)
# Fit on the training features only, then project both splits.
pca.fit(x_train)

# Note: x_train / x_test are replaced by their projections here, so re-run
# the preceding scaling cell before re-running this one.
x_train = pca.transform(x_train)
x_test = pca.transform(x_test)





# Learning

## support vector machine

In [17]:
# Disabled experiment: the grid search below is wrapped in a string literal so
# it is not executed (the cell only echoes the string back as its output).
# It searches C and gamma for an RBF-kernel SVC with class_weight={0:2}.
# NOTE(review): the C candidates list 1000 twice, and the C=0.1 used by the
# final model in the next cell is not part of this grid.
"""
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC # "Support Vector Classifier" 

parameter_candidates = [
  {'C': [1, 10, 100, 1000,1000], 'gamma': [0.001, 0.0004,0.00001], 'kernel': ['rbf']},
]

# Create a classifier object with the classifier and parameter candidates
clf = GridSearchCV(estimator=SVC(class_weight={0:2}), param_grid=parameter_candidates, n_jobs=-1,verbose=10)

# Train the classifier on data1's feature and target data
clf.fit(x_train, y_train)   
print('Best score for data1:', clf.best_score_) 
clf.best_estimator_

"""


'\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.svm import SVC # "Support Vector Classifier" \n\nparameter_candidates = [\n  {\'C\': [1, 10, 100, 1000,1000], \'gamma\': [0.001, 0.0004,0.00001], \'kernel\': [\'rbf\']},\n]\n\n# Create a classifier object with the classifier and parameter candidates\nclf = GridSearchCV(estimator=SVC(class_weight={0:2}), param_grid=parameter_candidates, n_jobs=-1,verbose=10)\n\n# Train the classifier on data1\'s feature and target data\nclf.fit(x_train, y_train)   \nprint(\'Best score for data1:\', clf.best_score_) \nclf.best_estimator_\n\n'

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC  # "Support Vector Classifier"

# RBF-kernel SVM; errors on class 0 are weighted twice as heavily.
clf = SVC(
    C=0.1,
    kernel='rbf',
    gamma=0.0004,
    tol=1e-9,
    class_weight={0: 2},
)
# Earlier alternative, kept for reference:
#   SVC(kernel='rbf', class_weight={0:2}, C=2000)

# Train on the training features and labels; the fitted estimator is the
# value displayed by this cell.
clf.fit(x_train, y_train)


SVC(C=0.1, cache_size=200, class_weight={0: 2}, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0004, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=1e-09, verbose=False)

In [19]:
# Predict a class label for every row of the validation features with the fitted SVM.
y_predictied = clf.predict(x_test)


# Metrics

In [20]:
# Print the evaluation report comparing validation labels with the SVM predictions.
showMetrics(y_test=y_test, y_predictied=y_predictied)

----------------------------------------
Model Scores
TotalNumber of tests 200
TP 83
TN 95
FP 12
FN 10
Accuracy : 0.89


*** Positive class - majority in training *** 
Recall_True Posiive Rate : 0.8924731182795699
Precision_True Positive Rate : 0.8736842105263158
F1 positive : 0.8829787234042553


*** Negative class - minority in training *** 
Recall_True Negative Rate :(TN/(TN+FP))  0.8878504672897196
Precision_True Negative Rate : # (TN/(TN+FN)) 0.9047619047619048
F1 Negative : 0.8962264150943396



 Avg F1 score 0.8896025692492975
----------------------------------------
