# Lab 04

### Sudarsan Haridas

### 300353099

**LOAD THE PYTHON LIBRARY**

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import RobustScaler

from sklearn.feature_selection import SelectFromModel

from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import confusion_matrix, classification_report

#importing necessary python libraries...

**READ THE DATA**

In [2]:
# Load the raw dataset, then keep an independent copy of the frame as loaded.
df = pd.read_csv(filepath_or_buffer = 'Lab4_user_behaviour.csv')

df_copy = df.copy(deep = True)

In [3]:
# Preview the first five rows; the column to predict is `classification`.
df.head(n = 5)

Unnamed: 0,inter_api_access_duration(sec),api_access_uniqueness,sequence_length(count),vsession_duration(min),ip_type,num_sessions,num_users,num_unique_apis,source,classification
0,7.010387,0.419355,31.0,13040,default,1,1,13,E,1
1,51.419393,0.252336,107.0,330113,default,1,1,27,E,1
2,25.860775,0.275,40.0,62066,default,1,1,11,E,1
3,0.205909,0.818182,11.0,136,default,1,1,9,E,1
4,0.122125,0.8125,16.0,118,default,1,1,13,E,1


In [4]:
# Count how many rows belong to each target class.
df['classification'].value_counts()

1    24734
0    10052
2     1309
3       22
Name: classification, dtype: int64

**DROP UNNEEDED DATA**

In [5]:
# Print each column's dtype and non-null count.
# Every column reports as many non-null entries as there are rows, so there are no null values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36117 entries, 0 to 36116
Data columns (total 10 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   inter_api_access_duration(sec)  36117 non-null  float64
 1   api_access_uniqueness           36117 non-null  float64
 2   sequence_length(count)          36117 non-null  float64
 3   vsession_duration(min)          36117 non-null  int64  
 4   ip_type                         36117 non-null  object 
 5   num_sessions                    36117 non-null  int64  
 6   num_users                       36117 non-null  int64  
 7   num_unique_apis                 36117 non-null  int64  
 8   source                          36117 non-null  object 
 9   classification                  36117 non-null  int64  
dtypes: float64(3), int64(5), object(2)
memory usage: 2.8+ MB


In [6]:
# Separate the label column from the predictor columns.
target = df['classification']

features = df.drop(columns = 'classification')

**CREATE DUMMY VALUES FOR DATAFRAME**

In [7]:
# One-hot encode the categorical feature columns, dropping the first level of each.
features = pd.get_dummies(data = features, drop_first = True)

**PREPARE FEATURES AND TARGET VARIABLE**

In [8]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified and class 3 has only 22 rows in total;
# consider passing stratify = target so each class keeps its share in both halves.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size = 0.25,
    random_state = 99,
)

**CREATING PIPELINE**

In [9]:
# Candidate classifiers for the comparison.
# `name_of_model` and `list_of_models` are parallel lists: entry i of one labels
# entry i of the other.
name_of_model = ['Logistic Regression', 'KNN', 'Linear SVC', 'SVM RBF', 'Decision Tree', 
                'Naive Bayes', 'Random Forest Classifier', 'AdaBoost Classifier', 
                'XGBoost Classifier', 'CatBoost Classifier']

# Empty list that will hold one test-set score per model.
score_of_model = []

# The scikit-learn tree and ensemble models are given a fixed random_state
# (the same seed value as the train/test split) so that repeated runs of the
# notebook produce the same scores for them.
list_of_models = [
    LogisticRegression(max_iter = 10000),
    KNeighborsClassifier(),
    SVC(kernel = "linear", C = 1),
    SVC(kernel = "rbf", C = 1, gamma = "auto"),
    DecisionTreeClassifier(max_depth = 5, random_state = 99),
    GaussianNB(),
    RandomForestClassifier(max_depth = 5, n_estimators = 100, random_state = 99),
    AdaBoostClassifier(random_state = 99),
    XGBClassifier(),
    CatBoostClassifier(learning_rate = 0.1,verbose = False)
]

# Guard the pairing: a name without a model (or the reverse) would mislabel scores.
assert len(name_of_model) == len(list_of_models), "model names and models are out of step"

In [10]:
# Fit and score every candidate classifier behind the same preprocessing:
# robust scaling -> feature selection by random-forest importance -> classifier.

# Start from an empty list on every run of this cell. Appending to a list
# created in another cell meant that re-running this cell alone left stale
# scores at the front of the list, which the scores table would then display.
score_of_model = []

# The names and models are paired by position, so they must be the same length.
assert len(name_of_model) == len(list_of_models), "model names and models are out of step"

for model_name, clf in zip(name_of_model, list_of_models):

    pipe = Pipeline(
        [
            ('Scaler', RobustScaler()),
            # Seeded so the set of selected features does not change from run to run.
            ('Feature Selection', SelectFromModel(estimator = RandomForestClassifier(n_estimators = 100, random_state = 99))),
            (model_name, clf)
        ]
    )

    pipe.fit(x_train, y_train)

    # Score on the held-out test set; the value is reported as the model's accuracy.
    score_of_model.append(pipe.score(x_test, y_test))

In [11]:
# Pair each model name with its score and tabulate the result.
scores = pd.DataFrame(
    data = list(zip(name_of_model, score_of_model)),
    columns = ['Classifier', 'Accuracy Score'],
)

**SELECT BEST PIPE**

In [12]:
# Rank the models from highest to lowest accuracy.
scores.sort_values('Accuracy Score', ascending = False)

Unnamed: 0,Classifier,Accuracy Score
9,CatBoost Classifier,0.987154
8,XGBoost Classifier,0.987043
6,Random Forest Classifier,0.985714
2,Linear SVC,0.984607
1,KNN,0.982614
3,SVM RBF,0.981063
4,Decision Tree,0.966999
0,Logistic Regression,0.964341
7,AdaBoost Classifier,0.962569
5,Naive Bayes,0.728571


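CatBoost Classifier has the highest accuracy (0.987154), with XGBoost Classifier a very close second (0.987043). The difference is negligible, and XGBoost Classifier is the model used in the rest of this notebook.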

In [13]:
# Build and fit the chosen pipeline (robust scaling -> feature selection ->
# XGBoost classifier), then predict the labels of the test set.
best_clf_pipe = Pipeline(
    [
        ('Scaler', RobustScaler()),
        # Seeded so the set of selected features does not change from run to run;
        # an unseeded forest could pick different features on each refit.
        ('Feature Selection', SelectFromModel(estimator = RandomForestClassifier(n_estimators = 100, random_state = 99))),
        ('XGB Classifier', XGBClassifier())
    ]
)

best_clf_pipe.fit(x_train, y_train)

# Predicted class labels for the held-out rows; used by the reports that follow.
y_pred = best_clf_pipe.predict(x_test)

In [14]:
# List the distinct labels the model actually predicted on the test set.
np.unique(y_pred)

array([0, 1, 2])

In [15]:
# Print the confusion matrix: rows are the true classes, columns the predicted classes.
print(confusion_matrix(y_true = y_test, y_pred = y_pred))

[[2544    0    1    0]
 [   0 6147    0    0]
 [   2  106  222    0]
 [   0    8    0    0]]


    - 2 'Class 2' values were predicted to be 'Class 0'
    - 106 'Class 2' values were predicted to be 'Class 1'
    - 8 'Class 3' values were predicted to be 'Class 1'
    - 1 'Class 0' values were predicted to be 'Class 2'

These were all the false predictions... 

117 false predictions

8913 accurate predictions

In [16]:
# Print per-class precision, recall and F1; undefined ratios are reported as 0.0.
print(classification_report(y_true = y_test, y_pred = y_pred, zero_division = 0.0))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2545
           1       0.98      1.00      0.99      6147
           2       1.00      0.67      0.80       330
           3       0.00      0.00      0.00         8

    accuracy                           0.99      9030
   macro avg       0.74      0.67      0.70      9030
weighted avg       0.99      0.99      0.99      9030



    - The overall accuracy of the model is very good
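    - The precision and recall values for 'Class 3' are 0 because the model never predicted 'Class 3': all 8 'Class 3' samples in the test set were classified as 'Class 1'.
    - The high overall accuracy is driven by the two large classes ('Class 0' and 'Class 1'); recall is only 0.67 for 'Class 2', which is why the macro average is much lower than the weighted average.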
    