## Imports

In [1]:
from collections import Counter, defaultdict
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import utils
from tabulate import tabulate
import matplotlib.pyplot as plt
import seaborn as sns
from gensim.models.word2vec import Word2Vec
from gensim.models import Word2Vec
import gensim
import re
import numpy as np
import pylab as pl

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import learning_curve, GridSearchCV
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")


%matplotlib inline

  from pandas import Panel


In [2]:
# imports needed and logging
import gzip
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

## Reading Data 

### Reading the BNC written data 

### Spliting data into train/dev/test set

In [3]:
DATA_SET_PATH = "dataset/input.txt"
X, y = [], []
with open(DATA_SET_PATH, "r") as infile:
    for line in infile:
        text, label = line.split("\t")
        # texts are already tokenized, just split on space
        # in a real case we would use e.g. spaCy for tokenization
        # and maybe remove stopwords etc.
        X.append(text.split())
        y.append(label)
X, y = np.array(X), np.array(y)

TRAIN_SET_PATH = "dataset/train_set.txt"
DEV_SET_PATH = "dataset/dev_set.txt"
TEST_SET_PATH = "dataset/test_set.txt"

X_train, y_train = [], []
with open(TRAIN_SET_PATH, "r") as infile:
    for line in infile:
        text, label = line.split("\t")
        # texts are already tokenized, just split on space
        # in a real case we would use e.g. spaCy for tokenization
        # and maybe remove stopwords etc.
        X_train.append(text.split())
        y_train.append(label)
X_train, y_train = np.array(X_train), np.array(y_train)

X_val, y_val = [], []
with open(DEV_SET_PATH, "r") as infile:
    for line in infile:
        text, label = line.split("\t")
        # texts are already tokenized, just split on space
        # in a real case we would use e.g. spaCy for tokenization
        # and maybe remove stopwords etc.
        X_val.append(text.split())
        y_val.append(label)
X_val, y_val = np.array(X_val), np.array(y_val)

X_test, y_test = [], []
with open(TEST_SET_PATH, "r") as infile:
    for line in infile:
        text, label = line.split("\t")
        # texts are already tokenized, just split on space
        # in a real case we would use e.g. spaCy for tokenization
        # and maybe remove stopwords etc.
        X_test.append(text.split())
        y_test.append(label)
X_test, y_test = np.array(X_test), np.array(y_test)

for i in range(0, len(y) - 1):
    y[i] = y[i][:-1]

for i in range(0, len(y_train)):
    y_train[i] = y_train[i][:-1]
for i in range(0, len(y_train)):
    if y_train[i] == "Goal" or y_train[i] == "Cmpl" or y_train[i] == "Refl":
        y_train[i] = "GCR"

for i in range(0, len(y_val)):
    y_val[i] = y_val[i][:-1]
for i in range(0, len(y_val)):
    if y_val[i] == 'Goal' or y_val[i] == 'Cmpl' or y_val[i] == "Refl":
        y_val[i] = 'GCR'
for i in range(0, len(y_test)):
    y_test[i] = y_test[i][:-1]
for i in range(0, len(y_test)):
    if y_test[i] == 'Goal' or y_test[i] == 'Cmpl' or y_test[i] == "Refl":
        y_test[i] = 'GCR'

In [7]:
y_train

array(['GCR', 'GCR', 'GCR', 'Vert', 'GCR', 'GCR', 'Vert', 'Vert', 'GCR',
       'Vert', 'GCR', 'Vert', 'GCR', 'GCR', 'GCR', 'GCR', 'GCR', 'GCR',
       'GCR', 'GCR', 'GCR', 'GCR', 'Vert', 'GCR', 'Vert', 'Vert', 'GCR',
       'GCR', 'Vert', 'Vert', 'GCR', 'Vert', 'Vert', 'Vert', 'GCR',
       'Vert', 'Vert', 'GCR', 'Vert', 'GCR', 'Vert', 'Vert', 'Vert',
       'GCR', 'Vert', 'GCR', 'GCR', 'GCR', 'Vert', 'GCR', 'Vert', 'GCR',
       'GCR', 'GCR', 'GCR', 'Vert', 'Vert', 'GCR', 'GCR', 'GCR'],
      dtype='<U5')

## Setting up particle features 

In [5]:
particle_features_df=pd.read_excel('particle_features.xlsx')

In [6]:
particle_features={}
for index, row in particle_features_df.iterrows():
    particle_features[particle_features_df.iloc[index]['verb']]=particle_features_df.loc[index,['about', 'along', 'around', 'back', 'by', 'down', 'in', 'off',
                          'on', 'out', 'over', 'round', 'through', 'under', 'up', 'up_0', 'up_1',
                          'up_2', 'up_3', 'up_4', 'up_5']].to_numpy(dtype='float32')
particle_features['fire']

array([0.        , 0.00512821, 0.00512821, 0.14358975, 0.        ,
       0.02051282, 0.03076923, 0.04615385, 0.15384616, 0.08205128,
       0.00512821, 0.00512821, 0.01025641, 0.        , 0.4923077 ,
       0.84375   , 0.10416666, 0.02083333, 0.        , 0.01041667,
       0.02083333], dtype=float32)

In [6]:
combined_features_V={}
for k in particle_features.keys():
    modified_key= str(k)+'_V'
    combined_features_V[modified_key]=particle_features[k]

In [7]:
combined_features_VPC={}
for k in particle_features.keys():
    modified_key= str(k)+'_VPC'
    combined_features_VPC[modified_key]=particle_features[k]

## Getting the word embeddings

### training word2vec on BNC corpus 

#### CBOW

In [8]:
X_train_list = []
# for l in d0:
for l in X_train:
    X_train_list_item = []
    for t in l:
        t = str(t)+'_VPC'
        X_train_list_item.append(t)
    X_train_list.append(X_train_list_item)
# X_val_list = []
# # for l in d0:
# for l in X_val:
#     X_val_list_item = []
#     for t in l:
#         t = str(t)+'_VPC'
#         X_val_list_item.append(t)
#     X_val_list.append(X_val_list_item)
X_val_list = []
# for l in d0:
for l in X_test:
    X_val_list_item = []
    for t in l:
        t = str(t)+'_VPC'
        X_val_list_item.append(t)
    X_val_list.append(X_val_list_item)

X_train_tagged_VPC = np.array(X_train_list)
X_val_tagged_VPC = np.array(X_val_list)

X_train_list = []
# for l in d0:
for l in X_train:
    X_train_list_item = []
    for t in l:
        t = str(t)+'_V'
        X_train_list_item.append(t)
    X_train_list.append(X_train_list_item)
# X_val_list = []
# # for l in d0:
# for l in X_val:
#     X_val_list_item = []
#     for t in l:
#         t = str(t)+'_V'
#         X_val_list_item.append(t)
#     X_val_list.append(X_val_list_item)
X_val_list = []
# for l in d0:
for l in X_test:
    X_val_list_item = []
    for t in l:
        t = str(t)+'_V'
        X_val_list_item.append(t)
    X_val_list.append(X_val_list_item)


X_train_tagged_V = np.array(X_train_list)
X_val_tagged_V = np.array(X_val_list)

In [9]:
model_BNC_sentences = Word2Vec.load("model_BNC_sentences.model")
model_BNC_tagged_sentences_particle_cw = Word2Vec.load(
    "model_BNC_tagged_sentences_particle_cw_v2.model")
model_BNC_tagged_sentences_verb_cw = Word2Vec.load(
    "model_BNC_tagged_sentences_verb_cw_v2.model")
model_BNC_tagged_sentences_particle_up_cw = Word2Vec.load(
    "model_BNC_tagged_sentences_particle_up_cw.model")

2020-04-19 12:55:21,595 : INFO : loading Word2Vec object from model_BNC_sentences.model
2020-04-19 12:55:22,014 : INFO : loading wv recursively from model_BNC_sentences.model.wv.* with mmap=None
2020-04-19 12:55:22,015 : INFO : loading vectors from model_BNC_sentences.model.wv.vectors.npy with mmap=None
2020-04-19 12:55:22,437 : INFO : setting ignored attribute vectors_norm to None
2020-04-19 12:55:22,438 : INFO : loading vocabulary recursively from model_BNC_sentences.model.vocabulary.* with mmap=None
2020-04-19 12:55:22,438 : INFO : loading trainables recursively from model_BNC_sentences.model.trainables.* with mmap=None
2020-04-19 12:55:22,439 : INFO : loading syn1neg from model_BNC_sentences.model.trainables.syn1neg.npy with mmap=None
2020-04-19 12:55:22,883 : INFO : setting ignored attribute cum_table to None
2020-04-19 12:55:22,884 : INFO : loaded model_BNC_sentences.model
2020-04-19 12:55:23,319 : INFO : loading Word2Vec object from model_BNC_tagged_sentences_particle_cw_v2.mode

In [10]:
# def transform_with_combine_features_cw(x):
#     return np.array([
#         np.concatenate(([model_BNC_sentences[w]
#                  for w in words if w in model_BNC_sentences]or [np.zeros(model_BNC_sentences['balance'].shape)],
#                        [particle_features[w]
#                  for w in words if w in particle_features]or [np.zeros(particle_features['balance'].shape)]),
#                 axis=None) for words in x
#     ]) 
# X_train_combined_cw=transform_with_combine_features_cw(X_train)
# X_val_combined_cw=transform_with_combine_features_cw(X_val)

# def transform_with_combine_features_cw_VPC(x):
#     return np.array([
#         np.concatenate(([model_BNC_tagged_sentences_particle_cw[w]
#                  for w in words if w in model_BNC_tagged_sentences_particle_cw]or [np.zeros(model_BNC_tagged_sentences_particle_cw['balance_VPC'].shape)],
#                        [combined_features_VPC[w]
#                  for w in words if w in combined_features_VPC]or [np.zeros(combined_features_VPC['balance_VPC'].shape)]),
#                 axis=None) for words in x
#     ]) 
# X_train_combined_VPC_cw=transform_with_combine_features_cw_VPC(X_train_tagged_VPC)
# X_val_combined_VPC_cw=transform_with_combine_features_cw_VPC(X_val_tagged_VPC)

# def transform_with_combine_features_cw_VPC_up(x):
#     return np.array([
#         np.concatenate(([model_BNC_tagged_sentences_particle_up_cw[w]
#                  for w in words if w in model_BNC_tagged_sentences_particle_up_cw]or [np.zeros(model_BNC_tagged_sentences_particle_up_cw['balance_VPC'].shape)],
#                        [combined_features_VPC[w]
#                  for w in words if w in combined_features_VPC]or [np.zeros(combined_features_VPC['balance_VPC'].shape)]),
#                 axis=None) for words in x
#     ]) 
# X_train_combined_VPC_cw_up=transform_with_combine_features_cw_VPC_up(X_train_tagged_VPC)
# X_val_combined_VPC_cw_up=transform_with_combine_features_cw_VPC_up(X_val_tagged_VPC)

# def transform_with_combine_features_cw_V(x):
#     return np.array([
#         np.concatenate(([model_BNC_tagged_sentences_verb_cw[w]
#                  for w in words if w in model_BNC_tagged_sentences_verb_cw]or [np.zeros(model_BNC_tagged_sentences_verb_cw['balance_V'].shape)],
#                        [combined_features_V[w]
#                  for w in words if w in combined_features_V]or [np.zeros(combined_features_V['balance_V'].shape)]),
#                 axis=None) for words in x
#     ]) 
# X_train_combined_V_cw=transform_with_combine_features_cw_V(X_train_tagged_V)
# X_val_combined_V_cw=transform_with_combine_features_cw_V(X_val_tagged_V)



In [11]:
def transform_with_BNC(x):
    return np.array([
        np.mean([model_BNC_sentences[w]
                 for w in words if w in model_BNC_sentences]or [np.zeros(model_BNC_sentences['balance'].shape)],
                axis=0) for words in x
    ])
X_train_updated_BNC = transform_with_BNC(X_train)
# X_val_updated_BNC = transform_with_BNC(X_val)
X_val_updated_BNC = transform_with_BNC(X_test)

def transform_with_BNC_v(x):
    return np.array([
        np.mean([model_BNC_tagged_sentences_verb_cw[w]
                 for w in words if w in model_BNC_tagged_sentences_verb_cw]or [np.zeros(model_BNC_tagged_sentences_verb_cw['balance_V'].shape)],
                axis=0) for words in x
    ])
X_train_updated_BNC_V = transform_with_BNC_v(X_train_tagged_V)
X_val_updated_BNC_V = transform_with_BNC_v(X_val_tagged_V)

def transform_with_BNC_up(x):
    return np.array([
        np.mean([model_BNC_tagged_sentences_particle_up_cw[w]
                 for w in words if w in model_BNC_tagged_sentences_particle_up_cw]or [np.zeros(model_BNC_tagged_sentences_particle_up_cw['balance_VPC'].shape)],
                axis=0) for words in x
    ])
X_train_updated_BNC_VPC_up = transform_with_BNC_up(X_train_tagged_VPC)
X_val_updated_BNC_VPC_up = transform_with_BNC_up(X_val_tagged_VPC)

def transform_with_BNC_vpc(x):
    return np.array([
        np.mean([model_BNC_tagged_sentences_particle_cw[w]
                 for w in words if w in model_BNC_tagged_sentences_particle_cw]or [np.zeros(model_BNC_tagged_sentences_particle_cw['balance_VPC'].shape)],
                axis=0) for words in x
    ])

X_train_updated_BNC_VPC = transform_with_BNC_vpc(X_train_tagged_VPC)
X_val_updated_BNC_VPC = transform_with_BNC_vpc(X_val_tagged_VPC)

def transform_with_particle_features(x):
    return np.array([
        np.mean([particle_features[w]
                 for w in words if w in particle_features]or [np.zeros(particle_features['balance'].shape)],
                axis=0) for words in x
    ])
X_train_particle_features = transform_with_particle_features(X_train)
# X_val_particle_features = transform_with_particle_features(X_val)
X_val_particle_features = transform_with_particle_features(X_test)

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  """
  
  
  from ipykernel import kernelapp as app


In [12]:
X_train_combined_cw=np.concatenate((X_train_updated_BNC, X_train_particle_features), axis=1)
X_val_combined_cw=np.concatenate((X_val_updated_BNC, X_val_particle_features), axis=1)


X_train_combined_VPC_cw=np.concatenate((X_train_updated_BNC_VPC, X_train_particle_features), axis=1)
X_val_combined_VPC_cw=np.concatenate((X_val_updated_BNC_VPC, X_val_particle_features), axis=1)


X_train_combined_VPC_cw_up=np.concatenate((X_train_updated_BNC_VPC_up, X_train_particle_features), axis=1)
X_val_combined_VPC_cw_up=np.concatenate((X_val_updated_BNC_VPC_up, X_val_particle_features), axis=1)


X_train_combined_V_cw=np.concatenate((X_train_updated_BNC_V, X_train_particle_features), axis=1)
X_val_combined_V_cw=np.concatenate((X_val_updated_BNC_V, X_val_particle_features), axis=1)

In [13]:
X_train_combined_cw900D=np.concatenate((X_train_updated_BNC_V, X_train_updated_BNC_VPC,X_train_updated_BNC_VPC_up), axis=1)
X_val_combined_cw900D=np.concatenate((X_val_updated_BNC_V, X_val_updated_BNC_VPC,X_val_updated_BNC_VPC_up), axis=1)

In [14]:
X_train_combined_cw921D=np.concatenate((X_train_updated_BNC_V, X_train_updated_BNC_VPC,X_train_updated_BNC_VPC_up,X_train_particle_features), axis=1)
X_val_combined_cw921D=np.concatenate((X_val_updated_BNC_V, X_val_updated_BNC_VPC,X_val_updated_BNC_VPC_up,X_val_particle_features), axis=1)

#### Skip-gram

In [15]:
# X_train_list = []
# # for l in d0:
# for l in X_train:
#     X_train_list_item = []
#     for t in l:
#         t = str(t)+'_VPC'
#         X_train_list_item.append(t)
#     X_train_list.append(X_train_list_item)
# X_val_list = []
# # for l in d0:
# for l in X_val:
#     X_val_list_item = []
#     for t in l:
#         t = str(t)+'_VPC'
#         X_val_list_item.append(t)
#     X_val_list.append(X_val_list_item)


# X_train_tagged_VPC = np.array(X_train_list)
# X_val_tagged_VPC = np.array(X_val_list)

# X_train_list = []
# # for l in d0:
# for l in X_train:
#     X_train_list_item = []
#     for t in l:
#         t = str(t)+'_V'
#         X_train_list_item.append(t)
#     X_train_list.append(X_train_list_item)
# X_val_list = []
# # for l in d0:
# for l in X_val:
#     X_val_list_item = []
#     for t in l:
#         t = str(t)+'_V'
#         X_val_list_item.append(t)
#     X_val_list.append(X_val_list_item)


# X_train_tagged_V = np.array(X_train_list)
# X_val_tagged_V = np.array(X_val_list)

In [16]:
model_BNC_sentences_sg = Word2Vec.load("model_BNC_sentences_sg.model")
model_BNC_tagged_sentences_particle_sg = Word2Vec.load("model_BNC_tagged_sentences_particle_sg_v2.model")
model_BNC_tagged_sentences_particle_up_sg = Word2Vec.load("model_BNC_tagged_sentences_particle_up_sg.model")
model_BNC_tagged_sentences_verb_sg = Word2Vec.load("model_BNC_tagged_sentences_verb_sg_v2.model")

2020-04-19 12:55:28,657 : INFO : loading Word2Vec object from model_BNC_sentences_sg.model
2020-04-19 12:55:29,179 : INFO : loading wv recursively from model_BNC_sentences_sg.model.wv.* with mmap=None
2020-04-19 12:55:29,180 : INFO : loading vectors from model_BNC_sentences_sg.model.wv.vectors.npy with mmap=None
2020-04-19 12:55:29,610 : INFO : setting ignored attribute vectors_norm to None
2020-04-19 12:55:29,611 : INFO : loading vocabulary recursively from model_BNC_sentences_sg.model.vocabulary.* with mmap=None
2020-04-19 12:55:29,612 : INFO : loading trainables recursively from model_BNC_sentences_sg.model.trainables.* with mmap=None
2020-04-19 12:55:29,613 : INFO : loading syn1neg from model_BNC_sentences_sg.model.trainables.syn1neg.npy with mmap=None
2020-04-19 12:55:30,084 : INFO : setting ignored attribute cum_table to None
2020-04-19 12:55:30,085 : INFO : loaded model_BNC_sentences_sg.model
2020-04-19 12:55:30,525 : INFO : loading Word2Vec object from model_BNC_tagged_sentence

In [17]:
# def transform_with_combine_features_sg(x):
#     return np.array([
#         np.concatenate(([model_BNC_sentences_sg[w]
#                  for w in words if w in model_BNC_sentences_sg]or [np.zeros(model_BNC_sentences_sg['balance'].shape)],
#                        [particle_features[w]
#                  for w in words if w in particle_features]or [np.zeros(particle_features['balance'].shape)]),
#                 axis=None) for words in x
#     ]) 
# X_train_combined_sg=transform_with_combine_features_sg(X_train)
# X_val_combined_sg=transform_with_combine_features_sg(X_val)

# def transform_with_combine_features_sg_VPC(x):
#     return np.array([
#         np.concatenate(([model_BNC_tagged_sentences_particle_sg[w]
#                  for w in words if w in model_BNC_tagged_sentences_particle_sg]or [np.zeros(model_BNC_tagged_sentences_particle_sg['balance_VPC'].shape)],
#                        [combined_features_VPC[w]
#                  for w in words if w in combined_features_VPC]or [np.zeros(combined_features_VPC['balance_VPC'].shape)]),
#                 axis=None) for words in x
#     ]) 
# X_train_combined_VPC_sg=transform_with_combine_features_sg_VPC(X_train_tagged_VPC)
# X_val_combined_VPC_sg=transform_with_combine_features_sg_VPC(X_val_tagged_VPC)

# def transform_with_combine_features_sg_VPC_up(x):
#     return np.array([
#         np.concatenate(([model_BNC_tagged_sentences_particle_up_sg[w]
#                  for w in words if w in model_BNC_tagged_sentences_particle_up_sg]or [np.zeros(model_BNC_tagged_sentences_particle_up_sg['balance_VPC'].shape)],
#                        [combined_features_VPC[w]
#                  for w in words if w in combined_features_VPC]or [np.zeros(combined_features_VPC['balance_VPC'].shape)]),
#                 axis=None) for words in x
#     ]) 
# X_train_combined_VPC_sg_up=transform_with_combine_features_sg_VPC_up(X_train_tagged_VPC)
# X_val_combined_VPC_sg_up=transform_with_combine_features_sg_VPC_up(X_val_tagged_VPC)

# def transform_with_combine_features_sg_V(x):
#     return np.array([
#         np.concatenate(([model_BNC_tagged_sentences_verb_sg[w]
#                  for w in words if w in model_BNC_tagged_sentences_verb_sg]or [np.zeros(model_BNC_tagged_sentences_verb_sg['balance_V'].shape)],
#                        [combined_features_V[w]
#                  for w in words if w in combined_features_V]or [np.zeros(combined_features_V['balance_V'].shape)]),
#                 axis=None) for words in x
#     ]) 
# X_train_combined_V_sg=transform_with_combine_features_sg_V(X_train_tagged_V)
# X_val_combined_V_sg=transform_with_combine_features_sg_V(X_val_tagged_V)
def transform_with_BNC(x):
    return np.array([
        np.mean([model_BNC_sentences_sg[w]
                 for w in words if w in model_BNC_sentences_sg]or [np.zeros(model_BNC_sentences_sg['balance'].shape)],
                axis=0) for words in x
    ])
X_train_updated_BNC = transform_with_BNC(X_train)
# X_val_updated_BNC = transform_with_BNC(X_val)
X_val_updated_BNC = transform_with_BNC(X_test)

def transform_with_BNC_v(x):
    return np.array([
        np.mean([model_BNC_tagged_sentences_verb_sg[w]
                 for w in words if w in model_BNC_tagged_sentences_verb_sg]or [np.zeros(model_BNC_tagged_sentences_verb_sg['balance_V'].shape)],
                axis=0) for words in x
    ])
X_train_updated_BNC_V = transform_with_BNC_v(X_train_tagged_V)
X_val_updated_BNC_V = transform_with_BNC_v(X_val_tagged_V)

def transform_with_BNC_up(x):
    return np.array([
        np.mean([model_BNC_tagged_sentences_particle_up_sg[w]
                 for w in words if w in model_BNC_tagged_sentences_particle_up_sg]or [np.zeros(model_BNC_tagged_sentences_particle_up_sg['balance_VPC'].shape)],
                axis=0) for words in x
    ])
X_train_updated_BNC_VPC_up = transform_with_BNC_up(X_train_tagged_VPC)
X_val_updated_BNC_VPC_up = transform_with_BNC_up(X_val_tagged_VPC)

def transform_with_BNC_vpc(x):
    return np.array([
        np.mean([model_BNC_tagged_sentences_particle_sg[w]
                 for w in words if w in model_BNC_tagged_sentences_particle_sg]or [np.zeros(model_BNC_tagged_sentences_particle_sg['balance_VPC'].shape)],
                axis=0) for words in x
    ])

X_train_updated_BNC_VPC = transform_with_BNC_vpc(X_train_tagged_VPC)
X_val_updated_BNC_VPC = transform_with_BNC_vpc(X_val_tagged_VPC)

def transform_with_particle_features(x):
    return np.array([
        np.mean([particle_features[w]
                 for w in words if w in particle_features]or [np.zeros(particle_features['balance'].shape)],
                axis=0) for words in x
    ])
X_train_particle_features = transform_with_particle_features(X_train)
# X_val_particle_features = transform_with_particle_features(X_val)
X_val_particle_features = transform_with_particle_features(X_test)



In [18]:
X_train_combined_sg=np.concatenate((X_train_updated_BNC, X_train_particle_features), axis=1)
X_val_combined_sg=np.concatenate((X_val_updated_BNC, X_val_particle_features), axis=1)

 
X_train_combined_VPC_sg=np.concatenate((X_train_updated_BNC_VPC, X_train_particle_features), axis=1)
X_val_combined_VPC_sg=np.concatenate((X_val_updated_BNC_VPC, X_val_particle_features), axis=1)


X_train_combined_VPC_sg_up=np.concatenate((X_train_updated_BNC_VPC_up, X_train_particle_features), axis=1)
X_val_combined_VPC_sg_up=np.concatenate((X_val_updated_BNC_VPC_up, X_val_particle_features), axis=1)

X_train_combined_V_sg=np.concatenate((X_train_updated_BNC_V, X_train_particle_features), axis=1)
X_val_combined_V_sg=np.concatenate((X_val_updated_BNC_V, X_val_particle_features), axis=1)

In [19]:
X_train_combined_sg900D=np.concatenate((X_train_updated_BNC_V, X_train_updated_BNC_VPC,X_train_updated_BNC_VPC_up), axis=1)
X_val_combined_sg900D=np.concatenate((X_val_updated_BNC_V, X_val_updated_BNC_VPC,X_val_updated_BNC_VPC_up), axis=1)

In [20]:
X_train_combined_sg921D=np.concatenate((X_train_updated_BNC_V, X_train_updated_BNC_VPC,X_train_updated_BNC_VPC_up,X_train_particle_features), axis=1)
X_val_combined_sg921D=np.concatenate((X_val_updated_BNC_V, X_val_updated_BNC_VPC,X_val_updated_BNC_VPC_up,X_val_particle_features), axis=1)

In [21]:
len(X_val_combined_sg921D[1])

921

### google (300D) pre-trained word embedding 

In [22]:
model_google = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True)

2020-04-19 12:55:36,224 : INFO : loading projection weights from GoogleNews-vectors-negative300.bin
2020-04-19 12:56:58,182 : INFO : loaded (3000000, 300) matrix from GoogleNews-vectors-negative300.bin


### glove pre-trained word embedding

In [23]:
GLOVE_6B_50D_PATH = "glove.6B.50d.txt"
GLOVE_840B_300D_PATH = "glove.840B.300d.txt"
encoding = "utf-8"

In [24]:
import numpy as np
with open(GLOVE_6B_50D_PATH, "rb") as lines:
    wvec = {
        line.split()[0].decode(encoding): np.array(line.split()[1:],
                                                   dtype=np.float32)
        for line in lines
    }

In [25]:
# reading glove files, this may take a while
# we're reading line by line and only saving vectors
# that correspond to words from our training set
# if you wan't to play around with the vectors and have
# enough RAM - remove the 'if' line and load everything

import struct

glove_small = {}
all_words = set(w for words in X for w in words)
with open(GLOVE_6B_50D_PATH, "rb") as infile:
    for line in infile:
        parts = line.split()
        word = parts[0].decode(encoding)
        if (word in all_words):
            nums = np.array(parts[1:], dtype=np.float32)
            glove_small[word] = nums

glove_big = {}
with open(GLOVE_840B_300D_PATH, "rb") as infile:
    for line in infile:
        parts = line.split()
        word = parts[0].decode(encoding)
        if word in all_words:
            nums = np.array(parts[1:], dtype=np.float32)
            glove_big[word] = nums

## Classification

In [26]:
my_tags = ['GCR', 'Vert']

### Training with BNC word embedding (CBOW)

#### General form

##### Linear SVM

In [27]:
clf = SVC(kernel='linear')
clf.fit(X_train_combined_cw, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [28]:
y_pred = clf.predict(X_val_combined_cw)

In [29]:
print("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100)

SVM Accuracy Score ->  48.333333333333336


In [30]:
from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred, target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.44      0.59      0.51        27
        Vert       0.54      0.39      0.46        33

    accuracy                           0.48        60
   macro avg       0.49      0.49      0.48        60
weighted avg       0.50      0.48      0.48        60



##### Weighted Linear SVM

In [31]:
clf_balanced_class_weight = SVC(kernel='linear', class_weight='balanced')
clf_balanced_class_weight.fit(X_train_combined_cw, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [32]:
y_pred_balanced_class_weight = clf_balanced_class_weight.predict(
    X_val_combined_cw)

In [33]:
print("SVM Accuracy Score -> ",
      accuracy_score(y_pred_balanced_class_weight, y_val) * 100)

SVM Accuracy Score ->  48.333333333333336


In [34]:
from sklearn.metrics import classification_report
print(
    classification_report(y_val,
                          y_pred_balanced_class_weight,
                          target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.44      0.59      0.51        27
        Vert       0.54      0.39      0.46        33

    accuracy                           0.48        60
   macro avg       0.49      0.49      0.48        60
weighted avg       0.50      0.48      0.48        60



##### SVC with 'rbf' kernel

In [35]:
X = X_train_combined_cw
Y = y_train

In [36]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [37]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [38]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 2.0, 'gamma': 0.001953125} with a score of 0.62


In [39]:
clf = grid.best_estimator_
clf.fit(X, Y)

SVC(C=2.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001953125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [40]:
y_pred = clf.predict(X_val_combined_cw)

In [41]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 50.0)


In [42]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.47      0.85      0.61        27
        Vert       0.64      0.21      0.32        33

    accuracy                           0.50        60
   macro avg       0.55      0.53      0.46        60
weighted avg       0.56      0.50      0.45        60



##### Weighted SVC with 'rbf' kernel 

In [43]:
X = X_train_combined_cw
Y = y_train

In [44]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [45]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [46]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 2.0, 'gamma': 0.001953125} with a score of 0.62


In [47]:
clf = SVC(C=32.0, class_weight='balanced', gamma=0.03125, kernel='rbf')
clf.fit(X, Y)

SVC(C=32.0, break_ties=False, cache_size=200, class_weight='balanced',
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.03125,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [48]:
y_pred = clf.predict(X_val_combined_cw)

In [49]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 45.0)


In [50]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.45      1.00      0.62        27
        Vert       0.00      0.00      0.00        33

    accuracy                           0.45        60
   macro avg       0.23      0.50      0.31        60
weighted avg       0.20      0.45      0.28        60



  _warn_prf(average, modifier, msg_start, len(result))


##### Logistice regression

In [51]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_combined_cw, y_train)
y_pred = logreg.predict(X_val_combined_cw)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.48333333333333334
              precision    recall  f1-score   support

         GCR       0.44      0.59      0.51        27
        Vert       0.54      0.39      0.46        33

    accuracy                           0.48        60
   macro avg       0.49      0.49      0.48        60
weighted avg       0.50      0.48      0.48        60



##### Weighted logistice regression

In [52]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5, class_weight='balanced')
logreg = logreg.fit(X_train_combined_cw, y_train)
y_pred = logreg.predict(X_val_combined_cw)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.48333333333333334
              precision    recall  f1-score   support

         GCR       0.44      0.59      0.51        27
        Vert       0.54      0.39      0.46        33

    accuracy                           0.48        60
   macro avg       0.49      0.49      0.48        60
weighted avg       0.50      0.48      0.48        60



#### _V

##### Linear SVM

In [53]:
clf = SVC(kernel='linear')
clf.fit(X_train_combined_V_cw, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [54]:
y_pred = clf.predict(X_val_combined_V_cw)

In [55]:
print("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100)

SVM Accuracy Score ->  43.333333333333336


In [56]:
from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred, target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.40      0.52      0.45        27
        Vert       0.48      0.36      0.41        33

    accuracy                           0.43        60
   macro avg       0.44      0.44      0.43        60
weighted avg       0.44      0.43      0.43        60



##### Weighted Linear SVM

In [57]:
clf_balanced_class_weight = SVC(kernel='linear', class_weight='balanced')
clf_balanced_class_weight.fit(X_train_combined_V_cw, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [58]:
y_pred_balanced_class_weight = clf_balanced_class_weight.predict(
    X_val_combined_V_cw)

In [59]:
print("SVM Accuracy Score -> ",
      accuracy_score(y_pred_balanced_class_weight, y_val) * 100)

SVM Accuracy Score ->  43.333333333333336


In [60]:
from sklearn.metrics import classification_report
print(
    classification_report(y_val,
                          y_pred_balanced_class_weight,
                          target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.40      0.52      0.45        27
        Vert       0.48      0.36      0.41        33

    accuracy                           0.43        60
   macro avg       0.44      0.44      0.43        60
weighted avg       0.44      0.43      0.43        60



##### SVC with 'rbf' kernel

In [61]:
X = X_train_combined_V_cw
Y = y_train

In [62]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [63]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [64]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=32768.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=7.62939453125e-06,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)


In [65]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 32768.0, 'gamma': 7.62939453125e-06} with a score of 0.67


In [66]:
clf = grid.best_estimator_
clf.fit(X, Y)

SVC(C=32768.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=7.62939453125e-06,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [67]:
y_pred = clf.predict(X_val_combined_V_cw)

In [68]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 43.333333333333336)


In [69]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.40      0.52      0.45        27
        Vert       0.48      0.36      0.41        33

    accuracy                           0.43        60
   macro avg       0.44      0.44      0.43        60
weighted avg       0.44      0.43      0.43        60



##### Weighted SVC with 'rbf' kernel 

In [70]:
X = X_train_combined_V_cw
Y = y_train

In [71]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [72]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [73]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [74]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=32768.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=7.62939453125e-06,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)


In [75]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 32768.0, 'gamma': 7.62939453125e-06} with a score of 0.67


In [76]:
clf = SVC(C=32.0, class_weight='balanced', gamma=0.03125, kernel='rbf')
clf.fit(X, Y)

SVC(C=32.0, break_ties=False, cache_size=200, class_weight='balanced',
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.03125,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [77]:
y_pred = clf.predict(X_val_combined_V_cw)

In [78]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 46.666666666666664)


In [79]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.46      1.00      0.63        27
        Vert       1.00      0.03      0.06        33

    accuracy                           0.47        60
   macro avg       0.73      0.52      0.34        60
weighted avg       0.76      0.47      0.31        60



##### Logistice regression

In [80]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_combined_V_cw, y_train)
y_pred = logreg.predict(X_val_combined_V_cw)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.48333333333333334
              precision    recall  f1-score   support

         GCR       0.42      0.41      0.42        27
        Vert       0.53      0.55      0.54        33

    accuracy                           0.48        60
   macro avg       0.48      0.48      0.48        60
weighted avg       0.48      0.48      0.48        60



##### Weighted logistice regression

In [81]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5, class_weight='balanced')
logreg = logreg.fit(X_train_combined_V_cw, y_train)
y_pred = logreg.predict(X_val_combined_V_cw)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.4666666666666667
              precision    recall  f1-score   support

         GCR       0.41      0.41      0.41        27
        Vert       0.52      0.52      0.52        33

    accuracy                           0.47        60
   macro avg       0.46      0.46      0.46        60
weighted avg       0.47      0.47      0.47        60



#### _VPC

##### Linear SVM

In [82]:
clf = SVC(kernel='linear')
clf.fit(X_train_combined_VPC_cw, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [83]:
y_pred = clf.predict(X_val_combined_VPC_cw)

In [84]:
print("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100)

SVM Accuracy Score ->  53.333333333333336


In [85]:
from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred, target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.49      0.70      0.58        27
        Vert       0.62      0.39      0.48        33

    accuracy                           0.53        60
   macro avg       0.55      0.55      0.53        60
weighted avg       0.56      0.53      0.52        60



##### Weighted Linear SVM

In [86]:
clf_balanced_class_weight = SVC(kernel='linear', class_weight='balanced')
clf_balanced_class_weight.fit(X_train_combined_VPC_cw, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [87]:
y_pred_balanced_class_weight = clf_balanced_class_weight.predict(
    X_val_combined_VPC_cw)

In [88]:
print("SVM Accuracy Score -> ",
      accuracy_score(y_pred_balanced_class_weight, y_val) * 100)

SVM Accuracy Score ->  51.66666666666667


In [89]:
from sklearn.metrics import classification_report
print(
    classification_report(y_val,
                          y_pred_balanced_class_weight,
                          target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.47      0.70      0.57        27
        Vert       0.60      0.36      0.45        33

    accuracy                           0.52        60
   macro avg       0.54      0.53      0.51        60
weighted avg       0.54      0.52      0.50        60



##### SVC with 'rbf' kernel

In [90]:
X = X_train_combined_VPC_cw
Y = y_train

In [91]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [92]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [93]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [94]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=512.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001953125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [95]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 512.0, 'gamma': 0.001953125} with a score of 0.77


In [96]:
clf = grid.best_estimator_
clf.fit(X, Y)

SVC(C=512.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001953125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [97]:
y_pred = clf.predict(X_val_combined_VPC_cw)

In [98]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 55.00000000000001)


In [99]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.50      0.67      0.57        27
        Vert       0.62      0.45      0.53        33

    accuracy                           0.55        60
   macro avg       0.56      0.56      0.55        60
weighted avg       0.57      0.55      0.55        60



##### Weighted SVC with 'rbf' kernel 

In [100]:
X = X_train_combined_VPC_cw
Y = y_train

In [101]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [102]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [103]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [104]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=512.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001953125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [105]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 512.0, 'gamma': 0.001953125} with a score of 0.77


In [106]:
clf = SVC(C=32.0, class_weight='balanced', gamma=0.03125, kernel='rbf')
clf.fit(X, Y)

SVC(C=32.0, break_ties=False, cache_size=200, class_weight='balanced',
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.03125,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [107]:
y_pred = clf.predict(X_val_combined_VPC_cw)

In [108]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 60.0)


In [109]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.55      0.59      0.57        27
        Vert       0.65      0.61      0.62        33

    accuracy                           0.60        60
   macro avg       0.60      0.60      0.60        60
weighted avg       0.60      0.60      0.60        60



##### Logistice regression

In [110]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_combined_VPC_cw, y_train)
y_pred = logreg.predict(X_val_combined_VPC_cw)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.48333333333333334
              precision    recall  f1-score   support

         GCR       0.45      0.63      0.52        27
        Vert       0.55      0.36      0.44        33

    accuracy                           0.48        60
   macro avg       0.50      0.50      0.48        60
weighted avg       0.50      0.48      0.48        60



##### Weighted logistice regression

In [111]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5, class_weight='balanced')
logreg = logreg.fit(X_train_combined_VPC_cw, y_train)
y_pred = logreg.predict(X_val_combined_VPC_cw)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.5166666666666667
              precision    recall  f1-score   support

         GCR       0.47      0.63      0.54        27
        Vert       0.58      0.42      0.49        33

    accuracy                           0.52        60
   macro avg       0.53      0.53      0.52        60
weighted avg       0.53      0.52      0.51        60



#### _VPC (up)

##### Linear SVM

In [112]:
clf = SVC(kernel='linear')
clf.fit(X_train_combined_VPC_cw_up, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [113]:
y_pred = clf.predict(X_val_combined_VPC_cw_up)

In [114]:
print("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100)

SVM Accuracy Score ->  53.333333333333336


In [115]:
from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred, target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.49      0.74      0.59        27
        Vert       0.63      0.36      0.46        33

    accuracy                           0.53        60
   macro avg       0.56      0.55      0.52        60
weighted avg       0.57      0.53      0.52        60



##### Weighted Linear SVM

In [116]:
clf_balanced_class_weight = SVC(kernel='linear', class_weight='balanced')
clf_balanced_class_weight.fit(X_train_combined_VPC_cw_up, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [117]:
y_pred_balanced_class_weight = clf_balanced_class_weight.predict(
    X_val_combined_VPC_cw_up)

In [118]:
print("SVM Accuracy Score -> ",
      accuracy_score(y_pred_balanced_class_weight, y_val) * 100)

SVM Accuracy Score ->  53.333333333333336


In [119]:
from sklearn.metrics import classification_report
print(
    classification_report(y_val,
                          y_pred_balanced_class_weight,
                          target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.49      0.74      0.59        27
        Vert       0.63      0.36      0.46        33

    accuracy                           0.53        60
   macro avg       0.56      0.55      0.52        60
weighted avg       0.57      0.53      0.52        60



##### SVC with 'rbf' kernel

In [120]:
X = X_train_combined_VPC_cw_up
Y = y_train

In [121]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [122]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [123]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [124]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=2.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [125]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 2.0, 'gamma': 0.125} with a score of 0.72


In [126]:
clf = grid.best_estimator_
clf.fit(X, Y)

SVC(C=2.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [127]:
y_pred = clf.predict(X_val_combined_VPC_cw_up)

In [128]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 55.00000000000001)


In [129]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.50      0.67      0.57        27
        Vert       0.62      0.45      0.53        33

    accuracy                           0.55        60
   macro avg       0.56      0.56      0.55        60
weighted avg       0.57      0.55      0.55        60



##### Weighted SVC with 'rbf' kernel 

In [130]:
X = X_train_combined_VPC_cw_up
Y = y_train

In [131]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [132]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [133]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [134]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=2.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [135]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 2.0, 'gamma': 0.125} with a score of 0.72


In [136]:
clf = SVC(C=32.0, class_weight='balanced', gamma=0.03125, kernel='rbf')
clf.fit(X, Y)

SVC(C=32.0, break_ties=False, cache_size=200, class_weight='balanced',
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.03125,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [137]:
y_pred = clf.predict(X_val_combined_VPC_cw_up)

In [138]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 58.333333333333336)


In [139]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.53      0.67      0.59        27
        Vert       0.65      0.52      0.58        33

    accuracy                           0.58        60
   macro avg       0.59      0.59      0.58        60
weighted avg       0.60      0.58      0.58        60



##### Logistice regression

In [140]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_combined_VPC_cw_up, y_train)
y_pred = logreg.predict(X_val_combined_VPC_cw_up)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.5666666666666667
              precision    recall  f1-score   support

         GCR       0.51      0.78      0.62        27
        Vert       0.68      0.39      0.50        33

    accuracy                           0.57        60
   macro avg       0.60      0.59      0.56        60
weighted avg       0.61      0.57      0.55        60



##### Weighted logistice regression

In [141]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5, class_weight='balanced')
logreg = logreg.fit(X_train_combined_VPC_cw_up, y_train)
y_pred = logreg.predict(X_val_combined_VPC_cw_up)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.5833333333333334
              precision    recall  f1-score   support

         GCR       0.52      0.81      0.64        27
        Vert       0.72      0.39      0.51        33

    accuracy                           0.58        60
   macro avg       0.62      0.60      0.57        60
weighted avg       0.63      0.58      0.57        60



#### 900D combined

##### Linear SVM

In [142]:
clf = SVC(kernel='linear')
clf.fit(X_train_combined_cw900D, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [143]:
y_pred = clf.predict(X_val_combined_cw900D)

In [144]:
print("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100)

SVM Accuracy Score ->  50.0


In [145]:
from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred, target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.45      0.52      0.48        27
        Vert       0.55      0.48      0.52        33

    accuracy                           0.50        60
   macro avg       0.50      0.50      0.50        60
weighted avg       0.51      0.50      0.50        60



##### Weighted Linear SVM

In [146]:
clf_balanced_class_weight = SVC(kernel='linear', class_weight='balanced')
clf_balanced_class_weight.fit(X_train_combined_cw900D, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [147]:
y_pred_balanced_class_weight = clf_balanced_class_weight.predict(
    X_val_combined_cw900D)

In [148]:
print("SVM Accuracy Score -> ",
      accuracy_score(y_pred_balanced_class_weight, y_val) * 100)

SVM Accuracy Score ->  50.0


In [149]:
from sklearn.metrics import classification_report
print(
    classification_report(y_val,
                          y_pred_balanced_class_weight,
                          target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.45      0.52      0.48        27
        Vert       0.55      0.48      0.52        33

    accuracy                           0.50        60
   macro avg       0.50      0.50      0.50        60
weighted avg       0.51      0.50      0.50        60



##### SVC with 'rbf' kernel

In [150]:
X = X_train_combined_cw900D
Y = y_train

In [151]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [152]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [153]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [154]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=128.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001220703125,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)


In [155]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 128.0, 'gamma': 0.0001220703125} with a score of 0.65


In [156]:
clf = grid.best_estimator_
clf.fit(X, Y)

SVC(C=128.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001220703125,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [157]:
y_pred = clf.predict(X_val_combined_cw900D)

In [158]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 51.66666666666667)


In [159]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.47      0.56      0.51        27
        Vert       0.57      0.48      0.52        33

    accuracy                           0.52        60
   macro avg       0.52      0.52      0.52        60
weighted avg       0.53      0.52      0.52        60



##### Weighted SVC with 'rbf' kernel 

In [160]:
X = X_train_combined_cw900D
Y = y_train

In [161]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [162]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [163]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [164]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=128.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001220703125,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)


In [165]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 128.0, 'gamma': 0.0001220703125} with a score of 0.65


In [166]:
clf = SVC(C=32.0, class_weight='balanced', gamma=0.03125, kernel='rbf')
clf.fit(X, Y)

SVC(C=32.0, break_ties=False, cache_size=200, class_weight='balanced',
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.03125,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [167]:
y_pred = clf.predict(X_val_combined_cw900D)

In [168]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 45.0)


In [169]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.45      1.00      0.62        27
        Vert       0.00      0.00      0.00        33

    accuracy                           0.45        60
   macro avg       0.23      0.50      0.31        60
weighted avg       0.20      0.45      0.28        60



  _warn_prf(average, modifier, msg_start, len(result))


##### Logistice regression

In [170]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_combined_cw900D, y_train)
y_pred = logreg.predict(X_val_combined_cw900D)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.4666666666666667
              precision    recall  f1-score   support

         GCR       0.41      0.44      0.43        27
        Vert       0.52      0.48      0.50        33

    accuracy                           0.47        60
   macro avg       0.46      0.46      0.46        60
weighted avg       0.47      0.47      0.47        60



##### Weighted logistice regression

In [171]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5, class_weight='balanced')
logreg = logreg.fit(X_train_combined_cw900D, y_train)
y_pred = logreg.predict(X_val_combined_cw900D)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.5
              precision    recall  f1-score   support

         GCR       0.45      0.48      0.46        27
        Vert       0.55      0.52      0.53        33

    accuracy                           0.50        60
   macro avg       0.50      0.50      0.50        60
weighted avg       0.50      0.50      0.50        60



#### 921D combined

##### Linear SVM

In [172]:
clf = SVC(kernel='linear')
clf.fit(X_train_combined_cw921D, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [173]:
y_pred = clf.predict(X_val_combined_cw921D)

In [174]:
print("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100)

SVM Accuracy Score ->  50.0


In [175]:
from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred, target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.45      0.52      0.48        27
        Vert       0.55      0.48      0.52        33

    accuracy                           0.50        60
   macro avg       0.50      0.50      0.50        60
weighted avg       0.51      0.50      0.50        60



##### Weighted Linear SVM

In [176]:
clf_balanced_class_weight = SVC(kernel='linear', class_weight='balanced')
clf_balanced_class_weight.fit(X_train_combined_cw921D, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [177]:
y_pred_balanced_class_weight = clf_balanced_class_weight.predict(
    X_val_combined_cw921D)

In [178]:
print("SVM Accuracy Score -> ",
      accuracy_score(y_pred_balanced_class_weight, y_val) * 100)

SVM Accuracy Score ->  50.0


In [179]:
from sklearn.metrics import classification_report
print(
    classification_report(y_val,
                          y_pred_balanced_class_weight,
                          target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.45      0.52      0.48        27
        Vert       0.55      0.48      0.52        33

    accuracy                           0.50        60
   macro avg       0.50      0.50      0.50        60
weighted avg       0.51      0.50      0.50        60



##### SVC with 'rbf' kernel

In [180]:
X = X_train_combined_cw921D
Y = y_train

In [181]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [182]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [183]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [184]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=128.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001220703125,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)


In [185]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 128.0, 'gamma': 0.0001220703125} with a score of 0.65


In [186]:
clf = grid.best_estimator_
clf.fit(X, Y)

SVC(C=128.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001220703125,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [187]:
y_pred = clf.predict(X_val_combined_cw921D)

In [188]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 51.66666666666667)


In [189]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.47      0.56      0.51        27
        Vert       0.57      0.48      0.52        33

    accuracy                           0.52        60
   macro avg       0.52      0.52      0.52        60
weighted avg       0.53      0.52      0.52        60



##### Weighted SVC with 'rbf' kernel 

In [190]:
X = X_train_combined_cw921D
Y = y_train

In [191]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [192]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [193]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [194]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=128.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001220703125,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)


In [195]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 128.0, 'gamma': 0.0001220703125} with a score of 0.65


In [196]:
clf = SVC(C=32.0, class_weight='balanced', gamma=0.03125, kernel='rbf')
clf.fit(X, Y)

SVC(C=32.0, break_ties=False, cache_size=200, class_weight='balanced',
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.03125,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [197]:
y_pred = clf.predict(X_val_combined_cw921D)

In [198]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 45.0)


In [199]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.45      1.00      0.62        27
        Vert       0.00      0.00      0.00        33

    accuracy                           0.45        60
   macro avg       0.23      0.50      0.31        60
weighted avg       0.20      0.45      0.28        60



  _warn_prf(average, modifier, msg_start, len(result))


##### Logistice regression

In [200]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_combined_cw921D, y_train)
y_pred = logreg.predict(X_val_combined_cw921D)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.4666666666666667
              precision    recall  f1-score   support

         GCR       0.41      0.44      0.43        27
        Vert       0.52      0.48      0.50        33

    accuracy                           0.47        60
   macro avg       0.46      0.46      0.46        60
weighted avg       0.47      0.47      0.47        60



##### Weighted logistice regression

In [201]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5, class_weight='balanced')
logreg = logreg.fit(X_train_combined_cw921D, y_train)
y_pred = logreg.predict(X_val_combined_cw921D)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.5
              precision    recall  f1-score   support

         GCR       0.45      0.48      0.46        27
        Vert       0.55      0.52      0.53        33

    accuracy                           0.50        60
   macro avg       0.50      0.50      0.50        60
weighted avg       0.50      0.50      0.50        60



### Training with BNC word embedding (SkipGram)

#### General form 

##### Linear SVM

In [202]:
clf = SVC(kernel='linear')
clf.fit(X_train_combined_sg, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [203]:
y_pred = clf.predict(X_val_combined_sg)

In [204]:
print("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100)

SVM Accuracy Score ->  46.666666666666664


In [205]:
from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred, target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.44      0.63      0.52        27
        Vert       0.52      0.33      0.41        33

    accuracy                           0.47        60
   macro avg       0.48      0.48      0.46        60
weighted avg       0.48      0.47      0.46        60



##### Weighted Linear SVM

In [206]:
clf_balanced_class_weight = SVC(kernel='linear', class_weight='balanced')
clf_balanced_class_weight.fit(X_train_combined_sg, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [207]:
y_pred_balanced_class_weight = clf_balanced_class_weight.predict(
    X_val_combined_sg)

In [208]:
print("SVM Accuracy Score -> ",
      accuracy_score(y_pred_balanced_class_weight, y_val) * 100)

SVM Accuracy Score ->  46.666666666666664


In [209]:
from sklearn.metrics import classification_report
print(
    classification_report(y_val,
                          y_pred_balanced_class_weight,
                          target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.44      0.63      0.52        27
        Vert       0.52      0.33      0.41        33

    accuracy                           0.47        60
   macro avg       0.48      0.48      0.46        60
weighted avg       0.48      0.47      0.46        60



##### SVC with 'rbf' kernel

In [210]:
X = X_train_combined_sg
Y = y_train

In [211]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [212]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [213]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [214]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=2.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.03125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [215]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 2.0, 'gamma': 0.03125} with a score of 0.65


In [216]:
clf = grid.best_estimator_
clf.fit(X, Y)

SVC(C=2.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.03125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [217]:
y_pred = clf.predict(X_val_combined_sg)

In [218]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 46.666666666666664)


In [219]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.44      0.67      0.53        27
        Vert       0.53      0.30      0.38        33

    accuracy                           0.47        60
   macro avg       0.48      0.48      0.46        60
weighted avg       0.49      0.47      0.45        60



##### Weighted SVC with 'rbf' kernel 

In [220]:
X = X_train_combined_sg
Y = y_train

In [221]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [222]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [223]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [224]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=2.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.03125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [225]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 2.0, 'gamma': 0.03125} with a score of 0.65


In [226]:
clf = SVC(C=32.0, class_weight='balanced', gamma=0.03125, kernel='rbf')
clf.fit(X, Y)

SVC(C=32.0, break_ties=False, cache_size=200, class_weight='balanced',
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.03125,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [227]:
y_pred = clf.predict(X_val_combined_sg)

In [228]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 46.666666666666664)


In [229]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.44      0.70      0.54        27
        Vert       0.53      0.27      0.36        33

    accuracy                           0.47        60
   macro avg       0.49      0.49      0.45        60
weighted avg       0.49      0.47      0.44        60



##### Logistice regression

In [230]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_combined_sg, y_train)
y_pred = logreg.predict(X_val_combined_sg)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.48333333333333334
              precision    recall  f1-score   support

         GCR       0.45      0.67      0.54        27
        Vert       0.55      0.33      0.42        33

    accuracy                           0.48        60
   macro avg       0.50      0.50      0.48        60
weighted avg       0.51      0.48      0.47        60



##### Weighted logistice regression

In [231]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5, class_weight='balanced')
logreg = logreg.fit(X_train_combined_sg, y_train)
y_pred = logreg.predict(X_val_combined_sg)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.45
              precision    recall  f1-score   support

         GCR       0.42      0.63      0.51        27
        Vert       0.50      0.30      0.38        33

    accuracy                           0.45        60
   macro avg       0.46      0.47      0.44        60
weighted avg       0.47      0.45      0.44        60



#### _V

##### Linear SVM

In [232]:
clf = SVC(kernel='linear')
clf.fit(X_train_combined_V_sg, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [233]:
y_pred = clf.predict(X_val_combined_V_sg)

In [234]:
print("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100)

SVM Accuracy Score ->  48.333333333333336


In [235]:
from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred, target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.43      0.44      0.44        27
        Vert       0.53      0.52      0.52        33

    accuracy                           0.48        60
   macro avg       0.48      0.48      0.48        60
weighted avg       0.49      0.48      0.48        60



##### Weighted Linear SVM

In [236]:
clf_balanced_class_weight = SVC(kernel='linear', class_weight='balanced')
clf_balanced_class_weight.fit(X_train_combined_V_sg, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [237]:
y_pred_balanced_class_weight = clf_balanced_class_weight.predict(
    X_val_combined_V_sg)

In [238]:
print("SVM Accuracy Score -> ",
      accuracy_score(y_pred_balanced_class_weight, y_val) * 100)

SVM Accuracy Score ->  48.333333333333336


In [239]:
from sklearn.metrics import classification_report
print(
    classification_report(y_val,
                          y_pred_balanced_class_weight,
                          target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.43      0.44      0.44        27
        Vert       0.53      0.52      0.52        33

    accuracy                           0.48        60
   macro avg       0.48      0.48      0.48        60
weighted avg       0.49      0.48      0.48        60



##### SVC with 'rbf' kernel

In [240]:
X = X_train_combined_V_sg
Y = y_train

In [241]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [242]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [243]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [244]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=2.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [245]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 2.0, 'gamma': 0.125} with a score of 0.62


In [246]:
clf = grid.best_estimator_
clf.fit(X, Y)

SVC(C=2.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [247]:
y_pred = clf.predict(X_val_combined_V_sg)

In [248]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 41.66666666666667)


In [249]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.42      0.78      0.55        27
        Vert       0.40      0.12      0.19        33

    accuracy                           0.42        60
   macro avg       0.41      0.45      0.37        60
weighted avg       0.41      0.42      0.35        60



##### Weighted SVC with 'rbf' kernel 

In [250]:
X = X_train_combined_V_sg
Y = y_train

In [251]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [252]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [253]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [254]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=2.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [255]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 2.0, 'gamma': 0.125} with a score of 0.62


In [256]:
clf = SVC(C=32.0, class_weight='balanced', gamma=0.03125, kernel='rbf')
clf.fit(X, Y)

SVC(C=32.0, break_ties=False, cache_size=200, class_weight='balanced',
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.03125,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [257]:
y_pred = clf.predict(X_val_combined_V_sg)

In [258]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 50.0)


In [259]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.45      0.52      0.48        27
        Vert       0.55      0.48      0.52        33

    accuracy                           0.50        60
   macro avg       0.50      0.50      0.50        60
weighted avg       0.51      0.50      0.50        60



##### Logistice regression

In [260]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_combined_V_sg, y_train)
y_pred = logreg.predict(X_val_combined_V_sg)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.5
              precision    recall  f1-score   support

         GCR       0.45      0.48      0.46        27
        Vert       0.55      0.52      0.53        33

    accuracy                           0.50        60
   macro avg       0.50      0.50      0.50        60
weighted avg       0.50      0.50      0.50        60



##### Weighted logistice regression

In [261]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5, class_weight='balanced')
logreg = logreg.fit(X_train_combined_V_sg, y_train)
y_pred = logreg.predict(X_val_combined_V_sg)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.5
              precision    recall  f1-score   support

         GCR       0.45      0.52      0.48        27
        Vert       0.55      0.48      0.52        33

    accuracy                           0.50        60
   macro avg       0.50      0.50      0.50        60
weighted avg       0.51      0.50      0.50        60



#### _VPC

##### Linear SVM

In [262]:
clf = SVC(kernel='linear')
clf.fit(X_train_combined_VPC_sg, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [263]:
y_pred = clf.predict(X_val_combined_VPC_sg)

In [264]:
print("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100)

SVM Accuracy Score ->  51.66666666666667


In [265]:
from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred, target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.48      0.78      0.59        27
        Vert       0.62      0.30      0.41        33

    accuracy                           0.52        60
   macro avg       0.55      0.54      0.50        60
weighted avg       0.56      0.52      0.49        60



##### Weighted Linear SVM

In [266]:
clf_balanced_class_weight = SVC(kernel='linear', class_weight='balanced')
clf_balanced_class_weight.fit(X_train_combined_VPC_sg, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [267]:
y_pred_balanced_class_weight = clf_balanced_class_weight.predict(
    X_val_combined_VPC_sg)

In [268]:
print("SVM Accuracy Score -> ",
      accuracy_score(y_pred_balanced_class_weight, y_val) * 100)

SVM Accuracy Score ->  53.333333333333336


In [269]:
from sklearn.metrics import classification_report
print(
    classification_report(y_val,
                          y_pred_balanced_class_weight,
                          target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.49      0.74      0.59        27
        Vert       0.63      0.36      0.46        33

    accuracy                           0.53        60
   macro avg       0.56      0.55      0.52        60
weighted avg       0.57      0.53      0.52        60



##### SVC with 'rbf' kernel

In [270]:
X = X_train_combined_VPC_sg
Y = y_train

In [271]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [272]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [273]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [274]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=8.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0078125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [275]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 8.0, 'gamma': 0.0078125} with a score of 0.63


In [276]:
clf = grid.best_estimator_
clf.fit(X, Y)

SVC(C=8.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0078125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [277]:
y_pred = clf.predict(X_val_combined_VPC_sg)

In [278]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 46.666666666666664)


In [279]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.45      0.89      0.60        27
        Vert       0.57      0.12      0.20        33

    accuracy                           0.47        60
   macro avg       0.51      0.51      0.40        60
weighted avg       0.52      0.47      0.38        60



##### Weighted SVC with 'rbf' kernel 

In [280]:
X = X_train_combined_VPC_sg
Y = y_train

In [281]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [282]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [283]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [284]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=8.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0078125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [285]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 8.0, 'gamma': 0.0078125} with a score of 0.63


In [286]:
clf = SVC(C=32.0, class_weight='balanced', gamma=0.03125, kernel='rbf')
clf.fit(X, Y)

SVC(C=32.0, break_ties=False, cache_size=200, class_weight='balanced',
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.03125,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [287]:
y_pred = clf.predict(X_val_combined_VPC_sg)

In [288]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 55.00000000000001)


In [289]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.50      0.59      0.54        27
        Vert       0.61      0.52      0.56        33

    accuracy                           0.55        60
   macro avg       0.55      0.55      0.55        60
weighted avg       0.56      0.55      0.55        60



##### Logistice regression

In [290]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_combined_VPC_sg, y_train)
y_pred = logreg.predict(X_val_combined_VPC_sg)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.55
              precision    recall  f1-score   support

         GCR       0.50      0.67      0.57        27
        Vert       0.62      0.45      0.53        33

    accuracy                           0.55        60
   macro avg       0.56      0.56      0.55        60
weighted avg       0.57      0.55      0.55        60



##### Weighted logistice regression

In [291]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5, class_weight='balanced')
logreg = logreg.fit(X_train_combined_VPC_sg, y_train)
y_pred = logreg.predict(X_val_combined_VPC_sg)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.55
              precision    recall  f1-score   support

         GCR       0.50      0.67      0.57        27
        Vert       0.62      0.45      0.53        33

    accuracy                           0.55        60
   macro avg       0.56      0.56      0.55        60
weighted avg       0.57      0.55      0.55        60



#### _VPC (up)

##### Linear SVM

In [292]:
clf = SVC(kernel='linear')
clf.fit(X_train_combined_VPC_sg_up, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [293]:
y_pred = clf.predict(X_val_combined_VPC_sg_up)

In [294]:
print("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100)

SVM Accuracy Score ->  55.00000000000001


In [295]:
from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred, target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.50      0.70      0.58        27
        Vert       0.64      0.42      0.51        33

    accuracy                           0.55        60
   macro avg       0.57      0.56      0.55        60
weighted avg       0.57      0.55      0.54        60



##### Weighted Linear SVM

In [296]:
clf_balanced_class_weight = SVC(kernel='linear', class_weight='balanced')
clf_balanced_class_weight.fit(X_train_combined_VPC_sg_up, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [297]:
y_pred_balanced_class_weight = clf_balanced_class_weight.predict(
    X_val_combined_VPC_sg_up)

In [298]:
print("SVM Accuracy Score -> ",
      accuracy_score(y_pred_balanced_class_weight, y_val) * 100)

SVM Accuracy Score ->  53.333333333333336


In [299]:
from sklearn.metrics import classification_report
print(
    classification_report(y_val,
                          y_pred_balanced_class_weight,
                          target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.49      0.70      0.58        27
        Vert       0.62      0.39      0.48        33

    accuracy                           0.53        60
   macro avg       0.55      0.55      0.53        60
weighted avg       0.56      0.53      0.52        60



##### SVC with 'rbf' kernel

In [300]:
X = X_train_combined_VPC_sg_up
Y = y_train

In [301]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [302]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [303]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [304]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=128.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.00048828125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [305]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 128.0, 'gamma': 0.00048828125} with a score of 0.67


In [306]:
clf = grid.best_estimator_
clf.fit(X, Y)

SVC(C=128.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.00048828125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [307]:
y_pred = clf.predict(X_val_combined_VPC_sg_up)

In [308]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 56.666666666666664)


In [309]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.51      0.93      0.66        27
        Vert       0.82      0.27      0.41        33

    accuracy                           0.57        60
   macro avg       0.66      0.60      0.53        60
weighted avg       0.68      0.57      0.52        60



##### Weighted SVC with 'rbf' kernel 

In [310]:
X = X_train_combined_VPC_sg_up
Y = y_train

In [311]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [312]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [313]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [314]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=128.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.00048828125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [315]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 128.0, 'gamma': 0.00048828125} with a score of 0.67


In [316]:
clf = SVC(C=32.0, class_weight='balanced', gamma=0.03125, kernel='rbf')
clf.fit(X, Y)

SVC(C=32.0, break_ties=False, cache_size=200, class_weight='balanced',
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.03125,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [317]:
y_pred = clf.predict(X_val_combined_VPC_sg_up)

In [318]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 50.0)


In [319]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.46      0.67      0.55        27
        Vert       0.57      0.36      0.44        33

    accuracy                           0.50        60
   macro avg       0.52      0.52      0.49        60
weighted avg       0.52      0.50      0.49        60



##### Logistice regression

In [320]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_combined_VPC_sg_up, y_train)
y_pred = logreg.predict(X_val_combined_VPC_sg_up)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.5333333333333333
              precision    recall  f1-score   support

         GCR       0.49      0.67      0.56        27
        Vert       0.61      0.42      0.50        33

    accuracy                           0.53        60
   macro avg       0.55      0.55      0.53        60
weighted avg       0.55      0.53      0.53        60



##### Weighted logistice regression

In [321]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5, class_weight='balanced')
logreg = logreg.fit(X_train_combined_VPC_sg_up, y_train)
y_pred = logreg.predict(X_val_combined_VPC_sg_up)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.55
              precision    recall  f1-score   support

         GCR       0.50      0.70      0.58        27
        Vert       0.64      0.42      0.51        33

    accuracy                           0.55        60
   macro avg       0.57      0.56      0.55        60
weighted avg       0.57      0.55      0.54        60



### 900D combined 

##### Linear SVM

In [322]:
clf = SVC(kernel='linear')
clf.fit(X_train_combined_sg900D, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [323]:
y_pred = clf.predict(X_val_combined_sg900D)

In [324]:
print("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100)

SVM Accuracy Score ->  46.666666666666664


In [325]:
from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred, target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.42      0.52      0.47        27
        Vert       0.52      0.42      0.47        33

    accuracy                           0.47        60
   macro avg       0.47      0.47      0.47        60
weighted avg       0.48      0.47      0.47        60



##### Weighted Linear SVM

In [326]:
clf_balanced_class_weight = SVC(kernel='linear', class_weight='balanced')
clf_balanced_class_weight.fit(X_train_combined_sg900D, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [327]:
y_pred_balanced_class_weight = clf_balanced_class_weight.predict(
    X_val_combined_sg900D)

In [328]:
print("SVM Accuracy Score -> ",
      accuracy_score(y_pred_balanced_class_weight, y_val) * 100)

SVM Accuracy Score ->  46.666666666666664


In [329]:
from sklearn.metrics import classification_report
print(
    classification_report(y_val,
                          y_pred_balanced_class_weight,
                          target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.42      0.52      0.47        27
        Vert       0.52      0.42      0.47        33

    accuracy                           0.47        60
   macro avg       0.47      0.47      0.47        60
weighted avg       0.48      0.47      0.47        60



##### SVC with 'rbf' kernel

In [330]:
X = X_train_combined_sg900D
Y = y_train

In [331]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [332]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [333]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [334]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=32.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001953125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [335]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 32.0, 'gamma': 0.001953125} with a score of 0.65


In [336]:
clf = grid.best_estimator_
clf.fit(X, Y)

SVC(C=32.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001953125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [337]:
y_pred = clf.predict(X_val_combined_sg900D)

In [338]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 46.666666666666664)


In [339]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.42      0.52      0.47        27
        Vert       0.52      0.42      0.47        33

    accuracy                           0.47        60
   macro avg       0.47      0.47      0.47        60
weighted avg       0.48      0.47      0.47        60



##### Weighted SVC with 'rbf' kernel 

In [340]:
X = X_train_combined_sg900D
Y = y_train

In [341]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [342]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [343]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [344]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=32.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001953125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [345]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 32.0, 'gamma': 0.001953125} with a score of 0.65


In [346]:
clf = SVC(C=32.0, class_weight='balanced', gamma=0.03125, kernel='rbf')
clf.fit(X, Y)

SVC(C=32.0, break_ties=False, cache_size=200, class_weight='balanced',
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.03125,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [347]:
y_pred = clf.predict(X_val_combined_sg900D)

In [348]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 48.333333333333336)


In [349]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.44      0.52      0.47        27
        Vert       0.54      0.45      0.49        33

    accuracy                           0.48        60
   macro avg       0.49      0.49      0.48        60
weighted avg       0.49      0.48      0.48        60



##### Logistice regression

In [350]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_combined_sg900D, y_train)
y_pred = logreg.predict(X_val_combined_sg900D)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.4666666666666667
              precision    recall  f1-score   support

         GCR       0.42      0.52      0.47        27
        Vert       0.52      0.42      0.47        33

    accuracy                           0.47        60
   macro avg       0.47      0.47      0.47        60
weighted avg       0.48      0.47      0.47        60



##### Weighted logistice regression

In [351]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5, class_weight='balanced')
logreg = logreg.fit(X_train_combined_sg900D, y_train)
y_pred = logreg.predict(X_val_combined_sg900D)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.48333333333333334
              precision    recall  f1-score   support

         GCR       0.44      0.52      0.47        27
        Vert       0.54      0.45      0.49        33

    accuracy                           0.48        60
   macro avg       0.49      0.49      0.48        60
weighted avg       0.49      0.48      0.48        60



### 921D combined 

##### Linear SVM

In [352]:
clf = SVC(kernel='linear')
clf.fit(X_train_combined_sg921D, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [353]:
y_pred = clf.predict(X_val_combined_sg921D)

In [354]:
print("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100)

SVM Accuracy Score ->  46.666666666666664


In [355]:
from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred, target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.42      0.52      0.47        27
        Vert       0.52      0.42      0.47        33

    accuracy                           0.47        60
   macro avg       0.47      0.47      0.47        60
weighted avg       0.48      0.47      0.47        60



##### Weighted Linear SVM

In [356]:
clf_balanced_class_weight = SVC(kernel='linear', class_weight='balanced')
clf_balanced_class_weight.fit(X_train_combined_sg921D, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [357]:
y_pred_balanced_class_weight = clf_balanced_class_weight.predict(
    X_val_combined_sg921D)

In [358]:
print("SVM Accuracy Score -> ",
      accuracy_score(y_pred_balanced_class_weight, y_val) * 100)

SVM Accuracy Score ->  46.666666666666664


In [359]:
from sklearn.metrics import classification_report
print(
    classification_report(y_val,
                          y_pred_balanced_class_weight,
                          target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.42      0.52      0.47        27
        Vert       0.52      0.42      0.47        33

    accuracy                           0.47        60
   macro avg       0.47      0.47      0.47        60
weighted avg       0.48      0.47      0.47        60



##### SVC with 'rbf' kernel

In [360]:
X = X_train_combined_sg921D
Y = y_train

In [361]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [362]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [363]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [364]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=8.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0078125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [365]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 8.0, 'gamma': 0.0078125} with a score of 0.65


In [366]:
clf = grid.best_estimator_
clf.fit(X, Y)

SVC(C=8.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0078125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [367]:
y_pred = clf.predict(X_val_combined_sg921D)

In [368]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 48.333333333333336)


In [369]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.44      0.52      0.47        27
        Vert       0.54      0.45      0.49        33

    accuracy                           0.48        60
   macro avg       0.49      0.49      0.48        60
weighted avg       0.49      0.48      0.48        60



##### Weighted SVC with 'rbf' kernel 

In [370]:
X = X_train_combined_sg921D
Y = y_train

In [371]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [372]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [373]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [374]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=8.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0078125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [375]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 8.0, 'gamma': 0.0078125} with a score of 0.65


In [376]:
clf = SVC(C=32.0, class_weight='balanced', gamma=0.03125, kernel='rbf')
clf.fit(X, Y)

SVC(C=32.0, break_ties=False, cache_size=200, class_weight='balanced',
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.03125,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [377]:
y_pred = clf.predict(X_val_combined_sg921D)

In [378]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 48.333333333333336)


In [379]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.44      0.56      0.49        27
        Vert       0.54      0.42      0.47        33

    accuracy                           0.48        60
   macro avg       0.49      0.49      0.48        60
weighted avg       0.49      0.48      0.48        60



##### Logistice regression

In [380]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_combined_sg921D, y_train)
y_pred = logreg.predict(X_val_combined_sg921D)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.4666666666666667
              precision    recall  f1-score   support

         GCR       0.42      0.52      0.47        27
        Vert       0.52      0.42      0.47        33

    accuracy                           0.47        60
   macro avg       0.47      0.47      0.47        60
weighted avg       0.48      0.47      0.47        60



##### Weighted logistice regression

In [381]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5, class_weight='balanced')
logreg = logreg.fit(X_train_combined_sg921D, y_train)
y_pred = logreg.predict(X_val_combined_sg921D)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.5
              precision    recall  f1-score   support

         GCR       0.45      0.52      0.48        27
        Vert       0.55      0.48      0.52        33

    accuracy                           0.50        60
   macro avg       0.50      0.50      0.50        60
weighted avg       0.51      0.50      0.50        60



### training with google pre-trained word embedding 

In [382]:
# def transform_with_google(x):
#     return np.array([
#         np.mean([model_google[w]
#                  for w in words if w in model_google] or [np.zeros(model_google['balance'].shape)],
#                 axis=0) for words in x
#     ])
def transform_with_google(x):
    return np.array([
        np.concatenate(([model_google[w]
                 for w in words if w in model_google]or [np.zeros(model_google['balance'].shape)],
                       [particle_features[w]
                 for w in words if w in particle_features]or [np.zeros(particle_features['balance'].shape)]),
                axis=None) for words in x
    ]) 
# X_train_combined_cw=transform_with_combine_features_cw(X_train)
# X_val_combined_cw=transform_with_combine_features_cw(X_val)

In [383]:
X_train_updated_google = transform_with_google(X_train)
# X_val_updated_google = transform_with_google(X_val)
X_val_updated_google = transform_with_google(X_test)

#### Linear SVM

In [384]:
clf = SVC(kernel='linear')
clf.fit(X_train_updated_google, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [385]:
y_pred = clf.predict(X_val_updated_google)

In [386]:
print("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100)

SVM Accuracy Score ->  41.66666666666667


In [387]:
from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred, target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.39      0.52      0.44        27
        Vert       0.46      0.33      0.39        33

    accuracy                           0.42        60
   macro avg       0.42      0.43      0.42        60
weighted avg       0.43      0.42      0.41        60



#### Weighted Linear SVM

In [388]:
clf_balanced_class_weight = SVC(kernel='linear', class_weight='balanced')
clf_balanced_class_weight.fit(X_train_updated_google, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [389]:
y_pred_balanced_class_weight = clf_balanced_class_weight.predict(
    X_val_updated_google)

In [390]:
print("SVM Accuracy Score -> ",
      accuracy_score(y_pred_balanced_class_weight, y_val) * 100)

SVM Accuracy Score ->  41.66666666666667


In [391]:
from sklearn.metrics import classification_report
print(
    classification_report(y_val,
                          y_pred_balanced_class_weight,
                          target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.39      0.52      0.44        27
        Vert       0.46      0.33      0.39        33

    accuracy                           0.42        60
   macro avg       0.42      0.43      0.42        60
weighted avg       0.43      0.42      0.41        60



#### SVC with 'rbf' kernel

In [392]:
X = X_train_updated_google
Y = y_train

In [393]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [394]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [395]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [396]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=32.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001953125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [397]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 32.0, 'gamma': 0.001953125} with a score of 0.68


In [398]:
clf = grid.best_estimator_
clf.fit(X, Y)

SVC(C=32.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001953125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [399]:
y_pred = clf.predict(X_val_updated_google)

In [400]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 36.666666666666664)


In [401]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.39      0.70      0.50        27
        Vert       0.27      0.09      0.14        33

    accuracy                           0.37        60
   macro avg       0.33      0.40      0.32        60
weighted avg       0.32      0.37      0.30        60



#### Weighted SVC with 'rbf' kernel 

In [402]:
X = X_train_updated_google
Y = y_train

In [403]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [404]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [405]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [406]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=32.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001953125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [407]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 32.0, 'gamma': 0.001953125} with a score of 0.68


In [408]:
clf = SVC(C=32.0, class_weight='balanced', gamma=0.03125, kernel='rbf')
clf.fit(X, Y)

SVC(C=32.0, break_ties=False, cache_size=200, class_weight='balanced',
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.03125,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [409]:
y_pred = clf.predict(X_val_updated_google)

In [410]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 38.333333333333336)


In [411]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.38      0.56      0.45        27
        Vert       0.40      0.24      0.30        33

    accuracy                           0.38        60
   macro avg       0.39      0.40      0.37        60
weighted avg       0.39      0.38      0.37        60



#### Logistice regression

In [412]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_updated_google, y_train)
y_pred = logreg.predict(X_val_updated_google)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.38333333333333336
              precision    recall  f1-score   support

         GCR       0.37      0.52      0.43        27
        Vert       0.41      0.27      0.33        33

    accuracy                           0.38        60
   macro avg       0.39      0.40      0.38        60
weighted avg       0.39      0.38      0.37        60



#### Weighted logistice regression

In [413]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5, class_weight='balanced')
logreg = logreg.fit(X_train_updated_google, y_train)
y_pred = logreg.predict(X_val_updated_google)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.35
              precision    recall  f1-score   support

         GCR       0.34      0.48      0.40        27
        Vert       0.36      0.24      0.29        33

    accuracy                           0.35        60
   macro avg       0.35      0.36      0.35        60
weighted avg       0.35      0.35      0.34        60



### training with glove_small (50D) pre-trained word embedding 

In [414]:
def transform_with_glove_small(x):
    return np.array([
        np.mean([glove_small[w]
                 for w in words if w in glove_small] or [np.zeros(glove_small['balance'].shape)],
                axis=0) for words in x
    ])

In [415]:
X_train_updated_glove_small = transform_with_glove_small(X_train)
# X_val_updated_glove_small = transform_with_glove_small(X_val)
X_val_updated_glove_small = transform_with_glove_small(X_test)

#### Linear SVM

In [416]:
clf = SVC(kernel='linear')
clf.fit(X_train_updated_glove_small, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [417]:
y_pred = clf.predict(X_val_updated_glove_small)

In [418]:
print("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100)

SVM Accuracy Score ->  53.333333333333336


In [419]:
from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred, target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.49      0.74      0.59        27
        Vert       0.63      0.36      0.46        33

    accuracy                           0.53        60
   macro avg       0.56      0.55      0.52        60
weighted avg       0.57      0.53      0.52        60



#### Weighted Linear SVM

In [420]:
clf_balanced_class_weight = SVC(kernel='linear', class_weight='balanced')
clf_balanced_class_weight.fit(X_train_updated_glove_small, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [421]:
y_pred_balanced_class_weight = clf_balanced_class_weight.predict(
    X_val_updated_glove_small)

In [422]:
print("SVM Accuracy Score -> ",
      accuracy_score(y_pred_balanced_class_weight, y_val) * 100)

SVM Accuracy Score ->  53.333333333333336


In [423]:
from sklearn.metrics import classification_report
print(
    classification_report(y_val,
                          y_pred_balanced_class_weight,
                          target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.49      0.74      0.59        27
        Vert       0.63      0.36      0.46        33

    accuracy                           0.53        60
   macro avg       0.56      0.55      0.52        60
weighted avg       0.57      0.53      0.52        60



#### SVC with 'rbf' kernel

In [424]:
X = X_train_updated_glove_small
Y = y_train

In [425]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [426]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [427]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=2.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.5, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [428]:
clf = grid.best_estimator_
clf.fit(X, Y)

SVC(C=2.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.5, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [429]:
y_pred = clf.predict(X_val_updated_glove_small)

In [430]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 43.333333333333336)


In [431]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.44      0.96      0.60        27
        Vert       0.00      0.00      0.00        33

    accuracy                           0.43        60
   macro avg       0.22      0.48      0.30        60
weighted avg       0.20      0.43      0.27        60



#### Weighted SVC with 'rbf' kernel 

In [432]:
X = X_train_updated_glove_small
Y = y_train

In [433]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_glove_small)

In [434]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [435]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [436]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=2.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.5, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [437]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 2.0, 'gamma': 0.5} with a score of 0.65


In [438]:
clf = SVC(C=512.0, class_weight='balanced', gamma=0.00048828125, kernel='rbf')
clf.fit(X, Y)

SVC(C=512.0, break_ties=False, cache_size=200, class_weight='balanced',
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.00048828125,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [439]:
y_pred = clf.predict(X_val_updated_glove_small)

In [440]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 60.0)


In [441]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.54      0.70      0.61        27
        Vert       0.68      0.52      0.59        33

    accuracy                           0.60        60
   macro avg       0.61      0.61      0.60        60
weighted avg       0.62      0.60      0.60        60



#### Logistice regression

In [442]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_updated_glove_small, y_train)
y_pred = logreg.predict(X_val_updated_glove_small)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.4666666666666667
              precision    recall  f1-score   support

         GCR       0.44      0.63      0.52        27
        Vert       0.52      0.33      0.41        33

    accuracy                           0.47        60
   macro avg       0.48      0.48      0.46        60
weighted avg       0.48      0.47      0.46        60



#### Weighted logistice regression

In [443]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5, class_weight='balanced')
logreg = logreg.fit(X_train_updated_glove_small, y_train)
y_pred = logreg.predict(X_val_updated_glove_small)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.4666666666666667
              precision    recall  f1-score   support

         GCR       0.44      0.63      0.52        27
        Vert       0.52      0.33      0.41        33

    accuracy                           0.47        60
   macro avg       0.48      0.48      0.46        60
weighted avg       0.48      0.47      0.46        60



### training with glove_big (300D) pre-trained word embedding 

In [444]:
# def transform_with_glove_big(x):
#     return np.array([
#         np.mean([glove_big[w]
#                  for w in words if w in glove_big] or [np.zeros(glove_big['balance'].shape)],
#                 axis=0) for words in x
#     ])
def transform_with_glove_big(x):
    return np.array([
        np.concatenate(([glove_big[w]
                 for w in words if w in glove_big]or [np.zeros(glove_big['balance'].shape)],
                       [particle_features[w]
                 for w in words if w in particle_features]or [np.zeros(particle_features['balance'].shape)]),
                axis=None) for words in x
    ]) 

In [445]:
X_train_updated_glove_big = transform_with_glove_big(X_train)
# X_val_updated_glove_big = transform_with_glove_big(X_val)
X_val_updated_glove_big = transform_with_glove_big(X_test)

#### Linear SVM

In [446]:
clf = SVC(kernel='linear')
clf.fit(X_train_updated_glove_big, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [447]:
y_pred = clf.predict(X_val_updated_glove_big)

In [448]:
print("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100)

SVM Accuracy Score ->  41.66666666666667


In [449]:
from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred, target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.37      0.41      0.39        27
        Vert       0.47      0.42      0.44        33

    accuracy                           0.42        60
   macro avg       0.42      0.42      0.42        60
weighted avg       0.42      0.42      0.42        60



#### Weighted Linear SVM

In [450]:
clf_balanced_class_weight = SVC(kernel='linear', class_weight='balanced')
clf_balanced_class_weight.fit(X_train_updated_glove_big, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [451]:
y_pred_balanced_class_weight = clf_balanced_class_weight.predict(
    X_val_updated_glove_big)

In [452]:
print("SVM Accuracy Score -> ",
      accuracy_score(y_pred_balanced_class_weight, y_val) * 100)

SVM Accuracy Score ->  41.66666666666667


In [453]:
from sklearn.metrics import classification_report
print(
    classification_report(y_val,
                          y_pred_balanced_class_weight,
                          target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.37      0.41      0.39        27
        Vert       0.47      0.42      0.44        33

    accuracy                           0.42        60
   macro avg       0.42      0.42      0.42        60
weighted avg       0.42      0.42      0.42        60



#### SVC with 'rbf' kernel

In [454]:
X = X_train_updated_glove_big
Y = y_train

In [455]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [456]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [457]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=2.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.03125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [458]:
clf = grid.best_estimator_
clf.fit(X, Y)

SVC(C=2.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.03125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [459]:
y_pred = clf.predict(X_val_updated_glove_big)

In [460]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 38.333333333333336)


In [461]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.39      0.67      0.49        27
        Vert       0.36      0.15      0.21        33

    accuracy                           0.38        60
   macro avg       0.37      0.41      0.35        60
weighted avg       0.37      0.38      0.34        60



#### Weighted SVC with 'rbf' kernel 

In [462]:
X = X_train_updated_glove_big
Y = y_train

In [463]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X = X_train_updated_glove_big)

In [464]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [465]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [466]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=2.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.03125, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [467]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 2.0, 'gamma': 0.03125} with a score of 0.67


In [468]:
clf = SVC(C=32.0, class_weight='balanced', gamma=0.001953125, kernel='rbf')
clf.fit(X, Y)

SVC(C=32.0, break_ties=False, cache_size=200, class_weight='balanced',
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.001953125,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [469]:
y_pred = clf.predict(X_val_updated_glove_big)

In [470]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 43.333333333333336)


In [471]:
y_val

array(['GCR', 'GCR', 'GCR', 'Vert', 'Vert', 'Vert', 'GCR', 'GCR', 'Vert',
       'Vert', 'GCR', 'Vert', 'GCR', 'GCR', 'Vert', 'GCR', 'Vert', 'Vert',
       'GCR', 'Vert', 'GCR', 'Vert', 'Vert', 'Vert', 'Vert', 'Vert',
       'Vert', 'Vert', 'Vert', 'Vert', 'Vert', 'GCR', 'GCR', 'Vert',
       'GCR', 'GCR', 'Vert', 'GCR', 'Vert', 'Vert', 'GCR', 'GCR', 'Vert',
       'GCR', 'GCR', 'Vert', 'GCR', 'GCR', 'GCR', 'Vert', 'Vert', 'Vert',
       'GCR', 'Vert', 'Vert', 'GCR', 'Vert', 'Vert', 'GCR', 'GCR'],
      dtype='<U5')

In [472]:
y_pred

array(['Vert', 'Vert', 'GCR', 'GCR', 'GCR', 'Vert', 'Vert', 'GCR', 'Vert',
       'GCR', 'Vert', 'GCR', 'Vert', 'Vert', 'GCR', 'GCR', 'GCR', 'Vert',
       'Vert', 'GCR', 'GCR', 'GCR', 'GCR', 'GCR', 'GCR', 'GCR', 'Vert',
       'Vert', 'Vert', 'GCR', 'Vert', 'GCR', 'GCR', 'Vert', 'Vert', 'GCR',
       'GCR', 'Vert', 'Vert', 'Vert', 'GCR', 'Vert', 'GCR', 'Vert',
       'Vert', 'GCR', 'GCR', 'Vert', 'GCR', 'Vert', 'Vert', 'Vert', 'GCR',
       'GCR', 'Vert', 'Vert', 'Vert', 'GCR', 'Vert', 'Vert'], dtype='<U5')

In [473]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.38      0.41      0.39        27
        Vert       0.48      0.45      0.47        33

    accuracy                           0.43        60
   macro avg       0.43      0.43      0.43        60
weighted avg       0.44      0.43      0.43        60



#### Logistice regression

In [474]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_updated_glove_big, y_train)
y_pred = logreg.predict(X_val_updated_glove_big)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.43333333333333335
              precision    recall  f1-score   support

         GCR       0.38      0.41      0.39        27
        Vert       0.48      0.45      0.47        33

    accuracy                           0.43        60
   macro avg       0.43      0.43      0.43        60
weighted avg       0.44      0.43      0.43        60



#### Weighted logistice regression

In [475]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5, class_weight='balanced')
logreg = logreg.fit(X_train_updated_glove_big, y_train)
y_pred = logreg.predict(X_val_updated_glove_big)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.45
              precision    recall  f1-score   support

         GCR       0.41      0.52      0.46        27
        Vert       0.50      0.39      0.44        33

    accuracy                           0.45        60
   macro avg       0.46      0.46      0.45        60
weighted avg       0.46      0.45      0.45        60



### Particle features 

In [476]:
def transform_with_particle_features(x):
    return np.array([
        np.mean([particle_features[w]
                 for w in words if w in particle_features]or [np.zeros(particle_features['balance'].shape)],
                axis=0) for words in x
    ])

In [477]:
combined={}
for k in particle_features.keys():
    modified_key= str(k)+'_V'
    combined[modified_key]=particle_features[k]

In [478]:
def transform_with_combine_features(x):
    return np.array([
        np.concatenate(([model_BNC_tagged_sentences_verb_sg[w]
                 for w in words if w in model_BNC_tagged_sentences_verb_sg]or [np.zeros(model_BNC_tagged_sentences_verb_sg['balance_V'].shape)],
                       [combined[w]
                 for w in words if w in combined]or [np.zeros(combined['balance_V'].shape)]),
                axis=None) for words in x
    ]) 
X_train_combined=transform_with_combine_features(X_train)

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  import sys


In [479]:
X_train_particle_features = transform_with_particle_features(X_train)
# X_val_particle_features = transform_with_particle_features(X_val)
X_val_particle_features = transform_with_particle_features(X_test)

##### Linear SVM

In [480]:
clf = SVC(kernel='linear')
clf.fit(X_train_particle_features, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [481]:
y_pred = clf.predict(X_val_particle_features)

In [482]:
print("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100)

SVM Accuracy Score ->  45.0


In [483]:
from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred, target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.45      1.00      0.62        27
        Vert       0.00      0.00      0.00        33

    accuracy                           0.45        60
   macro avg       0.23      0.50      0.31        60
weighted avg       0.20      0.45      0.28        60



  _warn_prf(average, modifier, msg_start, len(result))


##### Weighted Linear SVM

In [484]:
clf_balanced_class_weight = SVC(kernel='linear', class_weight='balanced')
clf_balanced_class_weight.fit(X_train_particle_features, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [485]:
y_pred_balanced_class_weight = clf_balanced_class_weight.predict(
    X_val_particle_features)

In [486]:
print("SVM Accuracy Score -> ",
      accuracy_score(y_pred_balanced_class_weight, y_val) * 100)

SVM Accuracy Score ->  56.666666666666664


In [487]:
from sklearn.metrics import classification_report
print(
    classification_report(y_val,
                          y_pred_balanced_class_weight,
                          target_names=my_tags))

              precision    recall  f1-score   support

         GCR       0.53      0.30      0.38        27
        Vert       0.58      0.79      0.67        33

    accuracy                           0.57        60
   macro avg       0.56      0.54      0.52        60
weighted avg       0.56      0.57      0.54        60



##### SVC with 'rbf' kernel

In [488]:
X = X_train_particle_features
Y = y_train

In [489]:
X_train_particle_features

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.47540984, 0.        , 0.03278688, ..., 0.        , 0.        ,
        0.        ],
       [0.00479233, 0.        , 0.        , ..., 0.00906344, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.03209243, 0.01558775, 0.05226481, ..., 0.03063063, 0.01801802,
        0.01081081],
       [0.01234568, 0.03703704, 0.03703704, ..., 0.01724138, 0.01724138,
        0.        ]], dtype=float32)

In [490]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [491]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [492]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [493]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=32.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=2.0, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [494]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 32.0, 'gamma': 2.0} with a score of 0.73


In [495]:
clf = grid.best_estimator_
clf.fit(X, Y)

SVC(C=32.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=2.0, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [496]:
y_pred = clf.predict(X_val_particle_features)

In [497]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 53.333333333333336)


In [498]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.48      0.56      0.52        27
        Vert       0.59      0.52      0.55        33

    accuracy                           0.53        60
   macro avg       0.54      0.54      0.53        60
weighted avg       0.54      0.53      0.53        60



##### Weighted SVC with 'rbf' kernel 

In [499]:
X = X_train_particle_features
Y = y_train

In [500]:
# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the trainingset and
# just applying it on the test set.

# scaler = StandardScaler()
# X = scaler.fit_transform(X_train_updated_google)

In [501]:
C_range = 2. ** np.arange(-5, 18, 2)
gamma_range = 2. ** np.arange(-17, 4, 2)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=10))
grid.fit(X, Y)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs...
       3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
       3.27680e+04, 1.31072e+05]),
                         'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
       1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
       5.00000000e-01, 2.00000000e+00, 8.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=

In [502]:
param_grid

{'gamma': array([7.62939453e-06, 3.05175781e-05, 1.22070312e-04, 4.88281250e-04,
        1.95312500e-03, 7.81250000e-03, 3.12500000e-02, 1.25000000e-01,
        5.00000000e-01, 2.00000000e+00, 8.00000000e+00]),
 'C': array([3.12500e-02, 1.25000e-01, 5.00000e-01, 2.00000e+00, 8.00000e+00,
        3.20000e+01, 1.28000e+02, 5.12000e+02, 2.04800e+03, 8.19200e+03,
        3.27680e+04, 1.31072e+05])}

In [503]:
print("The best classifier is: ", grid.best_estimator_)

The best classifier is:  SVC(C=32.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=2.0, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [504]:
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 32.0, 'gamma': 2.0} with a score of 0.73


In [505]:
clf = SVC(C=32.0, class_weight='balanced', gamma=0.03125, kernel='rbf')
clf.fit(X, Y)

SVC(C=32.0, break_ties=False, cache_size=200, class_weight='balanced',
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.03125,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [506]:
y_pred = clf.predict(X_val_particle_features)

In [507]:
print(("SVM Accuracy Score -> ", accuracy_score(y_pred, y_val) * 100))

('SVM Accuracy Score -> ', 53.333333333333336)


In [508]:
y_val

array(['GCR', 'GCR', 'GCR', 'Vert', 'Vert', 'Vert', 'GCR', 'GCR', 'Vert',
       'Vert', 'GCR', 'Vert', 'GCR', 'GCR', 'Vert', 'GCR', 'Vert', 'Vert',
       'GCR', 'Vert', 'GCR', 'Vert', 'Vert', 'Vert', 'Vert', 'Vert',
       'Vert', 'Vert', 'Vert', 'Vert', 'Vert', 'GCR', 'GCR', 'Vert',
       'GCR', 'GCR', 'Vert', 'GCR', 'Vert', 'Vert', 'GCR', 'GCR', 'Vert',
       'GCR', 'GCR', 'Vert', 'GCR', 'GCR', 'GCR', 'Vert', 'Vert', 'Vert',
       'GCR', 'Vert', 'Vert', 'GCR', 'Vert', 'Vert', 'GCR', 'GCR'],
      dtype='<U5')

In [509]:
y_pred

array(['Vert', 'Vert', 'Vert', 'Vert', 'Vert', 'Vert', 'GCR', 'GCR',
       'Vert', 'Vert', 'Vert', 'Vert', 'GCR', 'GCR', 'Vert', 'Vert',
       'Vert', 'Vert', 'Vert', 'GCR', 'Vert', 'GCR', 'GCR', 'Vert',
       'Vert', 'Vert', 'Vert', 'GCR', 'GCR', 'Vert', 'Vert', 'GCR', 'GCR',
       'Vert', 'Vert', 'Vert', 'Vert', 'Vert', 'Vert', 'GCR', 'Vert',
       'Vert', 'Vert', 'GCR', 'Vert', 'GCR', 'Vert', 'Vert', 'Vert',
       'Vert', 'GCR', 'GCR', 'GCR', 'GCR', 'Vert', 'Vert', 'Vert', 'Vert',
       'Vert', 'GCR'], dtype='<U5')

In [510]:
from sklearn.metrics import classification_report
print((classification_report(y_val, y_pred, target_names=my_tags)))

              precision    recall  f1-score   support

         GCR       0.47      0.33      0.39        27
        Vert       0.56      0.70      0.62        33

    accuracy                           0.53        60
   macro avg       0.52      0.52      0.51        60
weighted avg       0.52      0.53      0.52        60



##### Logistice regression

In [511]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_particle_features, y_train)
y_pred = logreg.predict(X_val_particle_features)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.6333333333333333
              precision    recall  f1-score   support

         GCR       0.57      0.74      0.65        27
        Vert       0.72      0.55      0.62        33

    accuracy                           0.63        60
   macro avg       0.65      0.64      0.63        60
weighted avg       0.65      0.63      0.63        60



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


##### Weighted logistice regression

In [512]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=1, C=1e5, class_weight='balanced')
logreg = logreg.fit(X_train_particle_features, y_train)
y_pred = logreg.predict(X_val_particle_features)
print('accuracy %s' % accuracy_score(y_pred, y_val))
print(classification_report(y_val, y_pred, target_names=my_tags))

accuracy 0.65
              precision    recall  f1-score   support

         GCR       0.59      0.74      0.66        27
        Vert       0.73      0.58      0.64        33

    accuracy                           0.65        60
   macro avg       0.66      0.66      0.65        60
weighted avg       0.67      0.65      0.65        60



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
