In [1]:
from sklearn.metrics import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
import random

## Loading Data

In [4]:
import pandas as pd

In [5]:
DATA_FOLDER = "Data_Processed/Shared_Task_eng/"

In [6]:
df_train = pd.read_csv(DATA_FOLDER+"train_1.csv")
df_val = pd.read_csv(DATA_FOLDER+"val_1.csv")
df_test = pd.read_csv(DATA_FOLDER+"test_1.csv")

In [7]:
df_train.dropna(inplace=True)
df_val.dropna(inplace=True)
df_test.dropna(inplace=True)

In [44]:
len(df_train)

9161

## Word Embeddings
- https://towardsdatascience.com/nlp-embedding-techniques-51b7e6ec9f92

In [45]:
X_train = df_train['Text'].values
Y_train = df_train['Label'].values
X_val = df_val['Text'].values
Y_val = df_val['Label'].values
X_test = df_test['Text'].values
Y_test = df_test['Label'].values

In [116]:
X_train.shape,X_val.shape,X_test.shape

((9161,), (1308,), (2615,))

### Bag of Words

In [10]:
vectorizer = CountVectorizer()
vectorizer.fit(X_train)

CountVectorizer()

In [13]:
vector = vectorizer.transform(X_train)

In [14]:
vector.shape

(9161, 21489)

In [108]:
def get_bow_embedding(train,val,test):
    vectorizer = CountVectorizer()
    vectorizer.fit(train)
    
    x_train = vectorizer.transform(train)
    x_val = vectorizer.transform(val)
    x_test = vectorizer.transform(test)
    
    return x_train,x_val,x_test

In [109]:
x_train,x_val,x_test = get_bow_embedding(X_train,X_val,X_test)

In [111]:
x_train.shape,x_val.shape,x_test.shape

((9161, 21489), (1308, 21489), (2615, 21489))

### Tf-IDF

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
vectorizer_tfidf = TfidfVectorizer()

In [19]:
vectorizer_tfidf.fit(X_train)

TfidfVectorizer()

In [20]:
vector_tfidf = vectorizer_tfidf.transform(X_train)

In [21]:
vector_tfidf.shape

(9161, 21489)

In [112]:
def get_tfidf_embedding(train,val,test):
    vectorizer = TfidfVectorizer()
    vectorizer.fit(train)
    
    x_train = vectorizer.transform(train)
    x_val = vectorizer.transform(val)
    x_test = vectorizer.transform(test)
    
    return x_train,x_val,x_test

In [113]:
x_train,x_val,x_test = get_tfidf_embedding(X_train,X_val,X_test)

In [114]:
x_train.shape,x_val.shape,x_test.shape

((9161, 21489), (1308, 21489), (2615, 21489))

### Google's Idea
- Ref: https://developers.google.com/machine-learning/guides/text-classification/step-3
- Convert Words into uni-grams and bi-grams and calculate all features using tf-idf and then select top 20K features using f_classif or chi_2
- This article says that normalisation does not help text datasets much so it is not used here

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [28]:
# Vectorization parameters
# Range (inclusive) of n-gram sizes for tokenizing text.
NGRAM_RANGE = (1, 2)

# Limit on the number of features. We use the top 20K features.
TOP_K = 20000

# Whether text should be split into word or character n-grams.
# One of 'word', 'char'.
TOKEN_MODE = 'word'

# Minimum document/corpus frequency below which a token will be discarded.
MIN_DOCUMENT_FREQUENCY = 2

In [40]:
kwargs = {
            'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams.
            'dtype': 'float32',
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': TOKEN_MODE,  # Split text into word tokens.
            'min_df': MIN_DOCUMENT_FREQUENCY,
    }

In [41]:
def ngram_vectorise(kwargs,train_text,train_labels,val_text,test_text):
    vectorizer = TfidfVectorizer(**kwargs)
    
    x_train = vectorizer.fit_transform(train_text)
    
    x_val = vectorizer.transform(val_text)
    x_test = vectorizer.transform(test_text)
    
    print("Features Calculated: ",x_train.shape[1])
    
    selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
    selector.fit(x_train, train_labels)
    
    x_train = selector.transform(x_train).astype('float32')
    x_val = selector.transform(x_val).astype('float32')
    x_test = selector.transform(x_test).astype('float32')
    
    return x_train,x_val,x_test

In [50]:
x_train,x_val,x_test = ngram_vectorise(kwargs,X_train,Y_train,X_val,X_test)



Features Calculated:  52985


In [51]:
x_train.shape,x_val.shape,x_test.shape

((9161, 20000), (1308, 20000), (2615, 20000))

### Word2Vec
- Ref: https://radimrehurek.com/gensim/models/word2vec.html

In [72]:
from gensim.models import Word2Vec
import gensim.downloader as api

In [73]:
wv = api.load('word2vec-google-news-300')

In [76]:
wv['gold'].shape

(300,)

In [117]:
def encode_w2v(wv,data):
    new_data=[]
    for sentence in data:
        sample=[]
        for word in sentence:
            if word in wv:
                sample.append(wv[word])
            else:
                sample.append(np.zeros(shape=(300)))
        new_data.append(np.mean(np.array(sample),axis=0))
    return np.array(new_data)

In [101]:
x_train_word2vec = get_word2vec_embeddings(wv,X_train)

In [102]:
x_train_word2vec.shape

(9161, 300)

In [118]:
def get_w2v_embedding(wv,train,val,test):
    x_train = encode_w2v(wv,train)
    x_val = encode_w2v(wv,val)
    x_test = encode_w2v(wv,test)
    
    return x_test,x_val,x_test

In [120]:
x_train,x_val,x_test = get_w2v_embedding(wv,X_train,X_val,X_test)

In [121]:
x_train.shape,x_val.shape,x_test.shape

((2615, 300), (1308, 300), (2615, 300))

### Doc2Vec
- https://radimrehurek.com/gensim/models/doc2vec.html

In [61]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np

In [53]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(X_train)]

In [54]:
model = Doc2Vec(documents, vector_size=300, window=5, min_count=1, workers=4)

In [122]:
def encode_d2v(model,data):
    embed = [list(model.infer_vector(ele.split('.'))) for ele in data]
    return np.array(embed)

In [127]:
def get_d2v_embedding(model,train,val,test):
    x_train = encode_d2v(model,train)
    x_val = encode_d2v(model,val)
    x_test = encode_d2v(model,test)
    
    return x_train,x_val,x_test

In [128]:
x_train,x_val,x_test = get_d2v_embedding(model,X_train,X_val,X_test)

In [129]:
x_train.shape,x_val.shape,x_test.shape

((9161, 300), (1308, 300), (2615, 300))

## Models
- Logistic Regression
- SVM
- KNN
- Gaussian NB
- Decision Tree
- Random Forest
- XGBoost
- AdaBoost

In [136]:
!pip install xgboost



In [137]:
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier

In [138]:
def evalMetric(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    mf1Score = f1_score(y_true, y_pred, average='macro')
    f1Score  = f1_score(y_true, y_pred, labels = np.unique(y_pred))
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    area_under_c = auc(fpr, tpr)
    recallScore = recall_score(y_true, y_pred, labels = np.unique(y_pred))
    precisionScore = precision_score(y_true, y_pred, labels = np.unique(y_pred))
    return dict({"accuracy": accuracy, 'mF1Score': mf1Score, 
                    'f1Score': f1Score, 'precision': precisionScore, 
                    'recall': recallScore})

In [145]:
weights = {0:1,1:8}

### SVM

In [146]:
SVM = SVC(kernel="rbf",class_weight=weights,probability=True)
SVM.fit(x_train,Y_train)
y_pred = SVM.predict(x_val)

In [147]:
evalMetric(Y_val,y_pred)

{'accuracy': 0.7224770642201835,
 'mF1Score': 0.5171100070683597,
 'f1Score': 0.20219780219780223,
 'precision': 0.1393939393939394,
 'recall': 0.368}

In [176]:
DT = DecisionTreeClassifier(class_weight=weights, 
            criterion='gini', max_depth=100, max_features=1.0, 
            max_leaf_nodes=10, min_impurity_split=1e-07, 
            min_samples_leaf=1, min_samples_split=2, 
            min_weight_fraction_leaf=0.10, presort=False, 
            random_state=42, splitter='best')

In [177]:
SVM = SVC(kernel="rbf",class_weight=weights,probability=True)

In [178]:
RF = RandomForestClassifier(n_estimators=1000,random_state=0,
                    n_jobs=1000,max_depth=100,bootstrap=True,
                      class_weight=weights)

In [179]:
AB =AdaBoostClassifier()

In [180]:
KNN = KNeighborsClassifier(leaf_size=1,p=2,n_neighbors=20)

In [181]:
GNB = GaussianNB()

In [182]:
LR = LogisticRegression(C=1.0, class_weight=weights, dual=False, 
        fit_intercept=True, intercept_scaling=1, max_iter=100, 
        multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, 
        solver='liblinear', tol=0.0001,verbose=0, warm_start=False)

In [183]:
XGBC = XGBClassifier(learning_rate =0.1, n_estimators=100000, 
            max_depth=6, min_child_weight=6, gamma=0, subsample=0.6, 
            colsample_bytree=0.8, reg_alpha=0.005, 
            objective= 'binary:logistic', nthread=2, 
            scale_pos_weight=1, seed=42, class_weight=weights)

In [184]:
classifiers=[SVM,XGBC,RF,DT,AB,KNN,GNB,LR]

In [185]:
names = ['SVC','XGBoost','Random Forest','Decision Tree','AdaBoost',
        'KNN','Gausian NB','Logistic Regression']

In [186]:
res={}

In [174]:
for i,cf in enumerate(classifiers):
    cf.fit(x_train,Y_train)
    y_pred = cf.predict(x_val)
    metric = evalMetric(Y_val,y_pred)
    res[names[i]]=metric



Parameters: { "class_weight" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




AttributeError: 'tuple' object has no attribute 'fit'

In [175]:
res

{'SVC': {'accuracy': 0.7224770642201835,
  'mF1Score': 0.5171100070683597,
  'f1Score': 0.20219780219780223,
  'precision': 0.1393939393939394,
  'recall': 0.368},
 'XGBoost': {'accuracy': 0.9197247706422018,
  'mF1Score': 0.626370098726002,
  'f1Score': 0.29530201342281875,
  'precision': 0.9166666666666666,
  'recall': 0.176},
 'Random Forest': {'accuracy': 0.9197247706422018,
  'mF1Score': 0.626370098726002,
  'f1Score': 0.29530201342281875,
  'precision': 0.9166666666666666,
  'recall': 0.176}}

In [188]:
!nvidia-smi

Mon Aug  2 19:40:11 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.80.02    Driver Version: 450.80.02    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:61:00.0 Off |                    0 |
| N/A   61C    P0   153W / 250W |  11434MiB / 12198MiB |    100%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-PCIE...  Off  | 00000000:DB:00.0 Off |                    0 |
| N/A   67C    P0   197W / 250W |  15666MiB / 16280MiB |    100%      Defaul