In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset 
from sklearn.metrics import confusion_matrix, accuracy_score

import csv
import re
import string
from collections import defaultdict


## 0. Load and inspect the data

In [None]:
url_train = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTOZ2rC82rhNsJduoyKYTsVeH6ukd7Bpxvxn_afOibn3R-eadZGXu82eCU9IRpl4CK_gefEGsYrA_oM/pub?gid=1863430984&single=true&output=tsv'
url_test = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vT-KNR9nuYatLkSbzSRgpz6Ku1n4TN4w6kKmFLkA6QJHTfQzmX0puBsLF7PAAQJQAxUpgruDd_RRgK7/pub?gid=417546901&single=true&output=tsv'

In [None]:
from io import StringIO
import requests

def load_dataset(url):
    r = requests.get(url)
    data = r.content.decode('utf8')
    df = pd.read_csv(StringIO(data), sep='\t')
    df.columns = ['tweet', 'label']
    return df

In [None]:
df_train = load_dataset(url_train)
df_test = load_dataset(url_test)

In [None]:
print(df_train.shape)
print(df_test.shape)
print("num of language in train set: ", len(np.unique(df_train['label'])))

(52675, 2)
(13279, 2)
num of language in train set:  69


In [None]:
df_train.head

<bound method NDFrame.head of                                                    tweet label
0      يا من أناديها ويخنقني البكاء  ويكاد صمت الدمع ...    ar
1      فيه فرق بين اهل غزة اللى مطحونين من ناحيتين وب...    ar
2      ﻋﻦ ﺍﻟﻠﺤﻈﺔ اﻟﺤﻠﻮﺓﺓ ﺍﻟﻠﻲ ﺑﺘﻐﻤﺾ ﻓﻴﻬﺎ ﻋﻴﻨﻴﻚ ﺑﺘﻔﻜﺮ ...    ar
3                                      يا ابو سلو عرفتني    ar
4      ب50 ريال أكفل معتمر في رمضان ، ولك بإذن الله م...    ar
...                                                  ...   ...
42093                        그 내가 드래곤 라이딩에 대한 소설 읽어본적 있음    ko
42094              우주를 멸망시킬뻔한사람과 이기도록 백업을받는사람을 상대로 어떻게이겨    ko
42095                        다메다욬ㅋㅋㅋㅋㅋ그랫다간 배고파스주글거같은(파스스    ko
42096                           in_saeng_owo 아뇨 제가 경험한일임    ko
42097                                          osy9611 넵    ko

[42041 rows x 2 columns]>

## 1. Data preprocessing 
This includes balancing the distribution of dataset, matching labels in both training and test dataset, normaliaing and splitting the data into training, validation, test dataset 

In [None]:
from sklearn.model_selection import train_test_split

class Preprocessor:
  def __init__(self, df_train, df_test):
    self.df_train = df_train
    self.df_test = df_test
    self.df_train_bal = None
    self.df_test_matched = None
  
  def balancing(self):
    n_labels = self.df_train['label'].value_counts()
    rare_labels = [l for l in n_labels[n_labels <= 10].index]
    self.df_train_bal = self.df_train[df_train.label.isin(rare_labels) == False]

  def match_labels(self):
    labels_train = self.df_train.label.unique()
    self.df_test_matched = df_test[self.df_test.label.isin(labels_train) == True]

  def normalize(self, df_data):
    texts = list(df_data['tweet'])
    norm_texts = []
    for text in texts:
      text = text.lower()
      text = re.sub(r'\n', '', text)
      text = re.sub(r'(\s\d+)','',text) 
      text = re.sub(r'([.,!?@#])', '', text)
      norm_texts.append(text)  
    df_data.loc[:,'tweet'] = norm_texts
    return df_data

  def split_train(self, ratio):
    df_train, df_val = train_test_split(self.df_train, test_size=ratio)
    return df_train, df_val

  def preprocess(self):
    self.balancing()
    self.match_labels()
    self.df_train = self.normalize(self.df_train_bal)
    self.df_test = self.normalize(self.df_test_matched)
    df_train, df_val = self.split_train(0.8)
    return df_train, df_val, self.df_test


In [None]:
preprocessor = Preprocessor(df_train, df_test)
df_train, df_val, df_test = preprocessor.preprocess()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


## 2. Vectorize the tweet text & Label encoding

We built a feature vector by counting the 100 most frequently appearing bigram in given text

In [None]:
from nltk import ngrams
from sklearn.preprocessing import LabelEncoder

class Vectorize_and_Encoding:
  def __init__(self, df_train, df_val, df_test):
    self.df_total = None
    self.df_train = df_train
    self.df_val = df_val
    self.df_test = df_test
    self.bigram_by_lan = {}
    self.sorted_unique_bigrams = []
    self.num_features = None
    self.bigram_to_idx = None

  def create_bigrams(self, word):
    bigrams = ngrams(word, 2)
    for bigram in bigrams:
      res.append(bigram)
    return res

  def get_bigrams(self, tweet):
    return [i+j for i,j in zip(tweet, tweet[1:])]
    
  def count_frequent_bigram(self, X_train, y_train):
    tweet_by_lan = {}
    labels = np.unique(y_train)
    for label in labels:
      tweet_by_lan[label] = [] 
        
    for tweet, label in zip (list(X_train),list(y_train)):
      tweet_by_lan[label].append(tweet)

    for label in labels:
      tweets = tweet_by_lan[label]
      bigram_freqs = {}
      for tweet in tweets:
        for bigram in self.get_bigrams(tweet):
          if bigram in bigram_freqs:
            bigram_freqs[bigram] += 1
          else:
            bigram_freqs[bigram] = 1
      sorted_bigrams = sorted([(bigram, freq) for bigram, freq in bigram_freqs.items()], key=lambda x: x[1], reverse=True)
      self.bigram_by_lan[label] = sorted_bigrams[:100]

    for label, bigrams in self.bigram_by_lan.items():
      for bigram, freq in bigrams:
        self.sorted_unique_bigrams.append(bigram)

    # create vocaburay    
    self.sorted_unique_bigrams = set(self.sorted_unique_bigrams)
    self.bigram_to_idx = {bigram: i for i, bigram in enumerate(self.sorted_unique_bigrams)}
    self.num_features = len(self.sorted_unique_bigrams)

  def build_feature_vector(self, X_train):
    X_vec = []
    for tweet in X_train: 
      feature_vector = np.zeros(self.num_features)
      for bigram in self.get_bigrams(tweet):
        if bigram in self.bigram_to_idx:
          feature_vector[self.bigram_to_idx[bigram]] += 1
      X_vec.append(feature_vector)
    X_vec = pd.DataFrame(X_vec)
    return X_vec

  def vectorize(self):
    self.count_frequent_bigram(self.df_train['tweet'], self.df_train['label'])
    X_train = self.build_feature_vector(self.df_train['tweet'])
    X_val = self.build_feature_vector(self.df_val['tweet'])
    X_test = self.build_feature_vector(self.df_test['tweet'])

    return X_train, X_val, X_test

  def encoding(self):
    frame = [self.df_train, self.df_val, self.df_test]
    self.df_total = pd.concat(frame, ignore_index = True)
    le = LabelEncoder()
    le.fit(self.df_total['label'])
    y_train = le.transform(self.df_train['label'])
    y_val = le.transform(self.df_val['label'])
    y_test = le.transform(self.df_test['label'])
    num_classes = len(np.unique(self.df_total['label']))
    return y_train, y_val, y_test, num_classes 



In [None]:
# prepare dataset 

ve = Vectorize_and_Encoding(df_train,df_val,df_test)
X_train, X_val, X_test = ve.vectorize()
y_train, y_val, y_test, num_classes = ve.encoding()

num_features = len(X_train.iloc[0])


### 3. Training Model

find the optimal model architecture and training regime for your CNN classifier

In [None]:
class ConvClassifier(nn.Module):
    def __init__(self, kernel, stride, pooling, padding, lin_in, num_classes):
        super(ConvClassifier, self).__init__()
        self.convl1 = nn.Sequential(
            nn.Conv1d(1, 100, kernel_size=kernel, stride=stride, padding=padding),
            nn.ELU(),
            nn.MaxPool1d(kernel_size=pooling, stride=pooling))
        self.convl2 = nn.Sequential(
            nn.Conv1d(100, 1, kernel_size=kernel, stride=stride, padding=padding),
            nn.ELU(),
            nn.MaxPool1d(kernel_size=pooling, stride=pooling))
        self.lin = nn.Linear(lin_in, num_classes)
        
    def forward(self, x):
        out1 = self.convl1(x)
        out2 = self.convl2(out1)
        out2 = out2.reshape(out2.size(0), -1)
        out = self.lin(out2)
        return out

In [None]:
def batch_generator(X_train, y_train, batch_size):
    # initialize the index
    start= 0
    end= batch_size
    # generate batch based on batch size
    while end <= len(X_train):
        yield (X_train.iloc[start:end].values, y_train[start:end])
        start, end = start+batch_size, end+batch_size

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
# Loss and optimizer
device = 'cpu'
num_epochs = 3
num_features = len(X_train.iloc[0])
loss_fun = nn.CrossEntropyLoss()

hypers = [
          {'kernel': 3, 'stride': 1, 'pooling':2, 'padding':0, 'batch_size': 64},
          {'kernel': 3, 'stride': 2, 'pooling':2, 'padding':2, 'batch_size': 128},
          {'kernel': 5, 'stride': 1, 'pooling':2, 'padding':0, 'batch_size': 64},
          {'kernel': 5, 'stride': 1, 'pooling':2, 'padding':0, 'batch_size': 128},
          {'kernel': 5, 'stride': 2, 'pooling':2, 'padding':2, 'batch_size': 256}
] 

res = []
models = []

for hyp in hypers:
    kernel = hyp['kernel']
    stride = hyp['stride']
    pooling = hyp['pooling']
    padding = hyp['padding']
    batch_size = hyp['batch_size']

    # calculate chnnel dimension based on hyperparameters 
    out1_channel = ((((num_features+2*padding)-kernel) // stride)+1) // pooling
    out2_channel = ((((out1_channel+2*padding)-kernel) // stride)+1) // pooling

    #Define model with new hyperparameter and optimizer      
    model = ConvClassifier(kernel=kernel, stride=stride, pooling=pooling, padding=padding, lin_in = out2_channel, num_classes=num_classes)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    #Train the model
    for epoch in range(num_epochs):
      print('--- !! Start training cnn model with hyperprameters condition - {}'.format(hyp))
      for i, (batch_x, batch_y) in enumerate(batch_generator(X_train, y_train, batch_size)):
        #reset the gradients
        optimizer.zero_grad()
          
        # fit the model
        outputs = []
        for tweet in batch_x:                 
          output = model(torch.FloatTensor([[tweet]]).to(device))
          outputs.append(output)
        outputs = torch.stack(outputs)

        #compute the loss
        loss = loss_fun(torch.squeeze(outputs), torch.squeeze(torch.LongTensor(batch_y)).to(device))
            
        #take gradient
        loss.backward()
        optimizer.step()
                
        print ('Epoch {}/{} || batch {}/{} ...... Loss : {:.4f}'.format(epoch+1, num_epochs, i+1, int((len(X_train)/batch_size)+2), loss.item()))
    models.append(model)

    # evaluate model     
    print('---- ## Model validation ')
    y_preds = []
    for x in X_train.values:
        pred = model(torch.FloatTensor([[x]]).to(device))
        label_pred = list(pred[0]).index(max(list(pred[0])))
        y_preds.append(label_pred)
    res.append((accuracy_score(y_train, y_preds), f1_score(y_train, y_preds, average='macro')))

In [None]:
# report the evaluation results per parameters
for i in range(len(hypers)):
  print(100*'=')
  print('hyperparameter condition: {}'.format(hypers[i]))
  print('Accuracy is {:.4f}, f1 max is {:.4f}'.format(res[i][0], res[i][1]))

hyperparameter condition: {'kernel': 3, 'stride': 1, 'pooling': 2, 'padding': 0, 'batch_size': 64}
Accuracy is 0.8893, f1 max is 0.3432
hyperparameter condition: {'kernel': 3, 'stride': 2, 'pooling': 2, 'padding': 2, 'batch_size': 128}
Accuracy is 0.4481, f1 max is 0.0344
hyperparameter condition: {'kernel': 5, 'stride': 1, 'pooling': 2, 'padding': 0, 'batch_size': 64}
Accuracy is 0.8843, f1 max is 0.3273
hyperparameter condition: {'kernel': 5, 'stride': 1, 'pooling': 2, 'padding': 0, 'batch_size': 128}
Accuracy is 0.8548, f1 max is 0.3215
hyperparameter condition: {'kernel': 5, 'stride': 2, 'pooling': 2, 'padding': 2, 'batch_size': 256}
Accuracy is 0.5635, f1 max is 0.1051


# 4. Test
Take the best performing model and evaluate it on the test set. Report the result with confusion matrix

In [None]:
opt_hyp = {'kernel': 3, 'stride': 1, 'pooling': 2, 'padding': 0, 'batch_size': 64}
opt_model = models[0]

y_preds = []

for tweet in X_test.values:
  y_pred = opt_model(torch.FloatTensor([[tweet]]).to(device))
  y_pred = list(y_pred[0]).index(max(list(y_pred[0])))
  y_preds.append(label_pred)

accuracy = accuracy_score(y_test, y_preds)
f1_macro = f1_score(y_test, y_preds, average='macro')

print("Our classifier with optimal hyperparameter")
print("Accuracy is {:.4f}, fi_macro score is {:.4f}".format(accuracy, f1_macro))


Our classifier with optimal hyperparameter
Accuracy is 0.4501, fi_macro score is 0.0365
