In [None]:
import re

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# our library
from data_loader import MyDataLoader

# Pre-processing Functions

In [None]:
def transform_remove_URL(text):
    """Delete http(s):// links and bare ``www.`` links from ``text``.

    Each match runs up to the next whitespace character and is replaced
    by the empty string, so surrounding spaces are left as they were.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)


def transform_remove_html(text):
    """Strip HTML tags and character entities from ``text``.

    Removes ``<...>`` tags (non-greedy, within a single line) and entities of
    the form ``&name;``, ``&#123;`` and ``&#x1f;``. Matches are deleted, not
    decoded.

    Matching is case-insensitive: entity names and hex digits are commonly
    written in upper case (``&#x1F600;``, ``&AMP;``), and this transform runs
    before lowercasing, so a lowercase-only pattern would leave them in the
    text.
    """
    html = re.compile(
        r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});',
        re.IGNORECASE,
    )
    return html.sub('', text)


def transform_remove_usernames(text):
    """Delete ``@mentions`` and any ``scheme://...`` link from ``text``.

    A mention is ``@`` followed by one or more ASCII letters, digits or
    underscores. Despite the function name, the second alternative also
    removes any ``<word>://<non-space>`` run (e.g. ``ftp://host/file``).
    """
    return re.sub(r'([@][A-Za-z0-9_]+)|(\w+:\/\/\S+)', '', text)


def transform_remove_hashtags(text):
    """Replace every ``#hashtag`` (``#`` plus word characters) with one space."""
    hashtag = re.compile(r'#\w+')
    return hashtag.sub(' ', text)


def transform_remove_digits(text):
    """Replace every run of digits in ``text`` with a single space."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub(' ', text)


def transform_remove_emoji(text):
    """Replace each run of emoji-like characters in ``text`` with one space.

    The character class below is the union of the listed ranges. Several of
    them overlap or are fully contained in others; they are kept as written.

    NOTE(review): the range U+24C2-U+1F251 covers most of the Basic
    Multilingual Plane above U+24C2 (CJK, Hangul, etc.), and U+10000-U+10FFFF
    covers every supplementary-plane code point, so far more than emoji is
    removed. Confirm this is intended before relying on it for non-Latin text.
    """
    emoji_class = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        "\U00002500-\U00002BEF"  # box drawing .. misc symbols and arrows
        "\U00002702-\U000027B0"  # dingbats
        "\U00002702-\U000027B0"  # same range listed a second time
        "\U000024C2-\U0001F251"  # very wide range, see NOTE(review)
        "\U0001f926-\U0001f937"
        "\U00010000-\U0010ffff"  # all supplementary planes
        "\u2640-\u2642"
        "\u2600-\u2B55"
        "\u200d"  # zero-width joiner
        "\u23cf"
        "\u23e9"
        "\u231a"
        "\ufe0f"  # variation selector-16
        "\u3030"
    )
    return re.sub("[" + emoji_class + "]+", r' ', text, flags=re.UNICODE)


def transform_lowercase(text):
    """Return ``text`` lower-cased with Python's locale-independent ``str.lower``.

    Note that U+0130 (capital I with dot above) lower-cases to ``i`` followed
    by the combining dot U+0307.
    NOTE(review): ASCII ``I`` becomes ``i``, not the Turkish dotless ``ı`` --
    confirm this is acceptable for the corpus.
    """
    lowered = text.lower()
    return lowered


def transform_fix_i(text):
    """Collapse ``i`` + combining dot above (U+0307) into a plain ``i``.

    That two-code-point sequence is what ``str.lower`` produces for U+0130
    (capital I with dot above). The combining mark is written as an explicit
    escape here because it is invisible in most editors.
    """
    return text.replace('i\u0307', 'i')


def transform_fix_whitespace(text):
    """Collapse every whitespace run to one space and trim both ends."""
    words = text.split()
    return ' '.join(words)


# Load the datasets

In [None]:
# Input files, resolved relative to the notebook's working directory.
# NOTE(review): presumably (train, test) order, going by the file names --
# confirm against MyDataLoader's signature.
TRAIN_CSV = 'train.csv'
TEST_CSV = 'test.csv'

data_loader = MyDataLoader(TRAIN_CSV, TEST_CSV)

In [None]:
# Text-cleaning steps handed to the loader as a single list.
# Ordering matters in at least two places: lowercasing has to precede
# transform_fix_i (which repairs the "i" + U+0307 sequence that lowercasing
# produces), and whitespace is normalised last because earlier steps insert
# spaces or leave gaps.
# NOTE(review): assumes apply_functions runs the callables in list order --
# confirm in data_loader.
transforms = [
    transform_remove_URL,
    transform_remove_html,
    transform_remove_usernames,
    # transform_remove_hashtags,  (disabled)
    transform_remove_emoji,
    # transform_remove_digits,  (disabled)
    transform_lowercase,
    transform_fix_i,
    transform_fix_whitespace,
]

data_loader.apply_functions(transforms)

# Normalization

This step is optional. To apply normalization, start a [Zemberek gRPC server](https://github.com/ahmetaa/zemberek-nlp/tree/master/grpc) first, then uncomment and run the cell below.

In [None]:
# Optional normalization step, disabled by default. Uncomment the lines
# below to run it; it expects a Zemberek gRPC server to be running already.
# from norm import Normalizer

# normalizer = Normalizer()
# transforms = [normalizer.normalize]

# data_loader.apply_functions(transforms)

# Encode Labels

In [None]:
# Pull the train and test splits out of the loader. Each getter is unpacked
# into two names, so it must return exactly two items (inputs, labels).
# NOTE(review): "original" presumably means the un-augmented training split --
# confirm in data_loader.
tweets_train, y_train = data_loader.get_original_train()
tweets_test, y_test = data_loader.get_test()

In [None]:
# Encode the labels as integer class ids. The encoder is fitted on the
# training labels only, so a test label that never occurs in training makes
# `transform` raise instead of being silently mapped.
# `fit_transform` replaces the separate fit + transform over the same list,
# and the former `reshape(-1, 1).ravel()` round trip is dropped: the encoder
# already returns 1-D arrays, so it changed nothing.
# NOTE: this cell rebinds y_train / y_test. Re-run the cell that loads them
# from data_loader before running this one a second time.
le = LabelEncoder()

y_train = le.fit_transform(y_train.tolist())
y_test = le.transform(y_test.tolist())

# Model Training

## Embeddings

In [None]:
# Representation used to embed the tweets: 'bert' or 'w2v'.
# Swap which of the two lines is commented out to switch.
model_choice = 'bert'
# model_choice = 'w2v'

In [None]:
import representation_model as repm

In [None]:
# Embed both splits with the representation selected by `model_choice`.
# Each recognised value gets its own branch and anything else is rejected;
# with a bare `else`, a typo such as 'BERT' would silently run the word2vec
# path instead.
# (The spelling `calc_bert_represenetations` is the name exposed by
# representation_model, as called here -- do not "fix" it in this cell alone.)
if model_choice == 'bert':
    X_train = repm.calc_bert_represenetations(tweets_train)
    X_test = repm.calc_bert_represenetations(tweets_test)
elif model_choice == 'w2v':
    X_train = repm.calc_w2v_representations(tweets_train)
    X_test = repm.calc_w2v_representations(tweets_test)
else:
    raise ValueError(
        "unknown model_choice {!r}; expected 'bert' or 'w2v'".format(model_choice)
    )

## Classification Model

In [None]:
# Train a support-vector classifier, constructed with no arguments, on the
# training embeddings. `fit` returns the estimator itself, so the chained
# call binds the fitted model; the bare name on the last line keeps the
# cell's displayed output.
model = SVC().fit(X_train, y_train)
model

In [None]:
# Predict class ids for the test embeddings. The model was fitted on the
# encoded labels, so the predictions are in the same encoded form.
y_pred = model.predict(X_test)

In [None]:
# Map the encoded class ids (predicted and true) back to the original label
# values with the same encoder, so the report below shows label names.
y_pred_text = le.inverse_transform(y_pred)
y_test_text = le.inverse_transform(y_test)

In [None]:
# Per-class precision / recall / F1 on the test split, printed to 4 decimals.
# Argument order is (true labels, predicted labels).
print(
    classification_report(y_test_text, y_pred_text, digits=4)
)