# **Library**

In [None]:
#for emoji manipulations with our article texts 
!pip install emoji 
#for contractions substitutions in our article texts 
!pip install contractions
!pip install transformers
!pip install unidecode

In [29]:
#general purpose packages
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

#data processing
import re, string
from re import sub
import emoji
import nltk
import contractions
from unidecode import unidecode


from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split


#Naive Bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

#transformers
from transformers import BertTokenizerFast
from transformers import TFBertModel
from transformers import RobertaTokenizerFast
from transformers import TFRobertaModel

#keras
import tensorflow as tf
from tensorflow import keras
import torch


#metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

#set seed for reproducibility
seed=42

#set style for plots
sns.set_style("whitegrid")
sns.despine()
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlepad=10)

<Figure size 432x288 with 0 Axes>

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')
path = "/content/gdrive/My Drive/Colab Notebooks/Thesis/train.csv"
df = pd.read_csv(path)

In [31]:
def conf_matrix(y, y_pred, title):
    fig, ax =plt.subplots(figsize=(5,5))
    labels=['Negative', 'Neutral', 'Positive']
    ax=sns.heatmap(confusion_matrix(y, y_pred), annot=True, cmap="Blues", fmt='g', cbar=False, annot_kws={"size":25})
    plt.title(title, fontsize=20)
    ax.xaxis.set_ticklabels(labels, fontsize=17) 
    ax.yaxis.set_ticklabels(labels, fontsize=17)
    ax.set_ylabel('Test', fontsize=20)
    ax.set_xlabel('Predicted', fontsize=20)
    plt.show()

In [32]:
def normalisation (text):

    ''' Pre process and convert texts to a list of words method '''


    #Clean the text
    replace_content = ''
    text = emoji.demojize(text)# transform from emoji to a string
    for k, v in contractions.contractions_dict.items():
      text = text.replace(k,v)#working word contractions
    text = re.sub(r'(.)\1+', r'\1\1', text)#removing letter repetition
    text = re.sub(r'[\?\.\!]+(?=[\?\.\!])',replace_content, text)#removing punctuation repetition
    text = re.sub('#+', ' ', text)# remove hashtag
    text = sub(r"[^A-Za-z0-9^,!?.\/'+]", " ", text)
    text = sub(r"\+", " plus ", text)
    text = sub(r",", " ", text)
    text = sub(r"\.", " ", text)
    text = sub(r"!", " ! ", text)
    text = sub(r"\?", " ? ", text)
    text = sub(r"'", " ", text)
    text = sub(r"_", " ", text)
    text = sub(r"<<", " ", text)
    text = sub(r">>", " ", text)
    text = sub(r"-", " ", text)
    text = sub(r"--", " ", text)
    text = sub(r":", " : ", text)
    text = sub(r"\s{2,}", " ", text)

    # text = text.split()

    return text


In [33]:
#Converting sting value of the output to numeric
df = df[df.sentiment != 'neutral']
sentiment  = {'positive': 1,'negative':-1} 
df.sentiment = [sentiment[item] for item in df.sentiment] 
#removing the unimportant for our analysis column "ID" 
df.pop('id')
df.text = df['text'].apply(unidecode)
df.text = df["text"].str.lower()

In [34]:
df.text = df.text.apply(lambda x: normalisation(x))

In [36]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [37]:
token_lens = []

for txt in df['text'].values:
    tokens = tokenizer.encode(txt, max_length=512, truncation=True)
    token_lens.append(len(tokens))
    
max_len=np.max(token_lens)

In [39]:
df['token_lens'] = token_lens

In [40]:
df = df.sort_values(by='token_lens', ascending=False)
df.head(20)

Unnamed: 0,text,sentiment,token_lens
8261,v almaty u otdeleniia banka proizoshlo napaden...,-1,512
3778,zhitelei shymkenta chut bylo ne ostavili bez t...,-1,512
4025,fitch ratings moskva/london 20 dekabria 2016 g...,-1,512
4026,v kazakhstane na 33 protsenta uvelichilis dokh...,1,512
4028,v strukture basketboldnogo kluba astana otkryv...,1,512
4030,almaty 25 ianvaria 2017 g stolichnyi filial fo...,1,512
4031,astana 19 oktiabria kazakhstan today v kazakh...,1,512
6871,grigorii garanin 17 ianvaria 2017 13 20 regnum...,-1,512
4034,astana 29 09 2016 8 39 00 nachalo 28 02 2017 ...,1,512
6870,lichnyi trener il i il ina toishan bektthemiro...,-1,512


In [41]:
df = df.sample(frac=1).reset_index(drop=True)

In [42]:
df['sentiment'].value_counts()

 1    2795
-1    1434
Name: sentiment, dtype: int64

In [43]:
ros = RandomOverSampler()
train_x, train_y = ros.fit_resample(np.array(df['text']).reshape(-1, 1), np.array(df['sentiment']).reshape(-1, 1));
train_os = pd.DataFrame(list(zip([x[0] for x in train_x], train_y)), columns = ['text', 'sentiment']);

In [44]:
train_os['sentiment'].value_counts()

 1    2795
-1    2795
Name: sentiment, dtype: int64

In [46]:
X = train_os['text'].values
y = train_os['sentiment'].values


In [47]:
train, test = train_test_split(train_os , test_size=0.2,  stratify=y, random_state=seed)

X_test , y_test = test['text'].values, test['sentiment']

X, y = train['text'].values, train['sentiment']

X_train, X_valid, y_train, y_valid  = train_test_split(X, y, test_size=0.2,  stratify=y, random_state=seed)

In [48]:
y

4939   -1
2702   -1
1991    1
2729   -1
220    -1
       ..
3994    1
1001    1
168     1
4538   -1
2590    1
Name: sentiment, Length: 4472, dtype: int64

In [49]:
y_train_le = y_train.copy()
y_valid_le = y_valid.copy()
y_test_le = y_test.copy()

In [50]:
ohe = preprocessing.OneHotEncoder()
y_train = ohe.fit_transform(np.array(y_train).reshape(-1, 1)).toarray()
y_valid = ohe.fit_transform(np.array(y_valid).reshape(-1, 1)).toarray()
y_test = ohe.fit_transform(np.array(y_test).reshape(-1, 1)).toarray()

In [51]:
print(f"TRAINING DATA: {X_train.shape[0]}\nVALIDATION DATA: {X_valid.shape[0]}\nTESTING DATA: {X_test.shape[0]}" )

TRAINING DATA: 3577
VALIDATION DATA: 895
TESTING DATA: 1118


In [52]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla V100-SXM2-16GB


# **Naive Bayes Classifier**

In [53]:
clf = CountVectorizer()
X_train_cv =  clf.fit_transform(X_train)
X_test_cv = clf.transform(X_test)

In [54]:
tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_cv)
X_train_tf = tf_transformer.transform(X_train_cv)
X_test_tf = tf_transformer.transform(X_test_cv)

In [55]:
nb_clf = MultinomialNB()

In [56]:
nb_clf.fit(X_train_tf, y_train_le)

MultinomialNB()

In [57]:
nb_pred = nb_clf.predict(X_test_tf)

In [58]:
print('\tClassification Report for Naive Bayes:\n\n',classification_report(y_test_le,nb_pred, target_names=['Negative', 'Positive']))

	Classification Report for Naive Bayes:

               precision    recall  f1-score   support

    Negative       0.89      0.91      0.90       559
    Positive       0.90      0.89      0.90       559

    accuracy                           0.90      1118
   macro avg       0.90      0.90      0.90      1118
weighted avg       0.90      0.90      0.90      1118

