In [None]:
%%capture
!pip install transformers==3.1.0

In [None]:
# Import the Transformers pipeline library
from transformers import pipeline

# Preprocessing and visualization libraries
import plotly.express as px
import plotly
import pandas as pd 
import numpy as np

# To unzip folder
import zipfile
from os import listdir
from os.path import join, splitext

import plotly.offline as pyo
import plotly.graph_objs as go
#plotly.offline.init_notebook_mode(connected=True)

# **Main Components** 

This section covers the main components of the project: data preprocessing, data exploration, and model evaluation.

## **Data Preprocessor**

In [None]:
def convert_tsv_to_pandas(file_name, path_raw):
  """Load one tab-separated file into a two-column DataFrame.

  The file at ``path_raw/file_name`` is read with pandas' default header
  handling (the first line is consumed as the header row); the columns are
  then renamed to ``label`` and ``text``.

  NOTE(review): if the raw files carry no header line, their first record is
  dropped here -- confirm against the data.

  Args:
    file_name: Name of the TSV file inside ``path_raw``.
    path_raw: Directory that contains the file.

  Returns:
    DataFrame with the columns ``label`` and ``text``.
  """
  tsv_frame = pd.read_csv(join(path_raw, file_name), sep='\t')
  tsv_frame.columns = ["label", "text"]
  return tsv_frame

def combine_all_files(path_raw, files_ext = '.tsv'):
  """Stack every ``files_ext`` file found in ``path_raw`` into one DataFrame.

  Each matching file is loaded with ``convert_tsv_to_pandas`` and tagged with
  a ``language`` column holding the file name without its extension.

  Args:
    path_raw: Directory to scan (sub-directories are not visited).
    files_ext: File-name suffix to keep. Defaults to ``'.tsv'``.

  Returns:
    A single DataFrame with the columns of the individual files plus
    ``language``, carrying a fresh 0..n-1 index.

  Raises:
    ValueError: If no entry of ``path_raw`` ends with ``files_ext``.
  """
  # listdir returns entries in arbitrary order; sorting keeps the row order of
  # the combined frame identical from one run to the next.
  matching_files = sorted(
      name for name in listdir(path_raw) if name.endswith(files_ext))

  if not matching_files:
    raise ValueError(
        "No '{}' files found in '{}'".format(files_ext, path_raw))

  list_all_dataframes = []
  for file_name in matching_files:
    current_df = convert_tsv_to_pandas(file_name, path_raw)

    # The file name without its extension is used as the language tag
    current_df['language'] = splitext(file_name)[0]
    list_all_dataframes.append(current_df)

  # ignore_index avoids repeating each file's own row numbers in the result
  return pd.concat(list_all_dataframes, ignore_index=True)


def unzip_to_destination(dest_unzip_folder, src_zip_file):
  """Extract every member of a zip archive into a folder.

  Args:
    dest_unzip_folder: Directory that receives the extracted content.
    src_zip_file: Path of the zip archive to read.
  """
  with zipfile.ZipFile(file=src_zip_file, mode='r') as archive:
    archive.extractall(path=dest_unzip_folder)


def data_preprocessor(src_zip_file, raw_path, preprocessed_path,
                      processed_file_name,
                      files_ext = '.tsv', final_ext = '.csv'):
  """Unzip the raw archive, merge its files and save one labelled CSV.

  Steps: extract ``src_zip_file`` into ``raw_path``, combine every
  ``files_ext`` file of the extracted folder with ``combine_all_files``,
  replace the 0/1 labels by 'negative'/'positive', and write the result to
  ``preprocessed_path/processed_file_name + final_ext``.

  The archive is expected to contain a top-level folder named like the zip
  file without its extension (e.g. ``data.zip`` -> ``data/``).

  Args:
    src_zip_file: Path of the zip archive holding the raw files.
    raw_path: Directory the archive is extracted into.
    preprocessed_path: Directory that receives the combined file; it is
      created when missing.
    processed_file_name: Output file name without extension.
    files_ext: Suffix of the raw files to combine. Defaults to ``'.tsv'``.
    final_ext: Extension of the output file. Defaults to ``'.csv'``.
  """
  # Local import: the notebook's import cell only brings in listdir/join/splitext
  import os

  # Unzip the main folder to the raw data folder
  unzip_to_destination(raw_path, src_zip_file)

  # Only the archive's base name identifies the extracted folder; using the
  # full path would break as soon as the zip lives outside the working directory.
  archive_stem = splitext(os.path.basename(src_zip_file))[0]
  new_raw_path = join(raw_path, archive_stem)

  # Create a single pandas dataframe for all the files
  combined_pandas = combine_all_files(new_raw_path, files_ext)

  # Convert 0 / 1 to negative / positive (any other value becomes NaN)
  labels = {0: 'negative', 1: 'positive'}
  combined_pandas['label'] = combined_pandas['label'].map(labels)

  # Make sure the destination folder exists before writing into it
  if preprocessed_path:
    os.makedirs(preprocessed_path, exist_ok=True)

  # Save the final file to destination
  complete_preprocessed_path = join(preprocessed_path, processed_file_name+final_ext)
  combined_pandas.to_csv(complete_preprocessed_path, encoding='utf-8', index=False)

  print("Preprocessing Complete!")

The following command creates the required folders in the working directory.

In [None]:
%%bash
mkdir -p 'data/raw' 'data/processed'

In [None]:
### Test of the unzip 
#dest_unzip_folder =  "data/raw"
#src_zip_file = "data.zip" 
#unzip_to_destination(dest_unzip_folder, src_zip_file)
print()

data/raw/data


In [None]:
## Test Combination of files
#combined_df = combine_all_files('data/raw/data')
#combined_df.shape

In [None]:
## Test Complete Preprocessing
data_preprocessor('data.zip', 'data/raw', 'data/processed', 'combined_files')

Preprocessing Complete!


In [None]:
# Load the combined CSV written by data_preprocessor and preview five random rows
combined_files = pd.read_csv("data/processed/combined_files.csv")
combined_files.sample(5)

Unnamed: 0,label,text,language
5160,positive,был в данном отеле на одну ночь . рядом метро ...,russian
6306,positive,проживала в отеле пару дней . отличный сервис ...,russian
3030,positive,wir waren mit einer gruppe von 50 personen dor...,german
3716,positive,"wer die promis münchen , die spieler des fcb b...",german
6083,negative,"удивляют положительные отзывы о данном "" отеле...",russian


## **Data Exploration**

In [None]:
def show_distribution(df, col_interest, title):
  """Display a bar chart of how often each value of a column occurs.

  Args:
    df: DataFrame that holds the column.
    col_interest: Name of the column whose values are counted.
    title: Title shown above the chart.
  """
  occurrences = df[col_interest].value_counts()
  counts_table = occurrences.rename_axis(col_interest).reset_index(name='counts')

  bar_chart = px.bar(counts_table, x=col_interest, y="counts",
                     color="counts", title=title)
  bar_chart.show()

def show_random_text(df, N=3):
  """Print the text and sentiment label of N randomly drawn rows.

  Args:
    df: DataFrame with a ``text`` and a ``label`` column.
    N: Number of rows drawn with ``DataFrame.sample``. Defaults to 3.
  """
  sampled = df.sample(N)

  # `position` is the 0-based rank inside the sample, not the DataFrame index
  for position, (_, row) in enumerate(sampled.iterrows()):
    print("Index: {}\n".format(position))
    print("Textual data: {}\n".format(row['text']))
    print("Underlying Sentiment: {}\n\n".format(row['label']))

#### **Language Distribution**

In [None]:
# Get the proportion of languages in the dataset
show_distribution(combined_files, 'language', 'Number of occurence per Language')

#### **Sentiment distribution Per Language**

##### **English**

In [None]:
en_df = combined_files[combined_files['language']=='english']

show_distribution(en_df, 'label', 'Distribution of Sentiment for English')

In [None]:
# Random Text
show_random_text(en_df)

Index: 0

Textual data: not kosher ! ! ! ignore the symbol ! ! ! they are not kosher and not certified by any company ! ! ! i contacted the company to confirm.

Underlying Sentiment: negative


Index: 1

Textual data: grandpa po &apos;s originals are crunchy nuggets made from organic popcorn and organic soybeans. the healthy snack offers the benefits of soy without genetically modified organisms , cholesterol , peanuts or preservatives. in addition , the snack is low in saturated fat and it tastes great.

Underlying Sentiment: positive


Index: 2

Textual data: i love this sauce ! it is sweet and molasses-y , with a tiny kick of heat. i use it to make hawaiian chicken or pork all the time. i &apos;ve never had authentic jamaican jerk , so i can &apos;t vouch for accuracy there , but my whole family loves this one : d

Underlying Sentiment: positive




##### **French** 

In [None]:
fr_df = combined_files[combined_files['language']=='french']
show_distribution(fr_df, 'label', 'Distribution of Sentiment for French')

In [None]:
# Random Text
show_random_text(fr_df)

Index: 0

Textual data: cette série est superbe , et c'est pas souvent qu'on arrive à ressentir des émotions pareilles ! l'histoire est super , les acteurs aussi et la façon dont ils font ressortir leurs personnages est magique , les scénaristes ont du se creuser pour écrire ça , mais le résultat est là : je suis scotchée devant ma télé .

Underlying Sentiment: positive


Index: 1

Textual data: quand le voisin fox essaie de copier sur son voisin abc , voilà le piètre résultat que ça donne ... quintuplés n'est qu'une copie des sauvages , et une copie bâclée en plus ...

Underlying Sentiment: negative


Index: 2

Textual data: série qui nous endors , j'ai déjà regardé des séries nian-nian à l'eau de rose et ça me fesait plutot marrer , ça passait le temps mais là c'est une horreur , il ne se passe rien , les personnages sont niais sans aucune personnalité ... enlevez vite cette immondisse de notre télévision ! ! !

Underlying Sentiment: negative




##### **Russian**

In [None]:
ru_df = combined_files[combined_files['language']=='russian']
show_distribution(ru_df, 'label', 'Distribution of Sentiment for Russian')

In [None]:
# Random Text
show_random_text(ru_df)

Index: 0

Textual data: спасибо большое отелю за хорошую организацию . в октябре арендовали зал для проведения тренинга для команды из 20 человек . все скромно , но достойно . менеджер андрей оперативно реагировал на все наши просьбы . например : произошла нестыковка кофе-брейка по времени , нужно было организовать перерыв раньше - через 5 мин все было накрыто . нужно было передвинуть столы - без проблем . кондиционер включить / выключить - сразу откликались . порадовало наличие бесплатной парковки . спасибо !

Underlying Sentiment: positive


Index: 1

Textual data: маленькие неудобные номера с плохой шумоизоляцией не соответствуют стоимости 5900 р . в сутки . белье и полотенца со следами продолжительного использования , наволочки дырявые . душ течет . завтрак - шведский стол , однообразно и тесно . лестница на второй этаж отеля ( третий этаж здания ) с деревянными ступеньками откровенно опасна для проживающих . в номере исправно работающие холодильник и телевизор . при заселении в но

##### **Spanish** 

In [None]:
sp_df = combined_files[combined_files['language']=='spanish']
show_distribution(sp_df, 'label', 'Distribution of Sentiment for Spanish')

In [None]:
# Random Text
show_random_text(sp_df)

Index: 0

Textual data: muy recomendable .

Underlying Sentiment: positive


Index: 1

Textual data: estuve en el otro , el q-art , y sal muy descontento del local en el colette hay zona de no fumadores ?

Underlying Sentiment: positive


Index: 2

Textual data: comida abundante , buena relacin calidad-precio si pides entrante + segundo se puede cenar por unos 12 euros

Underlying Sentiment: positive




##### **German**

In [None]:
ge_df = combined_files[combined_files['language']=='german']
show_distribution(ge_df, 'label', 'Distribution of Sentiment for German')

In [None]:
# Random Text
show_random_text(ge_df)

Index: 0

Textual data: waren am dienstag zum special " candle light " - 2 pizzen und eine flasche wein für 29,90 - vor ort . man sitzt angenehm , die pizza war frisch und lecker und der rotwein , ein montepulcciano , ebenfalls lecker . wir kommen wieder .

Underlying Sentiment: positive


Index: 1

Textual data: ambiente eines billigen strandclubs in der türkei , der nachbar sitzt fast auf dem schoss weil es eng ist , die musik laut und unpassend ( fetenhits 80er ) , gedränge und warme getränke die man gewöhnlich kalt trinkt . der eingang wird von 2 arroganten kleinen mädchen bedient , die sich auf irgendetwas was einbilden , unklar auf was . dazu gehen im laden afrikanische prostituierte auf männerfang . achja das essen : zu teuer , aber gut . für 1/3 des preises in anderen lokalen anzurufen . fazit : viel lärm um nichts

Underlying Sentiment: negative


Index: 2

Textual data: wir waren mit unseren nun schon fast großen töchtern ( 10 ) zu einem tagesausflug nach münchen aufgebrochen

## **Model Evaluation**

In [None]:
# Zero-shot classification pipeline from Hugging Face Transformers, built on the
# 'joeddav/xlm-roberta-large-xnli' checkpoint. It is the default classifier of
# run_batch_prediction, so this cell must run before that function is defined.
zsmlc_classifier = pipeline("zero-shot-classification", 
                            model='joeddav/xlm-roberta-large-xnli')

In [None]:
# Candidate labels 
#candidate_labels = list(combined_files.label.unique())

In [None]:
def make_prediction(clf_result):
  """Return the label that received the highest score.

  Args:
    clf_result: Mapping with a ``"scores"`` sequence and a ``"labels"``
      sequence aligned position by position.

  Returns:
    The entry of ``clf_result["labels"]`` at the position of the largest
    score (the first one when several scores tie).
  """
  scores, labels = clf_result["scores"], clf_result["labels"]
  best_position = int(np.argmax(scores))
  return labels[best_position]


def run_batch_prediction(original_data, label_column='label', text_column = 'text',
                         my_classifier = zsmlc_classifier, candidate_labels = None):
  """Classify every text of a DataFrame and attach the predicted labels.

  Args:
    original_data: DataFrame holding the texts (and, unless
      ``candidate_labels`` is given, the reference labels).
    label_column: Column whose distinct values are used as candidate labels
      when ``candidate_labels`` is None. Defaults to ``'label'``.
    text_column: Column holding the texts to classify. Defaults to ``'text'``.
    my_classifier: Callable invoked as
      ``my_classifier(text, candidate_labels, multi_class=True)`` that returns
      a mapping with ``"labels"`` and ``"scores"``. Defaults to the notebook's
      zero-shot pipeline.
    candidate_labels: Optional explicit list of labels to choose from. Passing
      it keeps the label set independent of what happens to be present in the
      evaluated sample. Defaults to None (derive from ``label_column``).

  Returns:
    A copy of ``original_data`` with an extra ``model_labels`` column; the
    input frame is left untouched.
  """
  # Make a copy of the data
  data_copy = original_data.copy()

  # The candidate labels do not change from row to row: compute them once
  if candidate_labels is None:
    candidate_labels = list(original_data[label_column].unique())
  else:
    candidate_labels = list(candidate_labels)

  # The list that will contain the model's predictions
  final_list_labels = []

  for sequence in original_data[text_column]:
    # Run classification; the label with the top score is kept as prediction
    result = my_classifier(sequence, candidate_labels, multi_class = True)
    final_list_labels.append(make_prediction(result))

  # Create the new column for the predictions
  data_copy["model_labels"] = final_list_labels

  return data_copy

### **Get Sample Data For Evaluation.** 
I decided to evaluate on a small random sample from each language so that the predictions do not take ages to finish :(

In [None]:
from sklearn.metrics import accuracy_score, classification_report, f1_score

In [None]:
def get_data_sample(df, language, sample_size = 100, random_state = None):
  """Draw a random sample of the rows written in one language.

  Args:
    df: DataFrame with a ``language`` column.
    language: Value of the ``language`` column to keep.
    sample_size: Number of rows to draw. When fewer rows are available, all
      of them are returned (in random order). Defaults to 100.
    random_state: Seed forwarded to ``DataFrame.sample``; set it to make the
      evaluation sample reproducible. Defaults to None (unseeded).

  Returns:
    DataFrame with at most ``sample_size`` rows of the requested language.
  """
  lang_df = df[df['language']==language]

  # Never request more rows than exist, which would make sample() raise
  n_rows = min(sample_size, len(lang_df))

  return lang_df.sample(n_rows, random_state=random_state)


def get_performance(df):
  """Compute accuracy and macro-averaged F1 of the predictions.

  Args:
    df: DataFrame with the reference labels in ``label`` and the predicted
      labels in ``model_labels``.

  Returns:
    Dict with the keys ``"accuracy"`` and ``"f1_score"`` (the ``f1-score`` of
    the ``macro avg`` entry of scikit-learn's classification report).
  """
  y_true, y_pred = df["label"], df["model_labels"]

  accuracy = accuracy_score(y_true, y_pred)
  macro_avg = classification_report(y_true, y_pred, output_dict=True)['macro avg']

  return {"accuracy": accuracy, "f1_score": macro_avg["f1-score"]}

def predictions_evaluation(data):
  """Predict labels for ``data`` and score them against the reference labels.

  Args:
    data: DataFrame accepted by ``run_batch_prediction`` with its default
      column names.

  Returns:
    The performance dict produced by ``get_performance``.
  """
  return get_performance(run_batch_prediction(data))

#### **Evaluation on English**

In [None]:
en_sample_data = get_data_sample(combined_files, 'english')
pred_eval = predictions_evaluation(en_sample_data)
print(pred_eval)

{'accuracy': 0.94, 'f1_score': 0.9379909053327822}


#### **Evaluation on French**

In [None]:
fr_sample_data = get_data_sample(combined_files, 'french')
pred_eval = predictions_evaluation(fr_sample_data)
print(pred_eval)

{'accuracy': 0.91, 'f1_score': 0.90292309351742}


#### **Evaluation on Russian**

In [None]:
ru_sample_data = get_data_sample(combined_files, 'russian')
pred_eval = predictions_evaluation(ru_sample_data)
print(pred_eval)

{'accuracy': 0.89, 'f1_score': 0.8677725688183677}


#### **Evaluation on Spanish**

In [None]:
sp_sample_data = get_data_sample(combined_files, 'spanish')
pred_eval = predictions_evaluation(sp_sample_data)
print(pred_eval)

{'accuracy': 0.85, 'f1_score': 0.7699033594109527}


#### **Evaluation on German**

In [None]:
ge_sample_data = get_data_sample(combined_files, 'german')
pred_eval = predictions_evaluation(ge_sample_data)
print(pred_eval)

{'accuracy': 0.96, 'f1_score': 0.948024948024948}


In [None]:
#en_sample_data.language.unique()