In [None]:
import csv
import re
import numpy as np
import pandas as pd
import sklearn
from sklearn.pipeline import Pipeline
from io import StringIO
import requests
from nltk.tokenize import word_tokenize
import nltk
from sklearn.preprocessing import LabelEncoder
nltk.download('punkt')
from sklearn.preprocessing import Normalizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### DataLoader

In [None]:
class DataLoader:
    """Download the files and load them into a dataframe.
    
    Attributes
    ----------
    url: Url of the online dataset
    """
    def __init__(self, url):
        self.url = url

    def load_dataset(self):
        r = requests.get(self.url)
        data = r.content.decode('utf8')
        df = pd.read_csv(StringIO(data), sep='\t')
        df.columns = ['tweet', 'label']
        return df

### TweetCleaner

In [None]:
class TweetCleaner:
    """Clean tweet text data.

    This Cleaner performes text pre-processing through a series of 
    operations while creating a NLP application. The operations include
    Lowercasing and Removing URLs, user id, symbol & emoji.

    Attributes
    ----------
    text: series of length n_tweets
    """
     
    def __init__(self, text):
        self.text = text

    def lowercase(self, column):
        column = column.str.lower()
        return column

    def convert_lowercase(self):
        """Convert words into lowercase.

        Examples
        --------
        Before applying lower casing: Fuck c'mon Argentina tie the game please 😭😭😭
        After applying lower casing : fuck c'mon argentina tie the game please 😭😭😭
        """
        return self.lowercase(self.text)

    def url(self, row):
        re_url = re.compile('https?://\S+|www\.\S+')
        return re_url.sub('', row)

    def delete_url(self):
        """Remove URLs by using regular expressions.

        Examples
        --------
        Text before removing URL: توجيه كيفية تثبيت البرامج الثابتة rom التحميل لسامسونج http://t.co/8qrpxfgyyw
        Text after removing URL: توجيه كيفية تثبيت البرامج الثابتة rom التحميل لسامسونج
        """
        return self.text.apply(self.url)

    def usr(self, row):
        re_url = re.compile('@\S+')
        return re_url.sub('', row)

    def remove_usr(self):
        """Remove user ids by using regular expressions.

        Examples
        --------
        Text before removing USR: @dinaa_elaraby اها يا بيبي والله اتهرست علي تويتر و ع الفيس و كله -.-
        Text after removing USR:  اها يا بيبي والله اتهرست علي تويتر و ع الفيس و كله -.-
        """
        return self.text.apply(self.usr)

    def special(self, row):
        x=''
        for i in row:
            if i.isalpha():
                x = x + i
            else:
                x = x + ' '
        return x

    def remove_special(self):
        """Remove punctuations, numbers and emojis.

        Examples
        --------
        Text before removing punctuations & emojis: fuck c'mon argentina tie the game please 😭😭😭
        Text after removing punctuations & emojis: fuck c mon argentina tie the game please    
        """
        return self.text.apply(self.special)   

    def quick_processing(self, ):
        """Integerate pre-processing tools for quick execution 
        without showing step-by-step examples.
        """
        self.text = self.convert_lowercase()
        self.text = self.delete_url()
        self.text = self.remove_usr()
        self.text = self.remove_special()
        # self.text = self.tokenize_word()
        print('Data Cleaning Done.')
        return self.text


In [None]:
url_train_dev = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTOZ2rC82rhNsJduoyKYTsVeH6ukd7Bpxvxn_afOibn3R-eadZGXu82eCU9IRpl4CK_gefEGsYrA_oM/pub?gid=1863430984&single=true&output=tsv'
url_test = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vT-KNR9nuYatLkSbzSRgpz6Ku1n4TN4w6kKmFLkA6QJHTfQzmX0puBsLF7PAAQJQAxUpgruDd_RRgK7/pub?gid=417546901&single=true&output=tsv'

train_loader = DataLoader(url_train_dev)
df_train_dev = train_loader.load_dataset()

test_loader = DataLoader(url_test)
df_test = test_loader.load_dataset()

In [None]:
p1 = TweetCleaner(df_train_dev['tweet'])
df_train_dev['tweet'] = p1.quick_processing()

p2 = TweetCleaner(df_test['tweet'])
df_test['tweet'] = p2.quick_processing()

Data Cleaning Done.
Data Cleaning Done.


### Step-by-step Excution
In case you want to check the data cleaning operations with each function's example output, you need to comment out the quick_processing method in the last cell and execute this program from the beginning.

In [None]:
# print(f"Text before removing URL: {df_train_dev['tweet'][5]}")
# p = TweetCleaner(df_train_dev['tweet'])
# df_train_dev['tweet'] = p.delete_url()

# print(f"Text after removing URL: {df_train_dev['tweet'][5]}")

In [None]:
# print(f"Text before removing USR: {df_train_dev['tweet'][15]}")

# df_train_dev['tweet'] = p.remove_usr()

# print(f"Text after removing USR: {df_train_dev['tweet'][15]}")

In [None]:
# print(f"Text before removing symbol & emoji: {df_train_dev['tweet'][10000]}")

# df_train_dev['tweet'] = p.remove_special()

# print(f"Text after removing symbol & emoji: {df_train_dev['tweet'][10000]}")

In [None]:
# print(f"Text before removing symbol & emoji: {df_train_dev['tweet'][32630:32680]}")

# df_train_dev['tweet'] = p.tokenize_word()

# print(f"Text after removing symbol & emoji: {df_train_dev['tweet'][32630:32680]}")

### Feature Extractor

In [None]:
class FeatureExtractor:
    """Extract feature from text data.

    This Extractor performs feature extraction using TF_IDF(term 
    frequency–inverse document frequency). It aims to quantify the 
    importance of a given word relative to other words in the document 
    and in the corpus.

    Attributes
    ----------
    text: series of length n_tweets
    """

    def __init__(self, text):
        self.tfidf = TfidfVectorizer(max_df = 0.3, max_features = 500 )
        self.text = text

    def extract_feature(self):
        return self.tfidf.fit_transform(self.text)

    def get_feature_names(self):
        return self.tfidf.get_feature_names_out(self.text)

In [None]:
train_extractor = FeatureExtractor(df_train_dev['tweet'])
test_extractor = FeatureExtractor(df_test['tweet'])

X_train = train_extractor.extract_feature()
X_test = test_extractor.extract_feature()

features = train_extractor.get_feature_names()

### Pipeline

In [None]:
# from sklearn.pipeline import Pipeline

# preprocess = Pipeline(steps = [
#     ('data_loader', DataLoader()),
#     ('tweet_cleaner',TweetCleaner()),
#     ('feature_extractor', FeatureExtractor()),
# ],verbose=True)

In [None]:
# features

In [None]:
from sklearn.model_selection import train_test_split

y_train = df_train_dev['label']
y_test = df_test['label']

X, X_valid, y, y_valid = train_test_split(X_train, y_train, test_size=0.1)

### LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

lr = LogisticRegression(penalty = 'l2', solver = 'lbfgs', max_iter = 200)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

report = classification_report(y_test, y_pred)
score = accuracy_score(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print("Accuracy: ", score)
print(report)
print(matrix)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0.5264703667444838
              precision    recall  f1-score   support

          ar       0.86      0.61      0.71       529
     ar_LATN       0.00      0.00      0.00         3
          az       0.00      0.00      0.00         2
          bg       0.00      0.00      0.00         2
          bs       0.00      0.00      0.00         1
          ca       0.00      0.00      0.00         3
          cs       0.00      0.00      0.00         1
          da       0.00      0.00      0.00         1
          de       0.00      0.00      0.00        50
          el       0.00      0.00      0.00        11
          en       0.66      0.76      0.71      4758
          es       0.43      0.27      0.33      1476
          eu       0.00      0.00      0.00         2
          fa       0.00      0.00      0.00         5
          fi       0.00      0.00      0.00         8
          fr       0.10      0.07      0.08       224
          he       0.00      0.00      0.00        14
        

  _warn_prf(average, modifier, msg_start, len(result))


### GridSearchCV for LR
Takes a long time to excute (not advise to try).

In [None]:
# parameters = {'penalty':('l2', 'none'), 'solver':('newton-cg', 'lbfgs', 'sag', 'saga'), }
# clf = GridSearchCV(lr, parameters)
# clf.feature_names_in_ = features
# clf.fit(X_train, y_train)
# clf.best_params_

In [None]:
import eli5
eli5.show_weights(lr, top=10,targets=['en','es','ja'], feature_names = features )

Weight?,Feature,Unnamed: 2_level_0
Weight?,Feature,Unnamed: 2_level_1
Weight?,Feature,Unnamed: 2_level_2
+4.955,<BIAS>,
+4.694,you,
+4.672,that,
+4.292,it,
+4.176,for,
+4.172,just,
+4.142,and,
+3.909,my,
+3.859,this,
+3.852,what,

Weight?,Feature
+4.955,<BIAS>
+4.694,you
+4.672,that
+4.292,it
+4.176,for
+4.172,just
+4.142,and
+3.909,my
+3.859,this
+3.852,what

Weight?,Feature
+5.343,el
+4.896,que
+4.700,gracias
+4.634,las
+4.283,los
+4.276,quiero
+4.034,una
+4.003,voy
+3.894,estoy
+3.893,por

Weight?,Feature
+6.757,<BIAS>
+3.326,定期
… 8 more positive …,… 8 more positive …
… 483 more negative …,… 483 more negative …
-2.689,mtvhottest
-2.746,da
-2.759,ya
-2.789,la
-2.864,na
-2.873,haha


In [None]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(random_state=1).fit(X_train, y_train)
y_pred2 = mlp.predict(X_test)

report2 = classification_report(y_test, y_pred)
score2 = accuracy_score(y_test, y_pred2)
matrix2 = confusion_matrix(y_test, y_pred)

print(score2)
print(report2)
print(matrix2)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0.5044054522177874
              precision    recall  f1-score   support

          ar       0.86      0.61      0.71       529
     ar_LATN       0.00      0.00      0.00         3
          az       0.00      0.00      0.00         2
          bg       0.00      0.00      0.00         2
          bs       0.00      0.00      0.00         1
          ca       0.00      0.00      0.00         3
          cs       0.00      0.00      0.00         1
          da       0.00      0.00      0.00         1
          de       0.00      0.00      0.00        50
          el       0.00      0.00      0.00        11
          en       0.66      0.76      0.71      4758
          es       0.43      0.27      0.33      1476
          eu       0.00      0.00      0.00         2
          fa       0.00      0.00      0.00         5
          fi       0.00      0.00      0.00         8
          fr       0.10      0.07      0.08       224
          he       0.00      0.00      0.00        14
        

  _warn_prf(average, modifier, msg_start, len(result))


### GridSearchCV for MLP
Takes a long time to excute (not advise to try).

In [None]:
# parameters = {'hidden_layer_sizes':((50,), (100,),(150,)), 
#             'solver':('lbfgs', 'sgd', 'adam'), 
#             'early_stopping':(True, False),      
#             }

# clf2 = GridSearchCV(mlp, parameters)
# clf2.feature_names_in_ = features
# clf2.fit(X_train, y_train)

In [None]:
matrix

array([[ 322,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    2,
           0,  203,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    2,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    1,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    2,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    2,    0,    0,   

In [None]:
matrix2

array([[ 322,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    2,
           0,  203,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    2,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    1,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    2,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    2,    0,    0,   