# Libraries

In [1]:
import pandas as pd
import numpy as np
#import nltk
import spacy
import re
import unicodedata
nlp = spacy.load('es_core_news_sm')
%matplotlib inline

In [2]:
import warnings
# NOTE(review): this silences every warning category for the whole kernel session,
# including deprecation and convergence warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

# data

In [3]:
# Load the training CSV; the path is relative to the notebook's working directory.
df=pd.read_csv('data/train.csv')

# Filter

In [4]:
# Category label of every row, split by label quality (duplicates are kept).
temp1 = df.loc[df['label_quality'] == 'unreliable', 'category'].tolist()
temp2 = df.loc[df['label_quality'] == 'reliable', 'category'].tolist()

In [5]:
# Categories that appear among 'unreliable' rows but never among 'reliable' rows.
cat_not_in_reliable=list(set(temp1) - set(temp2))

In [6]:
# Rows per category, counted over rows that are 'reliable' or whose category
# has no 'reliable' rows at all.
cat_count=df[(df['label_quality']=='reliable')|(df['category'].isin(cat_not_in_reliable))]['category'].value_counts()

In [7]:
# Categories whose count in cat_count is below 4000.
cat_low_count=cat_count[cat_count<4000].index.tolist()

In [8]:
# Keep every 'reliable' row, plus all rows (any label quality) of categories that
# have no 'reliable' rows or whose cat_count is below 4000.
# NOTE(review): this rebinds `df`, so re-running the cells above after this one
# operates on the already-filtered frame.
df=df[(df['label_quality']=='reliable')|(df['category'].isin(cat_not_in_reliable))|(df['category'].isin(cat_low_count))]

In [9]:
# Categories with at most 400 rows in the filtered frame (all languages).
all_cat = df['category'].value_counts()
all_cat = all_cat[all_cat <= 400].index.tolist()

# Per-category row counts restricted to Spanish-language rows.
df_spanish_temp = df.loc[df['language'] == 'spanish']
counts_spanish = df_spanish_temp['category'].value_counts()
low_spanish = counts_spanish[counts_spanish < 200]

# Training frame for the Spanish model: all Spanish rows, plus rows in any language
# whose category has no Spanish rows, has at most 400 rows overall, or has fewer
# than 200 Spanish rows.
df_spanish = df[
    (df['language'] == 'spanish')
    | ~df['category'].isin(counts_spanish.index)
    | df['category'].isin(all_cat)
    | df['category'].isin(low_spanish.index)
]

# ML

## Preprocessing

In [10]:
def clean_text(text):
    """Normalise a title to lowercase ASCII letters separated by single spaces.

    The input is coerced with ``str()``, accents are stripped via NFKD
    decomposition, hyphens become spaces, dots are removed, every remaining
    character outside a-z/A-Z becomes a space, and whitespace is collapsed.
    Digits are therefore dropped.
    """
    ascii_only = (
        unicodedata.normalize('NFKD', str(text))
        .encode('ascii', errors='ignore')
        .decode('utf-8')
    )
    lowered = ascii_only.lower()
    # NOTE(review): the first pattern is a raw string, so it matches two literal
    # backslashes followed by "n", not a newline character; confirm that is intended.
    for old, new in ((r'\\n', ''), (r'-', ' '), (r'.', '')):
        lowered = lowered.replace(old, new)
    letters_only = re.sub("[^a-zA-Z]", " ", lowered.strip())
    # An alternative that also dropped one-character tokens was left disabled here.
    return " ".join(letters_only.split())

In [11]:
def lemmatization(text):
    """Return ``text`` with every token replaced by its lemma, joined by single spaces.

    Tokenisation and lemmas come from the module-level spaCy pipeline ``nlp``.
    """
    return " ".join(token.lemma_ for token in nlp(text))

In [12]:
#spacy_stopwords = spacy.lang.es.stop_words.STOP_WORDS

## sample

In [13]:
# Target number of rows per category for the undersampler: categories with more
# than 2200 rows are capped at 2200, smaller categories keep their current count.
counts_spanish = df_spanish['category'].value_counts()
sampling_spanish = {
    cat: 2200 if n_rows > 2200 else n_rows
    for cat, n_rows in zip(counts_spanish.index, counts_spanish.values)
}

In [14]:
from imblearn.pipeline import Pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE,BorderlineSMOTE
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import TruncatedSVD

In [15]:
from sklearn.metrics import balanced_accuracy_score

In [16]:
# Model input is the title text; the target is the category label.
X_spanish = df_spanish['title']
y_spanish = df_spanish['category']

In [17]:
# Randomly undersample each category down to its target count in sampling_spanish.
# The titles are reshaped to a single-column 2-D array before being passed to the sampler.
rus_spanish = RandomUnderSampler(sampling_strategy=sampling_spanish,random_state=42)
X_res_spanish, y_res_spanish = rus_spanish.fit_resample(X_spanish.values.reshape(-1, 1), y_spanish)

In [18]:
#SPANISH
X_res_spanish=pd.DataFrame(X_res_spanish)
X_res_spanish.columns=['title']
X_res_spanish['title']=X_res_spanish['title'].astype('str')
y_res_spanish=pd.Series(y_res_spanish)

In [19]:
# Class distribution after undersampling (the largest classes sit at the 2200 cap).
print(y_res_spanish.value_counts())

MOTORCYCLE_OIL_PUMPS                    2200
EASTER_EGGS                             2200
TABLE_CLOCKS                            2200
NETWORK_CABLES                          2200
FACE_MASKS                              2200
                                        ... 
CONSTRUCTION_LIME_BAGS                   206
COLD_FOOD_AND_DRINK_VENDING_MACHINES     162
PAINTBALL_SMOKE_GRENADES                 154
COMMERCIAL_POPCORN_MACHINES              141
HAMBURGER_FORMERS                        109
Length: 1588, dtype: int64


In [20]:
# Number of distinct categories remaining after undersampling.
print('y_res_spanish :',y_res_spanish.nunique())

y_res_spanish : 1588


## lemmatization

In [21]:
#X_res_spanish['title']=X_res_spanish['title'].apply(lemmatization)

## CLEAN

In [22]:
# Normalise every title with clean_text. The 'title' column is overwritten in place,
# so the raw titles are not kept in this frame.
# NOTE(review): clean_text removes every non-letter character, so digits in titles
# (sizes, model numbers) are lost before vectorisation.
X_res_spanish['title']=X_res_spanish['title'].apply(clean_text)

In [23]:
# Preview the first cleaned titles.
X_res_spanish.head()

Unnamed: 0,title
0,lg d glasses
1,g bt bluetooth d gafas obturador activas para ...
2,gafas dultra clear hd hz dlp link d gafas de o...
3,el gafas d con obturador activo compatibles con g
4,marco metal vendimia circulo ronda lente lente...


## train_test_split

In [24]:
# Stratified split on the category label; 46% of the rows are held out for testing.
X_train_spanish, X_test_spanish, y_train_spanish, y_test_spanish = train_test_split(
    X_res_spanish['title'],
    y_res_spanish,
    test_size=0.46,
    random_state=42,
    stratify=y_res_spanish,
)

In [25]:
# Number of training titles.
print('X_train_spanish : ',X_train_spanish.shape)

X_train_spanish :  (1597357,)


In [26]:
# Number of training labels (should match the number of training titles).
print('y_train_spanish : ',y_train_spanish.shape)

y_train_spanish :  (1597357,)


In [27]:
# Number of distinct categories present in the held-out labels.
print('y_test_spanish : ',y_test_spanish.nunique())

y_test_spanish :  1588


In [28]:
# Per-category row counts in the held-out split.
print('y_test_spanish : \n',y_test_spanish.value_counts())

y_test_spanish : 
 MOTORCYCLE_OIL_PUMPS                    1012
LOAFERS_AND_OXFORDS                     1012
FALSE_EYELASHES                         1012
POLYFUSION_DEVICES                      1012
BREAKFASTS_AND_AFTERNOON_TEAS           1012
                                        ... 
CONSTRUCTION_LIME_BAGS                    95
COLD_FOOD_AND_DRINK_VENDING_MACHINES      75
PAINTBALL_SMOKE_GRENADES                  71
COMMERCIAL_POPCORN_MACHINES               65
HAMBURGER_FORMERS                         50
Length: 1588, dtype: int64


## MultinomialNB

In [29]:
# Token counts -> SMOTE oversampling -> multinomial naive Bayes.
# The vocabulary is pruned with max_df / min_df and capped at max_features terms;
# no stop-word list is passed.
bayes_text_clf = make_pipeline_imb([
    ('CountVectorizer', CountVectorizer(
        encoding='latin-1',
        max_df=0.90,
        min_df=10,
        max_features=200000,
    )),
    ('SMOTE', SMOTE(random_state=42)),
    ('MultinomialNB', MultinomialNB()),
])

In [30]:
# Fit the naive Bayes pipeline on the training split; fit() returns the pipeline itself,
# so bayes_spanish_clf and bayes_text_clf refer to the same fitted object.
bayes_spanish_clf=bayes_text_clf.fit(X_train_spanish,y_train_spanish)

In [31]:
# Number of distinct categories in the held-out labels.
y_test_spanish.nunique()

1588

In [32]:
# Number of held-out titles.
X_test_spanish.shape

(1360713,)

In [33]:
# Balanced accuracy of the naive Bayes pipeline on the first half of the test split.
# NOTE(review): head/tail of size n//2 leave the middle row unscored when n is odd.
n_half = X_test_spanish.shape[0] // 2
y_pred_spanish = bayes_spanish_clf.predict(X_test_spanish.head(n_half))
balanced_accuracy_score(y_test_spanish.head(n_half), y_pred_spanish)

0.8122125475641203

In [34]:
# NOTE(review): bare literal with no computation — presumably a score kept from an
# earlier run for comparison; confirm, and consider moving it to a markdown cell.
0.8122125475641203

0.8122125475641203

In [35]:
# Balanced accuracy of the naive Bayes pipeline on the last half of the test split.
n_half = X_test_spanish.shape[0] // 2
y_pred_spanish = bayes_spanish_clf.predict(X_test_spanish.tail(n_half))
balanced_accuracy_score(y_test_spanish.tail(n_half), y_pred_spanish)

0.8111589669732492

In [36]:
# NOTE(review): bare literal with no computation — presumably a score kept from an
# earlier run for comparison; confirm, and consider moving it to a markdown cell.
0.8120374590842147

0.8120374590842147

In [37]:
# Number of distinct categories among the most recently computed predictions.
pd.Series(y_pred_spanish).nunique()

1588

## Logistic Regression

In [32]:
from sklearn.linear_model import LogisticRegression

In [33]:
# TF-IDF features -> SMOTE oversampling -> logistic regression with default settings.
# The vocabulary is pruned with max_df / min_df and capped at max_features terms;
# no stop-word list is passed.
lr_text_clf = make_pipeline_imb([
    ('TfidfVectorizer', TfidfVectorizer(
        encoding='latin-1',
        max_df=0.90,
        min_df=5,
        max_features=100000,
    )),
    ('SMOTE', SMOTE(random_state=42)),
    ('LogisticRegression', LogisticRegression()),
])

In [34]:
# Fit the logistic-regression pipeline on the training split; fit() returns the
# pipeline itself, so lr_clf and lr_text_clf refer to the same fitted object.
lr_clf=lr_text_clf.fit(X_train_spanish,y_train_spanish)

In [35]:
# Number of distinct categories in the held-out labels.
y_test_spanish.nunique()

1588

In [36]:
# Number of held-out titles.
# NOTE(review): the recorded output (1066605) differs from the 1360713 recorded in the
# MultinomialNB section, so this section appears to have been run against a different
# data state; re-run the notebook top to bottom before comparing scores.
X_test_spanish.shape

(1066605,)

In [37]:
# Balanced accuracy of the logistic-regression pipeline on the first half of the test split.
# NOTE(review): head/tail of size n//2 leave the middle row unscored when n is odd.
n_half = X_test_spanish.shape[0] // 2
y_pred_spanish = lr_clf.predict(X_test_spanish.head(n_half))
balanced_accuracy_score(y_test_spanish.head(n_half), y_pred_spanish)

0.829766210668563

In [38]:
# Balanced accuracy of the logistic-regression pipeline on the last half of the test split.
n_half = X_test_spanish.shape[0] // 2
y_pred_spanish = lr_clf.predict(X_test_spanish.tail(n_half))
balanced_accuracy_score(y_test_spanish.tail(n_half), y_pred_spanish)

0.8305113042086972

In [39]:
# Number of distinct categories among the most recently computed predictions.
pd.Series(y_pred_spanish).nunique()

1588

## SVM Linear CountVectorizer

In [24]:
from sklearn.svm import LinearSVC

In [24]:
# Token counts -> linear SVM with default settings. Unlike the other pipelines in
# this notebook there is no SMOTE step here (it is commented out in this variant).
# The vocabulary is pruned with max_df / min_df and capped at max_features terms;
# no stop-word list is passed.
LinearSVC_text_clf = make_pipeline_imb([
    ('CountVectorizer', CountVectorizer(
        encoding='latin-1',
        max_df=0.90,
        min_df=5,
        max_features=100000,
    )),
    # ('SMOTE', SMOTE(random_state=42)),
    ('LinearSVC', LinearSVC()),
])

In [25]:
# Fit the count-vector LinearSVC pipeline on the training split.
# NOTE(review): the name LinearSVC_spanish_clf is assigned again in the TF-IDF + SMOTE
# section further down, so which model it holds depends on execution order.
LinearSVC_spanish_clf=LinearSVC_text_clf.fit(X_train_spanish,y_train_spanish)

In [33]:
# Number of distinct categories in the held-out labels.
# NOTE(review): the recorded output (1560) differs from the 1588 recorded in the other
# sections, so this section appears to have been run against a different data state.
y_test_spanish.nunique()

1560

In [28]:
# Number of held-out titles.
# NOTE(review): the recorded output (1284403) matches none of the other sections'
# test sizes; re-run the notebook top to bottom before comparing scores.
X_test_spanish.shape

(1284403,)

In [26]:
# Balanced accuracy of the LinearSVC pipeline on the first half of the test split.
# NOTE(review): head/tail of size n//2 leave the middle row unscored when n is odd.
n_half = X_test_spanish.shape[0] // 2
y_pred_spanish = LinearSVC_spanish_clf.predict(X_test_spanish.head(n_half))
balanced_accuracy_score(y_test_spanish.head(n_half), y_pred_spanish)

0.8322812746417401

In [27]:
# Balanced accuracy of the LinearSVC pipeline on the last half of the test split.
n_half = X_test_spanish.shape[0] // 2
y_pred_spanish = LinearSVC_spanish_clf.predict(X_test_spanish.tail(n_half))
balanced_accuracy_score(y_test_spanish.tail(n_half), y_pred_spanish)

0.8331038702548689

In [32]:
# Number of distinct categories among the most recently computed predictions.
pd.Series(y_pred_spanish).nunique()

1560

## Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier

In [30]:
# TF-IDF features -> SMOTE oversampling -> linear SVM; one of the two voters below.
# NOTE(review): this rebinds LinearSVC_text_clf, which the count-vector SVM section
# also defines.
LinearSVC_text_clf = make_pipeline_imb([
    ('TfidfVectorizer', TfidfVectorizer(
        encoding='latin-1',
        max_df=0.90,
        min_df=5,
        max_features=100000,
    )),
    ('SMOTE', SMOTE(random_state=42)),
    ('LinearSVC', LinearSVC()),
])

In [31]:
# Token counts -> SMOTE oversampling -> multinomial naive Bayes; the second voter.
# NOTE(review): this rebinds bayes_text_clf with different vectoriser limits
# (min_df=5, max_features=100000) than the MultinomialNB section uses.
bayes_text_clf = make_pipeline_imb([
    ('CountVectorizer', CountVectorizer(
        encoding='latin-1',
        max_df=0.90,
        min_df=5,
        max_features=100000,
    )),
    ('SMOTE', SMOTE(random_state=42)),
    ('MultinomialNB', MultinomialNB()),
])

In [32]:
# (name, estimator) pairs handed to the voting ensemble; each estimator is a full
# vectoriser + SMOTE + model pipeline, so each voter vectorises the raw titles itself.
classifiers = [
    ('MNB', bayes_text_clf),
    ('LinearSVC', LinearSVC_text_clf),
    ]

In [33]:
# No `voting` argument is passed, so VotingClassifier's default voting mode applies;
# n_jobs=1 fits the voters sequentially.
voting_clf = VotingClassifier(classifiers, n_jobs=1)

In [34]:
# Fit both voter pipelines on the training split; fit() returns the ensemble itself.
voting_classifier=voting_clf.fit(X_train_spanish,y_train_spanish)

In [35]:
# Number of distinct categories in the held-out labels.
y_test_spanish.nunique()

1588

In [36]:
# Number of held-out titles.
# NOTE(review): the recorded output (1066605) differs from the 1360713 recorded in the
# MultinomialNB section; re-run the notebook top to bottom before comparing scores.
X_test_spanish.shape

(1066605,)

In [37]:
# Balanced accuracy of the voting ensemble on the first half of the test split.
# NOTE(review): head/tail of size n//2 leave the middle row unscored when n is odd.
n_half = X_test_spanish.shape[0] // 2
y_pred_spanish = voting_classifier.predict(X_test_spanish.head(n_half))
balanced_accuracy_score(y_test_spanish.head(n_half), y_pred_spanish)

0.8250369244174595

In [38]:
# Balanced accuracy of the voting ensemble on the last half of the test split.
n_half = X_test_spanish.shape[0] // 2
y_pred_spanish = voting_classifier.predict(X_test_spanish.tail(n_half))
balanced_accuracy_score(y_test_spanish.tail(n_half), y_pred_spanish)

0.8254938875574751

In [39]:
# Number of distinct categories among the most recently computed predictions.
pd.Series(y_pred_spanish).nunique()

1588

## SVM Linear TfidfVectorizer SMOTE

In [29]:
# TF-IDF features -> SMOTE oversampling -> linear SVM, with a larger vocabulary cap
# (150000 terms) than the earlier TF-IDF pipelines. This is the pipeline that the
# "Save Model" section pickles when the notebook is run top to bottom.
LinearSVC_text_clf = make_pipeline_imb([
    ('TfidfVectorizer', TfidfVectorizer(
        encoding='latin-1',
        max_df=0.90,
        min_df=5,
        max_features=150000,
    )),
    ('SMOTE', SMOTE(random_state=42)),
    ('LinearSVC', LinearSVC()),
])

In [30]:
# Fit the TF-IDF + SMOTE + LinearSVC pipeline on the training split.
# NOTE(review): this rebinds LinearSVC_spanish_clf, which the count-vector SVM section
# also assigns; which model it holds depends on execution order.
LinearSVC_spanish_clf=LinearSVC_text_clf.fit(X_train_spanish,y_train_spanish)

In [31]:
# Number of distinct categories in the held-out labels.
y_test_spanish.nunique()

1588

In [32]:
# Number of held-out titles.
X_test_spanish.shape

(1360713,)

In [33]:
# Balanced accuracy of the TF-IDF + SMOTE LinearSVC pipeline on the first half of the test split.
# NOTE(review): head/tail of size n//2 leave the middle row unscored when n is odd.
n_half = X_test_spanish.shape[0] // 2
y_pred_spanish = LinearSVC_spanish_clf.predict(X_test_spanish.head(n_half))
balanced_accuracy_score(y_test_spanish.head(n_half), y_pred_spanish)

0.8528419605987269

In [34]:
# NOTE(review): bare literal with no computation — presumably a score kept from an
# earlier run for comparison; confirm, and consider moving it to a markdown cell.
0.849095812843259

0.849095812843259

In [35]:
# Balanced accuracy of the TF-IDF + SMOTE LinearSVC pipeline on the last half of the test split.
n_half = X_test_spanish.shape[0] // 2
y_pred_spanish = LinearSVC_spanish_clf.predict(X_test_spanish.tail(n_half))
balanced_accuracy_score(y_test_spanish.tail(n_half), y_pred_spanish)

0.8524735164411547

In [36]:
# NOTE(review): bare literal with no computation — presumably a score kept from an
# earlier run for comparison; confirm, and consider moving it to a markdown cell.
0.8495081207374371

0.8495081207374371

In [37]:
# Number of distinct categories among the most recently computed predictions.
pd.Series(y_pred_spanish).nunique()

1588

## Save Model

In [38]:
import pickle

In [40]:
# Persist the fitted pipeline held in LinearSVC_spanish_clf to disk.
# The file is opened in a `with` block so the handle is flushed and closed even if
# pickling raises; the original left the handle from open() unclosed.
# NOTE(review): LinearSVC_spanish_clf is assigned in two sections of this notebook;
# make sure the TF-IDF + SMOTE variant was the last one fitted before saving.
# Only unpickle this file from a trusted location: pickle.load can execute arbitrary code.
filename = 'LinearSVC_tfidf_SMOTE_spanish_clf.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(LinearSVC_spanish_clf, model_file)

# FastAI