In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
import re 
import nltk
import spacy
import warnings
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings('ignore')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sebastian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sebastian/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Resolve the training CSV relative to the kernel's working directory and load it.
# `path` is kept as a notebook-level name because later cells build other file
# locations from it.
path = os.getcwd()
ruta_entrenamiento = f'{path}/df_entrenamiento_cluster.csv'
full_df = pd.read_csv(ruta_entrenamiento, encoding='latin1')

In [3]:
# Print column names, non-null counts, dtypes and memory usage of the loaded frame.
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953439 entries, 0 to 953438
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Unnamed: 0           953439 non-null  int64  
 1   Id                   953439 non-null  int64  
 2   OwnerUserId          941191 non-null  float64
 3   CreationDate         953439 non-null  object 
 4   Score                953439 non-null  int64  
 5   text                 953439 non-null  object 
 6   label                953439 non-null  int64  
 7   descripcion_cluster  953439 non-null  object 
dtypes: float64(1), int64(4), object(3)
memory usage: 58.2+ MB


In [4]:
# Preview the first rows of the raw training data.
full_df.head()

Unnamed: 0.1,Unnamed: 0,Id,OwnerUserId,CreationDate,Score,text,label,descripcion_cluster
0,0,80,26.0,2008-08-01T13:57:07Z,26,SQLStatement.execute() - multiple queries in o...,12,Problemas de manejar multiples entidades: cone...
1,1,90,58.0,2008-08-01T14:41:24Z,144,Good branching and merging tutorials for Torto...,6,"Temas generales de desarrollo: branching, scri..."
2,2,120,83.0,2008-08-01T15:50:08Z,21,ASP.NET Site Maps,6,"Temas generales de desarrollo: branching, scri..."
3,3,180,2089740.0,2008-08-01T18:42:19Z,53,Function for creating color wheels,6,"Temas generales de desarrollo: branching, scri..."
4,4,260,91.0,2008-08-01T23:22:08Z,49,Adding scripting functionality to .NET applica...,6,"Temas generales de desarrollo: branching, scri..."


In [5]:
# Drop the leftover unnamed column. `errors='ignore'` makes this cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
full_df = full_df.drop(columns=['Unnamed: 0'], errors='ignore')
# Force every entry of `text` to `str` so the string helpers used by the
# following cells always receive a string.
full_df['text'] = full_df['text'].astype(str)
# Working frame with a fresh 0..n-1 index; the preprocessing columns are added to `df`.
df = full_df.reset_index(drop=True)

In [6]:
def remove_punc(text, punctuation=None):
    """Return `text` with every character contained in `punctuation` deleted.

    Args:
        text: String to clean.
        punctuation: String of characters to strip. When omitted, the
            notebook-level `punc_to_remove` is looked up at call time, so
            existing one-argument uses such as `.apply(remove_punc)` keep
            working unchanged.

    Returns:
        The cleaned string.

    Raises:
        NameError: if `punctuation` is omitted and `punc_to_remove` has not
            been defined yet.
    """
    if punctuation is None:
        punctuation = punc_to_remove
    # A translation table whose third argument lists the characters to delete.
    return text.translate(str.maketrans('', '', punctuation))
def remove_stopwords(text, stpwords):
    """Drop every whitespace-separated token of `text` that is in `stpwords`.

    `text` is coerced with `str()` first; the surviving tokens are re-joined
    with single spaces.
    """
    kept = []
    for token in str(text).split():
        if token in stpwords:
            continue
        kept.append(token)
    return ' '.join(kept)
def remove_freqwords(text, frequent_words=None):
    """Drop every whitespace-separated token of `text` found in `frequent_words`.

    Args:
        text: Value to clean; coerced with `str()` before splitting.
        frequent_words: Container of tokens to remove. When omitted, the
            notebook-level `freqwords` is looked up at call time, so existing
            one-argument uses such as `.apply(remove_freqwords)` keep working.

    Returns:
        The remaining tokens joined by single spaces.

    Raises:
        NameError: if `frequent_words` is omitted and `freqwords` has not been
            defined yet.
    """
    if frequent_words is None:
        frequent_words = freqwords
    return ' '.join(word for word in str(text).split() if word not in frequent_words)
def lemmatize_words(text, lemmatizer_obj=None):
    """Lemmatize each whitespace-separated token of `text`.

    Args:
        text: String to process.
        lemmatizer_obj: Any object exposing `lemmatize(word)`. When omitted,
            the notebook-level `lemmatizer` is looked up at call time, so
            existing one-argument uses such as `.apply(lemmatize_words)` keep
            working.

    Returns:
        The lemmatized tokens joined by single spaces.

    Raises:
        NameError: if `lemmatizer_obj` is omitted and `lemmatizer` has not
            been defined yet.
    """
    if lemmatizer_obj is None:
        lemmatizer_obj = lemmatizer
    return ' '.join(lemmatizer_obj.lemmatize(word) for word in text.split())
def stem_words(text, stemmer_obj=None):
    """Stem each whitespace-separated token of `text`.

    The original body read a notebook-level `stemmer` that none of the visible
    cells defines, so any call raised NameError. The stemmer is now resolved
    explicitly.

    Args:
        text: String to process.
        stemmer_obj: Any object exposing `stem(word)`. When omitted, a
            notebook-level `stemmer` is used if one exists; otherwise a new
            `PorterStemmer` (imported at the top of the notebook) is created.
            For bulk use, build one stemmer and pass it in to avoid
            re-creating it on every call.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    if stemmer_obj is None:
        stemmer_obj = globals().get('stemmer') or PorterStemmer()
    return ' '.join(stemmer_obj.stem(word) for word in text.split())
def remove_numbers(text):
    """Delete every run of digits from `text`; all other characters are kept.

    Separators between digit groups are untouched, e.g.
    "python 3.4.3" becomes "python ..".
    """
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)
def remove_account(text):
    """Strip @-mentions: an '@' immediately followed by one or more word characters.

    A lone '@' (not followed by a word character) is left in place.
    """
    mention = re.compile(r"@\w+")
    return mention.sub("", text)

In [7]:
# Lower-case the text so the token comparisons in the following cells are
# case-insensitive.
texto_minusculas = df['text'].str.lower()
df['text_lower'] = texto_minusculas
df.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,text,label,descripcion_cluster,text_lower
0,80,26.0,2008-08-01T13:57:07Z,26,SQLStatement.execute() - multiple queries in o...,12,Problemas de manejar multiples entidades: cone...,sqlstatement.execute() - multiple queries in o...
1,90,58.0,2008-08-01T14:41:24Z,144,Good branching and merging tutorials for Torto...,6,"Temas generales de desarrollo: branching, scri...",good branching and merging tutorials for torto...
2,120,83.0,2008-08-01T15:50:08Z,21,ASP.NET Site Maps,6,"Temas generales de desarrollo: branching, scri...",asp.net site maps
3,180,2089740.0,2008-08-01T18:42:19Z,53,Function for creating color wheels,6,"Temas generales de desarrollo: branching, scri...",function for creating color wheels
4,260,91.0,2008-08-01T23:22:08Z,49,Adding scripting functionality to .NET applica...,6,"Temas generales de desarrollo: branching, scri...",adding scripting functionality to .net applica...


In [8]:
# Every ASCII punctuation character except '.', so dotted tokens such as
# "asp.net" are left intact by remove_punc.
punc_to_remove = ''.join(ch for ch in string.punctuation if ch != '.')
df['punc_remove'] = df['text_lower'].map(remove_punc)
df.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,text,label,descripcion_cluster,text_lower,punc_remove
0,80,26.0,2008-08-01T13:57:07Z,26,SQLStatement.execute() - multiple queries in o...,12,Problemas de manejar multiples entidades: cone...,sqlstatement.execute() - multiple queries in o...,sqlstatement.execute multiple queries in one ...
1,90,58.0,2008-08-01T14:41:24Z,144,Good branching and merging tutorials for Torto...,6,"Temas generales de desarrollo: branching, scri...",good branching and merging tutorials for torto...,good branching and merging tutorials for torto...
2,120,83.0,2008-08-01T15:50:08Z,21,ASP.NET Site Maps,6,"Temas generales de desarrollo: branching, scri...",asp.net site maps,asp.net site maps
3,180,2089740.0,2008-08-01T18:42:19Z,53,Function for creating color wheels,6,"Temas generales de desarrollo: branching, scri...",function for creating color wheels,function for creating color wheels
4,260,91.0,2008-08-01T23:22:08Z,49,Adding scripting functionality to .NET applica...,6,"Temas generales de desarrollo: branching, scri...",adding scripting functionality to .net applica...,adding scripting functionality to .net applica...


In [9]:
# English stop-word list from NLTK, held in a set for fast membership tests.
stpwords = set(stopwords.words('english'))
# `args` forwards the stop-word set as the second positional argument of the helper.
df['stopwords_remove'] = df['punc_remove'].apply(remove_stopwords, args=(stpwords,))
df.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,text,label,descripcion_cluster,text_lower,punc_remove,stopwords_remove
0,80,26.0,2008-08-01T13:57:07Z,26,SQLStatement.execute() - multiple queries in o...,12,Problemas de manejar multiples entidades: cone...,sqlstatement.execute() - multiple queries in o...,sqlstatement.execute multiple queries in one ...,sqlstatement.execute multiple queries one stat...
1,90,58.0,2008-08-01T14:41:24Z,144,Good branching and merging tutorials for Torto...,6,"Temas generales de desarrollo: branching, scri...",good branching and merging tutorials for torto...,good branching and merging tutorials for torto...,good branching merging tutorials tortoisesvn
2,120,83.0,2008-08-01T15:50:08Z,21,ASP.NET Site Maps,6,"Temas generales de desarrollo: branching, scri...",asp.net site maps,asp.net site maps,asp.net site maps
3,180,2089740.0,2008-08-01T18:42:19Z,53,Function for creating color wheels,6,"Temas generales de desarrollo: branching, scri...",function for creating color wheels,function for creating color wheels,function creating color wheels
4,260,91.0,2008-08-01T23:22:08Z,49,Adding scripting functionality to .NET applica...,6,"Temas generales de desarrollo: branching, scri...",adding scripting functionality to .net applica...,adding scripting functionality to .net applica...,adding scripting functionality .net applications


In [10]:
# Corpus-wide token frequencies over the stop-word-free text.
ct = Counter()
for titulo in df['stopwords_remove'].values:
    ct.update(titulo.split())

# The 10 most common tokens are removed from every row.
freqwords = {palabra for palabra, _ in ct.most_common(10)}
df['freq_word_remove'] = df['stopwords_remove'].map(remove_freqwords)
df.head()


Unnamed: 0,Id,OwnerUserId,CreationDate,Score,text,label,descripcion_cluster,text_lower,punc_remove,stopwords_remove,freq_word_remove
0,80,26.0,2008-08-01T13:57:07Z,26,SQLStatement.execute() - multiple queries in o...,12,Problemas de manejar multiples entidades: cone...,sqlstatement.execute() - multiple queries in o...,sqlstatement.execute multiple queries in one ...,sqlstatement.execute multiple queries one stat...,sqlstatement.execute multiple queries one stat...
1,90,58.0,2008-08-01T14:41:24Z,144,Good branching and merging tutorials for Torto...,6,"Temas generales de desarrollo: branching, scri...",good branching and merging tutorials for torto...,good branching and merging tutorials for torto...,good branching merging tutorials tortoisesvn,good branching merging tutorials tortoisesvn
2,120,83.0,2008-08-01T15:50:08Z,21,ASP.NET Site Maps,6,"Temas generales de desarrollo: branching, scri...",asp.net site maps,asp.net site maps,asp.net site maps,asp.net site maps
3,180,2089740.0,2008-08-01T18:42:19Z,53,Function for creating color wheels,6,"Temas generales de desarrollo: branching, scri...",function for creating color wheels,function for creating color wheels,function creating color wheels,function creating color wheels
4,260,91.0,2008-08-01T23:22:08Z,49,Adding scripting functionality to .NET applica...,6,"Temas generales de desarrollo: branching, scri...",adding scripting functionality to .net applica...,adding scripting functionality to .net applica...,adding scripting functionality .net applications,adding scripting functionality .net applications


In [11]:
# WordNet lemmatizer shared by lemmatize_words (it reads this notebook-level name).
lemmatizer = WordNetLemmatizer()
texto_lematizado = df['freq_word_remove'].map(lemmatize_words)
df['text_lemmatize'] = texto_lematizado
# head(-1) returns every row except the last one, so the display shows the
# start and the end of the frame.
df.head(-1)

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,text,label,descripcion_cluster,text_lower,punc_remove,stopwords_remove,freq_word_remove,text_lemmatize
0,80,26.0,2008-08-01T13:57:07Z,26,SQLStatement.execute() - multiple queries in o...,12,Problemas de manejar multiples entidades: cone...,sqlstatement.execute() - multiple queries in o...,sqlstatement.execute multiple queries in one ...,sqlstatement.execute multiple queries one stat...,sqlstatement.execute multiple queries one stat...,sqlstatement.execute multiple query one statement
1,90,58.0,2008-08-01T14:41:24Z,144,Good branching and merging tutorials for Torto...,6,"Temas generales de desarrollo: branching, scri...",good branching and merging tutorials for torto...,good branching and merging tutorials for torto...,good branching merging tutorials tortoisesvn,good branching merging tutorials tortoisesvn,good branching merging tutorial tortoisesvn
2,120,83.0,2008-08-01T15:50:08Z,21,ASP.NET Site Maps,6,"Temas generales de desarrollo: branching, scri...",asp.net site maps,asp.net site maps,asp.net site maps,asp.net site maps,asp.net site map
3,180,2089740.0,2008-08-01T18:42:19Z,53,Function for creating color wheels,6,"Temas generales de desarrollo: branching, scri...",function for creating color wheels,function for creating color wheels,function creating color wheels,function creating color wheels,function creating color wheel
4,260,91.0,2008-08-01T23:22:08Z,49,Adding scripting functionality to .NET applica...,6,"Temas generales de desarrollo: branching, scri...",adding scripting functionality to .net applica...,adding scripting functionality to .net applica...,adding scripting functionality .net applications,adding scripting functionality .net applications,adding scripting functionality .net application
...,...,...,...,...,...,...,...,...,...,...,...,...
953433,33112560,1219968.0,2015-10-13T21:02:04Z,2,Why is a thread blocking my JavaFX UI Thread?,6,"Temas generales de desarrollo: branching, scri...",why is a thread blocking my javafx ui thread?,why is a thread blocking my javafx ui thread,thread blocking javafx ui thread,thread blocking javafx ui thread,thread blocking javafx ui thread
953434,33112570,5427938.0,2015-10-13T21:02:41Z,2,GStreamer tutorial on Android getting 'Unsuppo...,6,"Temas generales de desarrollo: branching, scri...",gstreamer tutorial on android getting 'unsuppo...,gstreamer tutorial on android getting unsuppor...,gstreamer tutorial android getting unsupported...,gstreamer tutorial getting unsupported profile...,gstreamer tutorial getting unsupported profile...
953435,33112630,5442641.0,2015-10-13T21:06:46Z,-2,I am getting an undefined index error,6,"Temas generales de desarrollo: branching, scri...",i am getting an undefined index error,i am getting an undefined index error,getting undefined index error,getting undefined index,getting undefined index
953436,33112660,1754127.0,2015-10-13T21:08:36Z,0,Bootstrapping to estimate the mean of a geomet...,6,"Temas generales de desarrollo: branching, scri...",bootstrapping to estimate the mean of a geomet...,bootstrapping to estimate the mean of a geomet...,bootstrapping estimate mean geometric sample,bootstrapping estimate mean geometric sample,bootstrapping estimate mean geometric sample


In [12]:
# Remove digit runs first, then @-mentions, keeping each stage as its own column.
# NOTE(review): '@' is part of string.punctuation and was already stripped in the
# punctuation step, so remove_account presumably finds nothing here; confirm the
# intended order of the steps.
sin_numeros = df['text_lemmatize'].map(remove_numbers)
df['text_without_number'] = sin_numeros
df['text_without_acc'] = sin_numeros.map(remove_account)
df.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,text,label,descripcion_cluster,text_lower,punc_remove,stopwords_remove,freq_word_remove,text_lemmatize,text_without_number,text_without_acc
0,80,26.0,2008-08-01T13:57:07Z,26,SQLStatement.execute() - multiple queries in o...,12,Problemas de manejar multiples entidades: cone...,sqlstatement.execute() - multiple queries in o...,sqlstatement.execute multiple queries in one ...,sqlstatement.execute multiple queries one stat...,sqlstatement.execute multiple queries one stat...,sqlstatement.execute multiple query one statement,sqlstatement.execute multiple query one statement,sqlstatement.execute multiple query one statement
1,90,58.0,2008-08-01T14:41:24Z,144,Good branching and merging tutorials for Torto...,6,"Temas generales de desarrollo: branching, scri...",good branching and merging tutorials for torto...,good branching and merging tutorials for torto...,good branching merging tutorials tortoisesvn,good branching merging tutorials tortoisesvn,good branching merging tutorial tortoisesvn,good branching merging tutorial tortoisesvn,good branching merging tutorial tortoisesvn
2,120,83.0,2008-08-01T15:50:08Z,21,ASP.NET Site Maps,6,"Temas generales de desarrollo: branching, scri...",asp.net site maps,asp.net site maps,asp.net site maps,asp.net site maps,asp.net site map,asp.net site map,asp.net site map
3,180,2089740.0,2008-08-01T18:42:19Z,53,Function for creating color wheels,6,"Temas generales de desarrollo: branching, scri...",function for creating color wheels,function for creating color wheels,function creating color wheels,function creating color wheels,function creating color wheel,function creating color wheel,function creating color wheel
4,260,91.0,2008-08-01T23:22:08Z,49,Adding scripting functionality to .NET applica...,6,"Temas generales de desarrollo: branching, scri...",adding scripting functionality to .net applica...,adding scripting functionality to .net applica...,adding scripting functionality .net applications,adding scripting functionality .net applications,adding scripting functionality .net application,adding scripting functionality .net application,adding scripting functionality .net application


In [13]:
# Keep only rows whose whole text consists of letters (including Spanish accented
# letters), whitespace and the listed punctuation. The '+' quantifier also rejects
# empty strings.
permitidos = r'^[a-zA-ZáéíóúüñÁÉÍÓÚÜÑ\s¡!¿?,.;:()\'"-]+$'
mask = df['text_without_acc'].str.match(permitidos)
# Explicit copy so the columns added to df_filtrado later do not touch `df`.
df_filtrado = df.loc[mask].copy()

In [14]:
# Collapse whitespace runs to one space and trim both ends.
texto_normalizado = df_filtrado['text_without_acc'].str.replace(r'\s+', ' ', regex=True).str.strip()
df_filtrado['text_clean'] = texto_normalizado
# Keep rows that are non-empty and have at least three tokens.
no_vacio = df_filtrado['text_clean'].str.len() > 0
tres_o_mas_palabras = df_filtrado['text_clean'].str.split().apply(len) >= 3
df_filtrado = df_filtrado[no_vacio & tres_o_mas_palabras]
df_filtrado.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,text,label,descripcion_cluster,text_lower,punc_remove,stopwords_remove,freq_word_remove,text_lemmatize,text_without_number,text_without_acc,text_clean
0,80,26.0,2008-08-01T13:57:07Z,26,SQLStatement.execute() - multiple queries in o...,12,Problemas de manejar multiples entidades: cone...,sqlstatement.execute() - multiple queries in o...,sqlstatement.execute multiple queries in one ...,sqlstatement.execute multiple queries one stat...,sqlstatement.execute multiple queries one stat...,sqlstatement.execute multiple query one statement,sqlstatement.execute multiple query one statement,sqlstatement.execute multiple query one statement,sqlstatement.execute multiple query one statement
1,90,58.0,2008-08-01T14:41:24Z,144,Good branching and merging tutorials for Torto...,6,"Temas generales de desarrollo: branching, scri...",good branching and merging tutorials for torto...,good branching and merging tutorials for torto...,good branching merging tutorials tortoisesvn,good branching merging tutorials tortoisesvn,good branching merging tutorial tortoisesvn,good branching merging tutorial tortoisesvn,good branching merging tutorial tortoisesvn,good branching merging tutorial tortoisesvn
2,120,83.0,2008-08-01T15:50:08Z,21,ASP.NET Site Maps,6,"Temas generales de desarrollo: branching, scri...",asp.net site maps,asp.net site maps,asp.net site maps,asp.net site maps,asp.net site map,asp.net site map,asp.net site map,asp.net site map
3,180,2089740.0,2008-08-01T18:42:19Z,53,Function for creating color wheels,6,"Temas generales de desarrollo: branching, scri...",function for creating color wheels,function for creating color wheels,function creating color wheels,function creating color wheels,function creating color wheel,function creating color wheel,function creating color wheel,function creating color wheel
4,260,91.0,2008-08-01T23:22:08Z,49,Adding scripting functionality to .NET applica...,6,"Temas generales de desarrollo: branching, scri...",adding scripting functionality to .net applica...,adding scripting functionality to .net applica...,adding scripting functionality .net applications,adding scripting functionality .net applications,adding scripting functionality .net application,adding scripting functionality .net application,adding scripting functionality .net application,adding scripting functionality .net application


In [15]:
# (rows, columns) left after filtering. The saved output of this cell is a shape
# tuple, and the head of the frame is already displayed by the previous cell.
df_filtrado.shape

(951403, 15)

In [17]:
X = df_filtrado.drop('label', axis=1)  # Predictor columns
Y = df_filtrado['label']  # Target variable
# 70/30 split stratified on the label. A fixed random_state makes the split, and
# therefore every metric reported below, reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, stratify=Y, random_state=23
)

In [18]:
# Preview the training predictors produced by the split.
X_train.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,text,descripcion_cluster,text_lower,punc_remove,stopwords_remove,freq_word_remove,text_lemmatize,text_without_number,text_without_acc,text_clean
677483,24295570,1119895.0,2014-06-18T21:49:25Z,2,Protractor/Jasmine do not create folder for xu...,"Temas generales de desarrollo: branching, scri...",protractor/jasmine do not create folder for xu...,protractorjasmine do not create folder for xun...,protractorjasmine create folder xunit file exist,protractorjasmine create folder xunit exist,protractorjasmine create folder xunit exist,protractorjasmine create folder xunit exist,protractorjasmine create folder xunit exist,protractorjasmine create folder xunit exist
115376,4997930,198128.0,2011-02-14T22:18:09Z,0,Creating singleton - is this really right?,"Temas generales de desarrollo: branching, scri...",creating singleton - is this really right?,creating singleton is this really right,creating singleton really right,creating singleton really right,creating singleton really right,creating singleton really right,creating singleton really right,creating singleton really right
926402,32262340,2989124.0,2015-08-28T02:15:23Z,0,How to organize test cases for a class with ma...,"Metodos y funciones: extension, reflexion, pas...",how to organize test cases for a class with ma...,how to organize test cases for a class with ma...,organize test cases class many methods,organize test cases class many methods,organize test case class many method,organize test case class many method,organize test case class many method,organize test case class many method
9675,652530,40872.0,2009-03-16T23:29:34Z,12,Neural networks - input values,Problemas y valores por defecto en propiedades...,neural networks - input values,neural networks input values,neural networks input values,neural networks input values,neural network input value,neural network input value,neural network input value,neural network input value
935353,32545260,5200233.0,2015-09-13T00:50:21Z,0,while loop not executing in Python 3.4.3,"Temas generales de desarrollo: branching, scri...",while loop not executing in python 3.4.3,while loop not executing in python 3.4.3,loop executing python 3.4.3,loop executing python 3.4.3,loop executing python 3.4.3,loop executing python ..,loop executing python ..,loop executing python ..


In [24]:
# Show the training target labels (indexed like X_train).
Y_train

677483    6
115376    6
926402    9
9675      1
935353    6
         ..
87601     6
139131    6
625299    6
235194    6
360479    6
Name: label, Length: 665982, dtype: int64

In [73]:
import joblib
# NOTE(review): 'vectorizer_tfidf.pkl' is not created by any of the visible cells,
# so a fresh "Restart & Run All" needs that file to exist already. Confirm where
# the vectorizer is fitted and on which data.
# joblib.load unpickles arbitrary objects; only load artifacts from a trusted source.
vectorizer = joblib.load('vectorizer_tfidf.pkl')
# Number of entries in the fitted vocabulary.
len(vectorizer.vocabulary_)

10432

In [74]:
# Project both partitions with the already-fitted vectorizer (transform only, no
# re-fitting).
X_train_vec, X_test_vec = (
    vectorizer.transform(particion['text_clean']) for particion in (X_train, X_test)
)

In [75]:
from sklearn.linear_model import LogisticRegression
# Baseline classifier: logistic regression on the vectorized training text.
clf = LogisticRegression(max_iter=1000, random_state=23)
clf = clf.fit(X_train_vec, Y_train)  # fit() returns the estimator itself
clf

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,23
,solver,'lbfgs'
,max_iter,1000


In [76]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the baseline on the held-out split.
y_pred = clf.predict(X_test_vec)
reporte = classification_report(Y_test, y_pred)
print(reporte)

              precision    recall  f1-score   support

           0       1.00      0.96      0.98       334
           1       0.98      1.00      0.99     12177
           2       0.99      0.99      0.99      5365
           3       0.99      0.99      0.99     11047
           4       0.99      0.99      0.99      5136
           5       0.99      1.00      0.99     10657
           6       1.00      1.00      1.00    202665
           7       0.99      1.00      0.99      4558
           8       0.99      0.99      0.99      7222
           9       0.99      0.99      0.99      5017
          10       0.99      1.00      0.99      3821
          11       0.99      0.99      0.99      4110
          12       1.00      0.98      0.99      5470
          13       0.99      1.00      0.99      2625
          14       0.99      1.00      0.99      5217

    accuracy                           1.00    285421
   macro avg       0.99      0.99      0.99    285421
weighted avg       1.00   

In [77]:
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

# Candidate classifiers compared on the same train/test split. Estimators that
# accept a random_state are seeded so the ranking is reproducible.
modelos = {
    "LogReg": LogisticRegression(max_iter=1000, random_state=23),
    "NaiveBayes": MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True),
    # "RandomForest": RandomForestClassifier(n_estimators=100),  # left out: training takes too long
    "LinearSVM": LinearSVC(random_state=23),
}

# nombre -> {"accuracy": ..., "macro_f1": ...}
resultados = {}

for nombre, modelo in modelos.items():
    modelo.fit(X_train_vec, Y_train)
    y_pred = modelo.predict(X_test_vec)
    resultados[nombre] = {
        "accuracy": accuracy_score(Y_test, y_pred),
        # Macro average weights every class equally regardless of its support;
        # computed directly instead of building a full report dict.
        "macro_f1": f1_score(Y_test, y_pred, average="macro"),
    }
# One row per model, best macro-F1 first.
pd.DataFrame(resultados).T.sort_values("macro_f1", ascending=False)

Unnamed: 0,accuracy,macro_f1
LogReg,0.997211,0.992303
LinearSVM,0.996097,0.990993
NaiveBayes,0.902355,0.777037


In [78]:
from sklearn.model_selection import GridSearchCV
nb_model = MultinomialNB()
# Hyper-parameter grid for Multinomial Naive Bayes.
# `class_prior` must have one entry per class. The former candidate [0.5, 0.5]
# cannot match the 15-class target, so every fit using it failed and was scored
# as NaN (silently, because warnings are suppressed), wasting half of the search.
# Only the valid `None` (priors estimated from the data) is kept, so
# `best_params_` still reports the key.
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0, 5.0],
    'fit_prior': [True, False],
    'class_prior': [None]
}
# 5-fold cross-validation on the training split, optimizing macro-F1 and using
# all available cores.
grid_search = GridSearchCV(estimator=nb_model, param_grid=param_grid,
                           scoring='f1_macro',
                           cv=5,
                           n_jobs=-1)

grid_search.fit(X_train_vec, Y_train)

print(f"Mejores hiperparametros encontrados: {grid_search.best_params_}")
print(f"Mejor puntuacion F1: {grid_search.best_score_}")


Mejores hiperparámetros encontrados: {'alpha': 0.1, 'class_prior': None, 'fit_prior': True}
Mejor puntuación F1: 0.7726895351280352


In [79]:
# Pick the model with the highest macro-F1 in the comparison table.
mejor_nombre_modelo = pd.DataFrame(resultados).T.sort_values("macro_f1", ascending=False).index[0]
# The estimators stored in `modelos` were already fitted on (X_train_vec, Y_train)
# by the comparison loop, so the winner is persisted as-is. Fitting it a second
# time only repeated the most expensive training step, and for estimators with
# random components could save a model different from the one that was evaluated.
mejor_modelo_final = modelos[mejor_nombre_modelo]
joblib.dump(mejor_modelo_final, 'mejor_modelo.pkl')

['mejor_modelo.pkl']

## Validación

In [80]:
# Validation questions: every row of Questions.csv after the first 1,000,001.
# The index is reset right after slicing so the column assignments below act on
# a fresh frame rather than on a slice of the frame returned by read_csv.
df_val = pd.read_csv(f'{path}/Questions.csv', encoding='latin1')[1000001:].reset_index(drop=True)
df_val['text'] = df_val['Title'].astype(str)

# Apply the same cleaning chain as for the training data, reusing the settings
# defined in the training cells (punc_to_remove, stpwords, freqwords, lemmatizer,
# permitidos) so both sets are preprocessed identically.
df_val['text_lower'] = df_val['text'].str.lower()
df_val['punc_remove'] = df_val['text_lower'].apply(remove_punc)
df_val['stopwords_remove'] = df_val['punc_remove'].apply(lambda text: remove_stopwords(text, stpwords))
# `freqwords` is deliberately NOT recomputed here. Adding validation tokens to the
# training counter `ct` made this cell non-idempotent (counts grew on every re-run)
# and could make the removed-word set differ from the one used for training.
df_val['freq_word_remove'] = df_val['stopwords_remove'].apply(remove_freqwords)
df_val['text_lemmatize'] = df_val['freq_word_remove'].apply(lemmatize_words)
df_val['text_without_number'] = df_val['text_lemmatize'].apply(remove_numbers)
df_val['text_without_acc'] = df_val['text_without_number'].apply(remove_account)

# Same character whitelist and length filters as the training set.
mask = df_val['text_without_acc'].str.match(permitidos)
df_val = df_val[mask].copy()
df_val['text_clean'] = df_val['text_without_acc'].str.replace(r'\s+', ' ', regex=True).str.strip()
df_val = df_val[df_val['text_clean'].str.len() > 0]
df_val = df_val[df_val['text_clean'].str.split().apply(len) >= 3]
df_val.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body,text,text_lower,punc_remove,stopwords_remove,freq_word_remove,text_lemmatize,text_without_number,text_without_acc,text_clean
0,33112810,5442551.0,2015-10-13T21:18:49Z,,0,how can I set permissions to a batch file mkdi...,<p>I am trying to set up a backup tool for use...,how can I set permissions to a batch file mkdi...,how can i set permissions to a batch file mkdi...,how can i set permissions to a batch file mkdi...,set permissions batch file mkdir creation,set permissions batch mkdir creation,set permission batch mkdir creation,set permission batch mkdir creation,set permission batch mkdir creation,set permission batch mkdir creation
1,33112960,5344839.0,2015-10-13T21:29:01Z,,0,using a subquery to get results sql,<p>I have a select statement that looks like t...,using a subquery to get results sql,using a subquery to get results sql,using a subquery to get results sql,using subquery get results sql,subquery results sql,subquery result sql,subquery result sql,subquery result sql,subquery result sql
2,33113000,682059.0,2015-10-13T21:31:13Z,,0,Tomcat webapp NoClassDefFoundError with jar on...,<p>Experiencing a NoClassDefFoundError when de...,Tomcat webapp NoClassDefFoundError with jar on...,tomcat webapp noclassdeffounderror with jar on...,tomcat webapp noclassdeffounderror with jar on...,tomcat webapp noclassdeffounderror jar classpath,tomcat webapp noclassdeffounderror jar classpath,tomcat webapp noclassdeffounderror jar classpath,tomcat webapp noclassdeffounderror jar classpath,tomcat webapp noclassdeffounderror jar classpath,tomcat webapp noclassdeffounderror jar classpath
3,33113010,2720343.0,2015-10-13T21:32:09Z,,4,Color error with OpenGL Texture Formats,<p>I'm trying to load an image using FreeImage...,Color error with OpenGL Texture Formats,color error with opengl texture formats,color error with opengl texture formats,color error opengl texture formats,color opengl texture formats,color opengl texture format,color opengl texture format,color opengl texture format,color opengl texture format
5,33113070,5282582.0,2015-10-13T21:36:18Z,,-1,insert into select statment inner join,<p>I have two valuues which I want to insert i...,insert into select statment inner join,insert into select statment inner join,insert into select statment inner join,insert select statment inner join,insert select statment inner join,insert select statment inner join,insert select statment inner join,insert select statment inner join,insert select statment inner join


In [81]:
# Reload the persisted vectorizer and the model saved as 'mejor_modelo.pkl'.
# joblib.load unpickles arbitrary objects; only load artifacts from a trusted source.
vectorizer = joblib.load('vectorizer_tfidf.pkl')
modelo = joblib.load('mejor_modelo.pkl')
df_val.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body,text,text_lower,punc_remove,stopwords_remove,freq_word_remove,text_lemmatize,text_without_number,text_without_acc,text_clean
0,33112810,5442551.0,2015-10-13T21:18:49Z,,0,how can I set permissions to a batch file mkdi...,<p>I am trying to set up a backup tool for use...,how can I set permissions to a batch file mkdi...,how can i set permissions to a batch file mkdi...,how can i set permissions to a batch file mkdi...,set permissions batch file mkdir creation,set permissions batch mkdir creation,set permission batch mkdir creation,set permission batch mkdir creation,set permission batch mkdir creation,set permission batch mkdir creation
1,33112960,5344839.0,2015-10-13T21:29:01Z,,0,using a subquery to get results sql,<p>I have a select statement that looks like t...,using a subquery to get results sql,using a subquery to get results sql,using a subquery to get results sql,using subquery get results sql,subquery results sql,subquery result sql,subquery result sql,subquery result sql,subquery result sql
2,33113000,682059.0,2015-10-13T21:31:13Z,,0,Tomcat webapp NoClassDefFoundError with jar on...,<p>Experiencing a NoClassDefFoundError when de...,Tomcat webapp NoClassDefFoundError with jar on...,tomcat webapp noclassdeffounderror with jar on...,tomcat webapp noclassdeffounderror with jar on...,tomcat webapp noclassdeffounderror jar classpath,tomcat webapp noclassdeffounderror jar classpath,tomcat webapp noclassdeffounderror jar classpath,tomcat webapp noclassdeffounderror jar classpath,tomcat webapp noclassdeffounderror jar classpath,tomcat webapp noclassdeffounderror jar classpath
3,33113010,2720343.0,2015-10-13T21:32:09Z,,4,Color error with OpenGL Texture Formats,<p>I'm trying to load an image using FreeImage...,Color error with OpenGL Texture Formats,color error with opengl texture formats,color error with opengl texture formats,color error opengl texture formats,color opengl texture formats,color opengl texture format,color opengl texture format,color opengl texture format,color opengl texture format
5,33113070,5282582.0,2015-10-13T21:36:18Z,,-1,insert into select statment inner join,<p>I have two valuues which I want to insert i...,insert into select statment inner join,insert into select statment inner join,insert into select statment inner join,insert select statment inner join,insert select statment inner join,insert select statment inner join,insert select statment inner join,insert select statment inner join,insert select statment inner join


In [82]:
# Vectorize the cleaned validation text with the fitted vectorizer (transform
# only) and predict a label for every row.
df_val_vec = vectorizer.transform(df_val['text_clean'])
df_val_pred = modelo.predict(df_val_vec)

In [85]:
# Store the predicted labels next to the validation rows they belong to.
df_val['predicciones'] = df_val_pred
df_val.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body,text,text_lower,punc_remove,stopwords_remove,freq_word_remove,text_lemmatize,text_without_number,text_without_acc,text_clean,predicciones
0,33112810,5442551.0,2015-10-13T21:18:49Z,,0,how can I set permissions to a batch file mkdi...,<p>I am trying to set up a backup tool for use...,how can I set permissions to a batch file mkdi...,how can i set permissions to a batch file mkdi...,how can i set permissions to a batch file mkdi...,set permissions batch file mkdir creation,set permissions batch mkdir creation,set permission batch mkdir creation,set permission batch mkdir creation,set permission batch mkdir creation,set permission batch mkdir creation,6
1,33112960,5344839.0,2015-10-13T21:29:01Z,,0,using a subquery to get results sql,<p>I have a select statement that looks like t...,using a subquery to get results sql,using a subquery to get results sql,using a subquery to get results sql,using subquery get results sql,subquery results sql,subquery result sql,subquery result sql,subquery result sql,subquery result sql,6
2,33113000,682059.0,2015-10-13T21:31:13Z,,0,Tomcat webapp NoClassDefFoundError with jar on...,<p>Experiencing a NoClassDefFoundError when de...,Tomcat webapp NoClassDefFoundError with jar on...,tomcat webapp noclassdeffounderror with jar on...,tomcat webapp noclassdeffounderror with jar on...,tomcat webapp noclassdeffounderror jar classpath,tomcat webapp noclassdeffounderror jar classpath,tomcat webapp noclassdeffounderror jar classpath,tomcat webapp noclassdeffounderror jar classpath,tomcat webapp noclassdeffounderror jar classpath,tomcat webapp noclassdeffounderror jar classpath,6
3,33113010,2720343.0,2015-10-13T21:32:09Z,,4,Color error with OpenGL Texture Formats,<p>I'm trying to load an image using FreeImage...,Color error with OpenGL Texture Formats,color error with opengl texture formats,color error with opengl texture formats,color error opengl texture formats,color opengl texture formats,color opengl texture format,color opengl texture format,color opengl texture format,color opengl texture format,6
5,33113070,5282582.0,2015-10-13T21:36:18Z,,-1,insert into select statment inner join,<p>I have two valuues which I want to insert i...,insert into select statment inner join,insert into select statment inner join,insert into select statment inner join,insert select statment inner join,insert select statment inner join,insert select statment inner join,insert select statment inner join,insert select statment inner join,insert select statment inner join,6


In [87]:
# Human-readable (Spanish) summary for each cluster, keyed by the integer
# label (0-14) that is stored in the `predicciones` column.
cluster_resumen = {
    0: "Uso de anotaciones en Java, Spring, Hibernate, Struts y serializacion; tambien incluye anotaciones en plataformas moviles como MKMapView.",
    1: "Problemas y valores por defecto en propiedades, debugging en Visual Studio, issues con jQuery, AJAX y almacenamiento de valores.",
    2: "Interaccion con paginas web: flujo de paginas, autorizacion, cache, multiples formularios y manejo de POST en distintas tecnologias.",
    3: "Consultas SQL, HTML, manejo de tablas y normalizacion; incluye manipulacion de DOM y diferencias entre tablas y CSS.",
    4: "Listas en programacion: manipulacion, agrupacion, delegados, estructuras en SharePoint y frameworks como Scala y Python.",
    5: "Operaciones con strings, performance en concatenacion, parsing, analisis de codigo, errores de base de datos, e integracion de frameworks.",
    6: "Temas generales de desarrollo: branching, scripting, unit testing, consumo de servicios, configuracion de proyectos y triggers.",
    7: "Bases de datos: conexion, estructura, testing, engines, ACL en CakePHP, denormalizacion y patrones de diseno de acceso.",
    8: "Consultas SQL en MySQL y errores comunes: replicacion, triggers, stored procedures, logs binarios y sugerencias de autocompletado.",
    9: "Metodos y funciones: extension, reflexion, paso de parametros, testing, seguridad, diseno orientado a objetos y metaprogramacion.",
    10: "Como 'hacer que algo funcione': URLs amigables, CSS, visual studio, controles personalizados, git, SOAP, Flex y accesibilidad.",
    11: "Preguntas sobre 'como funciona' algo: statements preparados, QA, CSS, validacion W3C, threading, SOA, omnicomplete y Scrum.",
    12: "Problemas de manejar multiples entidades: conexiones, versiones, tablas, lenguajes, servidores, layouts, colecciones y parches.",
    13: "Paso de parametros en multiples lenguajes y entornos: SQL, C, C++, Java, Ruby, Oracle, jQuery, funciones y referencias.",
    14: "Errores de 'no funciona': debugging, __doPostBack, CSS, eventos en jQuery, atributos, visualizacion, Mercurial y fechas."
}

# Translate each predicted label into its description.
df_val['descripcion_cluster'] = df_val['predicciones'].map(cluster_resumen)

# Series.map with a dict yields NaN for any label that has no entry, which
# would silently leave rows without a description. Fail loudly instead and
# report which labels are missing from `cluster_resumen`.
assert df_val['descripcion_cluster'].notna().all(), (
    "Predicted labels without an entry in cluster_resumen: "
    f"{sorted(set(df_val['predicciones']) - set(cluster_resumen))}"
)

# Preview the first rows with the new description column.
df_val.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body,text,text_lower,punc_remove,stopwords_remove,freq_word_remove,text_lemmatize,text_without_number,text_without_acc,text_clean,predicciones,descripcion_cluster
0,33112810,5442551.0,2015-10-13T21:18:49Z,,0,how can I set permissions to a batch file mkdi...,<p>I am trying to set up a backup tool for use...,how can I set permissions to a batch file mkdi...,how can i set permissions to a batch file mkdi...,how can i set permissions to a batch file mkdi...,set permissions batch file mkdir creation,set permissions batch mkdir creation,set permission batch mkdir creation,set permission batch mkdir creation,set permission batch mkdir creation,set permission batch mkdir creation,6,"Temas generales de desarrollo: branching, scri..."
1,33112960,5344839.0,2015-10-13T21:29:01Z,,0,using a subquery to get results sql,<p>I have a select statement that looks like t...,using a subquery to get results sql,using a subquery to get results sql,using a subquery to get results sql,using subquery get results sql,subquery results sql,subquery result sql,subquery result sql,subquery result sql,subquery result sql,6,"Temas generales de desarrollo: branching, scri..."
2,33113000,682059.0,2015-10-13T21:31:13Z,,0,Tomcat webapp NoClassDefFoundError with jar on...,<p>Experiencing a NoClassDefFoundError when de...,Tomcat webapp NoClassDefFoundError with jar on...,tomcat webapp noclassdeffounderror with jar on...,tomcat webapp noclassdeffounderror with jar on...,tomcat webapp noclassdeffounderror jar classpath,tomcat webapp noclassdeffounderror jar classpath,tomcat webapp noclassdeffounderror jar classpath,tomcat webapp noclassdeffounderror jar classpath,tomcat webapp noclassdeffounderror jar classpath,tomcat webapp noclassdeffounderror jar classpath,6,"Temas generales de desarrollo: branching, scri..."
3,33113010,2720343.0,2015-10-13T21:32:09Z,,4,Color error with OpenGL Texture Formats,<p>I'm trying to load an image using FreeImage...,Color error with OpenGL Texture Formats,color error with opengl texture formats,color error with opengl texture formats,color error opengl texture formats,color opengl texture formats,color opengl texture format,color opengl texture format,color opengl texture format,color opengl texture format,6,"Temas generales de desarrollo: branching, scri..."
5,33113070,5282582.0,2015-10-13T21:36:18Z,,-1,insert into select statment inner join,<p>I have two valuues which I want to insert i...,insert into select statment inner join,insert into select statment inner join,insert into select statment inner join,insert select statment inner join,insert select statment inner join,insert select statment inner join,insert select statment inner join,insert select statment inner join,insert select statment inner join,6,"Temas generales de desarrollo: branching, scri..."
