# Práctica 5

## Preparación de ambiente

### Carga de módulos

In [1]:
# Data Wrangling
import re
import emoji
import unicodedata
import numpy as np
import pandas as pd
from scipy import sparse
from nltk.corpus import stopwords

# Data Visualization
import cufflinks as cf
from sklearn.decomposition import PCA

# Modelado
from sklearn import set_config
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.mixture import GaussianMixture
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import davies_bouldin_score, silhouette_score
from sklearn.model_selection import cross_val_score, train_test_split

# Configuración de ambiente
cf.go_offline()
set_config(display='diagram')
pd.set_option('display.max_columns', 50)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

### Funciones relevantes

In [2]:
def clean_text(text, pattern="[^a-zA-Z0-9 ]"):
    cleaned_text = unicodedata.normalize('NFD', text).encode('ascii', 'ignore')
    cleaned_text = re.sub(pattern, " ", cleaned_text.decode("utf-8"), flags=re.UNICODE)
    cleaned_text = u' '.join(cleaned_text.lower().split())
    return cleaned_text

In [3]:
def pivot_categories(df, cluster_column, categories):
    aux = df.copy()
    aux["dum"] = 1
    for i, col in enumerate(categories):
        agg_data = aux[[cluster_column, col, "dum"]].pivot_table(index = col, columns=cluster_column, aggfunc="sum", fill_value=0)
        for col in agg_data:
            agg_data[col] /= agg_data[col].sum()
        if i == 0:
            final = agg_data.copy()
        else:
            final = final.merge(agg_data, left_index = True, right_index = True, how = "inner")
    return final.T

## Data Wrangling

In [4]:
df = pd.read_csv("betsentiment-ES-tweets-sentiment-teams.csv", encoding="latin-1")

In [5]:
df.head()

Unnamed: 0,tweet_date_created,tweet_id,tweet_text,language,sentiment,sentiment_score
0,2018-08-08T13:09:15.489000,1027179935184703489,"Alisson puede estar más tranquilo, no cargará ...",es,POSITIVE,"{""Neutral"":0.082259356975555419921875,""Negativ..."
1,2018-08-08T18:27:37.320000,1027260056092344320,@iPincheViky @ChelseaFC Es que el director eje...,es,NEUTRAL,"{""Neutral"":0.827011644840240478515625,""Negativ..."
2,2018-08-12T14:59:31.520000,1028657238116843520,Upto £100 #freebets &gt; https://t.co/cbjeMXI9...,es,NEUTRAL,"{""Neutral"":0.930982112884521484375,""Negative"":..."
3,2018-08-04T13:23:30.257000,1025733971320160257,"Bobby Duncan, primo de Steven Gerrard, deja la...",es,NEUTRAL,"{""Neutral"":0.906872212886810302734375,""Negativ..."
4,2018-07-28T11:21:06.480000,1023166450981396480,@TorreiraForeva @lepvtron @Arsenal @IntChampio...,es,NEUTRAL,"{""Neutral"":0.942405760288238525390625,""Negativ..."


### Ingeniería de Datos

In [6]:
df[f"len_mensaje"] = df["tweet_text"].str.len()
df[f"n_emojis"] = df["tweet_text"].map(emoji.emoji_count)
df[f"n_lower"] = df["tweet_text"].map(lambda x:sum(map(str.islower, x)))
df[f"n_upper"] = df["tweet_text"].map(lambda x:sum(map(str.isupper, x)))
df[f"n_digit"] = df["tweet_text"].map(lambda x:sum(map(str.isdigit, x)))
df[f"n_whitespaces"] = df["tweet_text"].map(lambda x:len(re.findall("\s", x)))
df[f"n_words"] = df["tweet_text"].str.split(" ").str.len()
df[f"n_urls"] = df["tweet_text"].str.lower().str.contains("http").astype(int)

In [7]:
df

Unnamed: 0,tweet_date_created,tweet_id,tweet_text,language,sentiment,sentiment_score,len_mensaje,n_emojis,n_lower,n_upper,n_digit,n_whitespaces,n_words,n_urls
0,2018-08-08T13:09:15.489000,1027179935184703489,"Alisson puede estar más tranquilo, no cargará ...",es,POSITIVE,"{""Neutral"":0.082259356975555419921875,""Negativ...",152,0,114,6,0,28,29,0
1,2018-08-08T18:27:37.320000,1027260056092344320,@iPincheViky @ChelseaFC Es que el director eje...,es,NEUTRAL,"{""Neutral"":0.827011644840240478515625,""Negativ...",80,0,55,8,0,10,11,0
2,2018-08-12T14:59:31.520000,1028657238116843520,Upto £100 #freebets &gt; https://t.co/cbjeMXI9...,es,NEUTRAL,"{""Neutral"":0.930982112884521484375,""Negative"":...",132,0,91,5,7,10,11,1
3,2018-08-04T13:23:30.257000,1025733971320160257,"Bobby Duncan, primo de Steven Gerrard, deja la...",es,NEUTRAL,"{""Neutral"":0.906872212886810302734375,""Negativ...",121,0,84,11,1,16,17,1
4,2018-07-28T11:21:06.480000,1023166450981396480,@TorreiraForeva @lepvtron @Arsenal @IntChampio...,es,NEUTRAL,"{""Neutral"":0.942405760288238525390625,""Negativ...",82,0,62,8,0,8,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132702,2018-08-10T21:57:11.345000,1028037572671545345,Apunta el diario Mirror que el #ManchesterUnit...,es,NEUTRAL,"{""Neutral"":0.90301716327667236328125,""Negative...",182,0,131,14,2,25,26,1
132703,2018-08-18T04:57:47.448000,1030680133189992448,@andresmarocco @sscnapoli @D_Ospina1 @Arsenal ...,es,NEUTRAL,"{""Neutral"":0.73134517669677734375,""Negative"":0...",288,0,213,15,1,46,47,0
132704,2018-09-16T16:54:09.904000,1041369661399547904,Final del partido @Everton 1 vs @WestHamEspano...,es,NEUTRAL,"{""Neutral"":0.9575507640838623046875,""Negative""...",94,0,58,16,3,12,13,0
132705,2018-07-26T01:58:04.409000,1022299986762129409,"Amistoso, champions o Liga; el city es la perr...",es,NEUTRAL,"{""Neutral"":0.93621218204498291015625,""Negative...",67,0,46,7,0,11,12,0


In [8]:
df["tweet_date_created"].str.len().value_counts()

26    118415
19     14292
Name: tweet_date_created, dtype: int64

In [9]:
df[df["tweet_date_created"].str.len() == 19]

Unnamed: 0,tweet_date_created,tweet_id,tweet_text,language,sentiment,sentiment_score,len_mensaje,n_emojis,n_lower,n_upper,n_digit,n_whitespaces,n_words,n_urls
12,2018-06-13T15:26:34,1006920771372027904,@RodrigoRomano76 @Arsenal @canal10uruguay Soy ...,es,NEUTRAL,"{""Neutral"":0.70216166973114013671875,""Negative...",103,0,76,6,4,12,13,0
14,2018-05-05T18:59:00,992841107514982400,Esperamos su pronta recuperación y nuestras or...,es,NEUTRAL,"{""Neutral"":0.78641760349273681640625,""Negative...",83,0,62,8,0,11,12,0
25,2018-06-16T12:02:50,1007956663914070019,@FabianBecerraG ¿Qué me dice de Pogba? Si el @...,es,NEUTRAL,"{""Neutral"":0.921361446380615234375,""Negative"":...",117,0,79,9,3,20,21,0
29,2018-06-13T00:55:18,1006701511378964481,Vamos!!!! Ya es la recta final Falta poco! A g...,es,NEGATIVE,"{""Neutral"":0.1925411522388458251953125,""Negati...",146,0,88,22,3,17,18,1
30,2018-06-12T18:06:38,1006598666436558849,"@Wolves Ya llegamos tío lobo, saludos a toda l...",es,NEUTRAL,"{""Neutral"":0.676677882671356201171875,""Negativ...",82,0,61,2,0,14,15,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132665,2018-06-26T16:28:06,1011647300857458689,"#WestHam | TODOS CON VOS, MANU.\n\nSegún el me...",es,NEUTRAL,"{""Neutral"":0.499023258686065673828125,""Negativ...",279,0,196,23,2,48,47,0
132670,2018-05-02T20:39:41,991779278692978693,"Con todo el sufrimiento, no podia ser de otra ...",es,NEUTRAL,"{""Neutral"":0.5908756256103515625,""Negative"":0....",140,0,92,16,0,22,23,0
132671,2018-05-05T17:03:39,992812078841978881,@SouthamptonFC @OriolRomeu Vamos la puta que l...,es,NEGATIVE,"{""Neutral"":0.2715498507022857666015625,""Negati...",94,0,70,6,0,15,16,0
132673,2018-05-09T20:39:30,994315950278275072,FINAL. #Chelsea 1-1 #Huddersfield.\n\nEste emp...,es,NEUTRAL,"{""Neutral"":0.77356016635894775390625,""Negative...",238,0,168,18,2,37,34,0


In [10]:
df["tweet_date_created"] = pd.to_datetime(df["tweet_date_created"].str[:19])

In [11]:
df[["year", "week", "day"]] = df["tweet_date_created"].dt.isocalendar()
df["hour"] = df["tweet_date_created"].dt.hour
df["month"] = df["tweet_date_created"].dt.month

### Limpieza de texto

In [12]:
ls_sw = list(set(map(clean_text, stopwords.words("spanish"))))

In [13]:
df["tweet_text"] = df["tweet_text"].map(clean_text)

In [14]:
df.head()

Unnamed: 0,tweet_date_created,tweet_id,tweet_text,language,sentiment,sentiment_score,len_mensaje,n_emojis,n_lower,n_upper,n_digit,n_whitespaces,n_words,n_urls,year,week,day,hour,month
0,2018-08-08 13:09:15,1027179935184703489,alisson puede estar mas tranquilo no cargara c...,es,POSITIVE,"{""Neutral"":0.082259356975555419921875,""Negativ...",152,0,114,6,0,28,29,0,2018,32,3,13,8
1,2018-08-08 18:27:37,1027260056092344320,ipincheviky chelseafc es que el director ejecu...,es,NEUTRAL,"{""Neutral"":0.827011644840240478515625,""Negativ...",80,0,55,8,0,10,11,0,2018,32,3,18,8
2,2018-08-12 14:59:31,1028657238116843520,upto 100 freebets gt https t co cbjemxi99f lol...,es,NEUTRAL,"{""Neutral"":0.930982112884521484375,""Negative"":...",132,0,91,5,7,10,11,1,2018,32,7,14,8
3,2018-08-04 13:23:30,1025733971320160257,bobby duncan primo de steven gerrard deja la c...,es,NEUTRAL,"{""Neutral"":0.906872212886810302734375,""Negativ...",121,0,84,11,1,16,17,1,2018,31,6,13,8
4,2018-07-28 11:21:06,1023166450981396480,torreiraforeva lepvtron arsenal intchampionscu...,es,NEUTRAL,"{""Neutral"":0.942405760288238525390625,""Negativ...",82,0,62,8,0,8,9,0,2018,30,6,11,7


### Vectorizado

In [15]:
vect = CountVectorizer(stop_words=ls_sw, ngram_range=(1, 5), min_df=10, max_features=100)

In [16]:
vect.fit(df["tweet_text"])

In [17]:
df.head()

Unnamed: 0,tweet_date_created,tweet_id,tweet_text,language,sentiment,sentiment_score,len_mensaje,n_emojis,n_lower,n_upper,n_digit,n_whitespaces,n_words,n_urls,year,week,day,hour,month
0,2018-08-08 13:09:15,1027179935184703489,alisson puede estar mas tranquilo no cargara c...,es,POSITIVE,"{""Neutral"":0.082259356975555419921875,""Negativ...",152,0,114,6,0,28,29,0,2018,32,3,13,8
1,2018-08-08 18:27:37,1027260056092344320,ipincheviky chelseafc es que el director ejecu...,es,NEUTRAL,"{""Neutral"":0.827011644840240478515625,""Negativ...",80,0,55,8,0,10,11,0,2018,32,3,18,8
2,2018-08-12 14:59:31,1028657238116843520,upto 100 freebets gt https t co cbjemxi99f lol...,es,NEUTRAL,"{""Neutral"":0.930982112884521484375,""Negative"":...",132,0,91,5,7,10,11,1,2018,32,7,14,8
3,2018-08-04 13:23:30,1025733971320160257,bobby duncan primo de steven gerrard deja la c...,es,NEUTRAL,"{""Neutral"":0.906872212886810302734375,""Negativ...",121,0,84,11,1,16,17,1,2018,31,6,13,8
4,2018-07-28 11:21:06,1023166450981396480,torreiraforeva lepvtron arsenal intchampionscu...,es,NEUTRAL,"{""Neutral"":0.942405760288238525390625,""Negativ...",82,0,62,8,0,8,9,0,2018,30,6,11,7


In [18]:
df[["len_mensaje", "n_emojis", "n_lower", "n_upper", "n_digit", "n_whitespaces", "n_words", "n_urls", "hour", "week", "day", "month"]].astype(float).to_numpy()

array([[152.,   0., 114., ...,  32.,   3.,   8.],
       [ 80.,   0.,  55., ...,  32.,   3.,   8.],
       [132.,   0.,  91., ...,  32.,   7.,   8.],
       ...,
       [ 94.,   0.,  58., ...,  37.,   7.,   9.],
       [ 67.,   0.,  46., ...,  30.,   4.,   7.],
       [112.,   0.,  75., ...,  31.,   6.,   8.]])

### Creacion de matriz de entrenamiento

In [19]:
df_features = sparse.csr_matrix(df[["len_mensaje", "n_emojis", "n_lower", "n_upper", "n_digit", "n_whitespaces", "n_words", "n_urls", "hour", "week", "day", "month"]].astype(float).to_numpy())

In [20]:
df_features

<132707x12 sparse matrix of type '<class 'numpy.float64'>'
	with 1342530 stored elements in Compressed Sparse Row format>

In [21]:
df_words = vect.transform(df["tweet_text"])

In [22]:
df_words

<132707x100 sparse matrix of type '<class 'numpy.int64'>'
	with 585046 stored elements in Compressed Sparse Row format>

## Modelado supervisado

In [23]:
X = sparse.hstack((df_features, df_words))
y = df["sentiment"]

In [24]:
X

<132707x112 sparse matrix of type '<class 'numpy.float64'>'
	with 1927576 stored elements in COOrdinate format>

### Train-test split

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

### Generación de pipeline

In [26]:
sc = StandardScaler(with_mean=False)

In [27]:
logreg = LogisticRegression(max_iter=1000, n_jobs=-1)

In [28]:
pipe = Pipeline([("sc", sc), ("model", logreg)])

### Entrenamiento

In [29]:
pipe.fit(X_train, y_train)

In [30]:
pipe.score(X_train, y_train)

0.8452024515221541

In [31]:
y.value_counts()

NEUTRAL     111334
POSITIVE     11004
NEGATIVE      9489
MIXED          880
Name: sentiment, dtype: int64

### Cross validation

In [32]:
ls_scores = cross_val_score(X=X_train, y=y_train, cv=4, scoring="accuracy", estimator=pipe, n_jobs=-1)

In [33]:
ls_scores.mean(), ls_scores.std()

(0.8444690107180909, 0.001570827385730246)

In [34]:
pipe.score(X_test, y_test)

0.8470627241763873

## Modelado no supervisado

In [35]:
X = X.todense()

In [36]:
X

matrix([[152.,   0., 114., ...,   0.,   0.,   0.],
        [ 80.,   0.,  55., ...,   0.,   0.,   0.],
        [132.,   0.,  91., ...,   0.,   0.,   0.],
        ...,
        [ 94.,   0.,  58., ...,   0.,   0.,   0.],
        [ 67.,   0.,  46., ...,   0.,   0.,   1.],
        [112.,   0.,  75., ...,   1.,   0.,   0.]])

### Visualización de datos

In [37]:
sc = MinMaxScaler()

In [38]:
Xs = sc.fit_transform(X)

In [39]:
pca = PCA(n_components=0.9)

In [40]:
Xp = pd.DataFrame(data=pca.fit_transform(Xs))

In [41]:
pca.explained_variance_ratio_.cumsum()

array([0.34266023, 0.47779925, 0.59422888, 0.67992507, 0.70583429,
       0.72141984, 0.73238533, 0.74217858, 0.7509957 , 0.7586326 ,
       0.76602023, 0.77259331, 0.77905895, 0.784845  , 0.7903543 ,
       0.79570396, 0.80097007, 0.80616568, 0.81111142, 0.81587404,
       0.82058247, 0.82513801, 0.82957982, 0.83398933, 0.8382704 ,
       0.84242783, 0.84641452, 0.85024291, 0.85389702, 0.85752538,
       0.86105941, 0.86458525, 0.86808098, 0.87156268, 0.87501234,
       0.87835264, 0.8816509 , 0.8849412 , 0.88815624, 0.89134059,
       0.89450894, 0.8976193 , 0.90069234])

In [42]:
pca.n_components_

43

In [43]:
Xp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42
0,-0.4702,0.1013,-0.1966,0.0948,0.0450,-0.0295,0.0969,0.1775,0.0752,-0.0200,0.0273,-0.0121,-0.0020,0.0045,0.0069,0.0060,0.0104,-0.0185,-0.0009,-0.0027,-0.0067,-0.0133,-0.0108,0.0347,0.0067,-0.0162,-0.0428,-0.0027,-0.0172,0.0061,0.0090,-0.0209,0.0026,0.0056,-0.0110,-0.0425,0.0019,-0.0089,0.0073,-0.0078,-0.0151,0.0143,0.0014
1,-0.4805,0.1248,-0.2057,-0.1221,-0.1454,-0.0107,0.0876,0.1740,0.0681,0.0069,0.0491,-0.0144,0.0009,-0.0123,0.0067,-0.0003,-0.0037,-0.0005,0.0049,0.0127,-0.0042,-0.0165,0.0007,0.0232,0.0102,-0.0028,-0.0311,0.0104,-0.0081,-0.0076,0.0001,-0.0075,0.0037,-0.0020,-0.0006,-0.0109,-0.0019,-0.0020,0.0016,0.0001,0.0024,0.0025,-0.0015
2,0.6554,-0.4349,0.1925,-0.0452,-0.1389,-0.0701,-0.0444,0.0299,-0.0010,-0.0215,-0.0478,-0.0280,0.1905,0.0320,0.0038,0.0039,-0.0273,0.0044,-0.0222,-0.0652,-0.0537,-0.0011,0.0022,0.0088,-0.0132,-0.0147,-0.0019,-0.0148,-0.0046,0.0211,0.0192,-0.0160,0.0019,0.0032,-0.0075,-0.0130,-0.0010,-0.0146,0.0077,-0.0003,-0.0057,0.0097,0.0032
3,0.5628,-0.2887,0.1302,0.0209,-0.1145,-0.0581,-0.0210,0.0136,-0.0297,0.0092,-0.0454,0.0252,-0.0662,-0.0350,-0.0354,-0.0241,-0.0338,0.0054,-0.0233,-0.0156,0.0191,0.0212,0.0033,-0.0065,-0.0127,-0.0016,-0.0427,0.0002,0.0203,0.0532,-0.0549,-0.0550,0.0723,-0.0202,0.0305,-0.0145,-0.0104,-0.0036,0.0177,-0.0380,-0.0913,-0.0364,0.0205
4,-0.4772,-0.1986,0.2604,0.1292,-0.1454,-0.0246,-0.0047,-0.0065,-0.0243,0.0003,-0.0193,-0.0208,-0.0042,0.0010,-0.0027,-0.0069,0.0104,-0.0025,-0.0127,-0.0076,0.0026,0.0031,0.0033,-0.0007,-0.0182,-0.0066,0.0049,-0.0051,-0.0155,0.0137,-0.0033,-0.0063,-0.0056,0.0029,-0.0024,-0.0133,0.0006,0.0009,-0.0024,0.0025,0.0113,0.0071,0.0009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132702,0.5722,-0.1241,-0.0066,-0.3028,0.0071,-0.0578,-0.0377,0.0058,0.0021,-0.0272,-0.0357,-0.0583,-0.0119,-0.0673,0.0350,-0.0161,0.2155,0.0228,0.0284,0.0749,-0.0328,-0.0275,0.0168,-0.0864,-0.1083,0.0269,-0.0885,0.0148,-0.0492,0.0371,0.0107,0.0011,-0.0379,-0.0031,-0.0033,-0.0175,-0.0272,0.0176,-0.0092,0.0133,-0.0172,-0.0192,-0.0028
132703,-0.4479,-0.3678,0.0796,0.4220,0.3102,-0.0705,0.0058,0.0069,-0.0147,-0.0353,-0.0599,-0.0153,0.0055,0.0158,-0.0219,0.0135,0.0264,-0.0334,-0.0198,-0.0236,-0.0069,0.0038,-0.0162,0.0040,-0.0205,-0.0294,-0.0090,0.0017,-0.0390,0.0103,0.0069,-0.0292,-0.0097,0.0002,-0.0284,-0.0518,0.0060,-0.0016,-0.0107,-0.0100,-0.0078,0.0068,0.0006
132704,-0.4686,-0.6243,-0.0683,-0.1442,-0.0668,0.2646,0.0127,-0.0397,-0.0737,-0.0709,0.0458,0.0636,0.0228,-0.0423,0.0139,0.0122,0.0081,-0.0312,0.0079,-0.0256,-0.0053,0.0069,0.0032,0.0297,0.0084,0.0195,0.0107,0.0808,0.0528,-0.0812,-0.0167,0.0062,-0.0296,-0.0245,0.0250,-0.0012,-0.1416,-0.0160,0.0041,-0.0697,0.0586,-0.0096,-0.0165
132705,-0.4714,0.0195,0.0874,0.6003,-0.1544,-0.0050,-0.0284,-0.0046,-0.0731,0.0417,-0.0635,0.0214,-0.0162,-0.0510,0.0388,-0.0061,-0.0513,-0.0250,0.0226,0.0261,-0.0249,0.0782,0.0028,-0.0534,0.0647,0.0027,-0.1229,-0.0529,-0.0014,-0.0764,-0.1021,0.0190,0.1015,0.0200,0.0505,-0.0012,-0.0894,-0.1183,-0.0985,0.0708,-0.0628,-0.1154,0.2694


In [44]:
Xp.sample(frac=0.05).iplot(kind="scatter3d", x=0, y=1, z=2, colors=["#296EAA"], mode="markers", theme="solar")

In [45]:
mini_Xp = Xp.sample(frac=0.1)

### Número óptimo de clusters

#### Construcción de grupos (2-10)

In [46]:
df_cl = pd.DataFrame()
for k in range(2, 10):
    print(k)
    mini_Xp = Xp.sample(frac=0.1)
    kmeans = KMeans(n_clusters=k, random_state=10).fit(mini_Xp)
    if k == 3: 
        labels = kmeans.labels_
    clusters = kmeans.labels_
    df_cl.loc[k, "inertia"] = kmeans.inertia_
    df_cl.loc[k, "db"] = davies_bouldin_score(mini_Xp, clusters)
    df_cl.loc[k, "sil"] = silhouette_score(mini_Xp, clusters)

2
3
4
5
6
7
8
9


In [47]:
df_cl

Unnamed: 0,inertia,db,sil
2,6008.141,1.2205,0.3582
3,5478.8317,1.7198,0.2366
4,4951.191,2.0245,0.1713
5,4537.3224,1.8229,0.1848
6,4232.3104,1.7089,0.1934
7,3960.3666,1.6131,0.1991
8,3754.4881,1.5709,0.1991
9,3658.5569,1.7861,0.1661


#### Codo

In [48]:
df_cl["inertia"].iplot(mode="markers+lines", theme="solar", title="Elbow")

#### Davies-Bouldin

In [49]:
df_cl["db"].iplot(mode="markers+lines", theme="solar", title="Davies-Bouldin")

#### Silhouette

In [50]:
df_cl["sil"].iplot(mode="markers+lines", theme="solar", title="Silhouette score")

### Entrenamiento

In [51]:
gmm = GaussianMixture(n_components=4)

In [52]:
features = ["len_mensaje", "n_emojis", "n_lower", "n_upper", "n_digit", "n_whitespaces", "n_words", "n_urls", "hour", "week", "day", "month"]+vect.get_feature_names()

In [53]:
X = pd.DataFrame(data=X, columns=features)

In [54]:
X

Unnamed: 0,len_mensaje,n_emojis,n_lower,n_upper,n_digit,n_whitespaces,n_words,n_urls,hour,week,day,month,2018,afcbournemouth,ahora,ano,anos,aqui,arsenal,asi,barcelona,bien,cfc,champions,championsleague,...,puede,raul,realmadrid,sarri,ser,siempre,solo,spurs,spursofficial,temporada,tiempo,tottenham,tras,united,va,vamos,ver,victoria,vs,watfordfc,westham,westhamutd,wolves,yerry,ynwa
0,152.0000,0.0000,114.0000,6.0000,0.0000,28.0000,29.0000,0.0000,13.0000,32.0000,3.0000,8.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,1.0000,0.0000,0.0000,0.0000,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
1,80.0000,0.0000,55.0000,8.0000,0.0000,10.0000,11.0000,0.0000,18.0000,32.0000,3.0000,8.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
2,132.0000,0.0000,91.0000,5.0000,7.0000,10.0000,11.0000,1.0000,14.0000,32.0000,7.0000,8.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
3,121.0000,0.0000,84.0000,11.0000,1.0000,16.0000,17.0000,1.0000,13.0000,31.0000,6.0000,8.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
4,82.0000,0.0000,62.0000,8.0000,0.0000,8.0000,9.0000,0.0000,11.0000,30.0000,6.0000,7.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132702,182.0000,0.0000,131.0000,14.0000,2.0000,25.0000,26.0000,1.0000,21.0000,32.0000,5.0000,8.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
132703,288.0000,0.0000,213.0000,15.0000,1.0000,46.0000,47.0000,0.0000,4.0000,33.0000,6.0000,8.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
132704,94.0000,0.0000,58.0000,16.0000,3.0000,12.0000,13.0000,0.0000,16.0000,37.0000,7.0000,9.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
132705,67.0000,0.0000,46.0000,7.0000,0.0000,11.0000,12.0000,0.0000,1.0000,30.0000,4.0000,7.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,1.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,1.0000


In [55]:
X["cl"] = [str(x) for x in gmm.fit_predict(Xp)]

In [56]:
X["cl"].value_counts(True)

1   0.3434
2   0.2941
3   0.2143
0   0.1482
Name: cl, dtype: float64

In [57]:
X.groupby("cl").mean().style.background_gradient(cmap='Blues')

Unnamed: 0_level_0,len_mensaje,n_emojis,n_lower,n_upper,n_digit,n_whitespaces,n_words,n_urls,hour,week,day,month,2018,afcbournemouth,ahora,ano,anos,aqui,arsenal,asi,barcelona,bien,cfc,champions,championsleague,chelsea,chelseafc,city,club,co,dia,dos,equipo,everton,fcbarcelona,fecha,fichaje,fichajes,final,fulhamfc,futbol,gol,goles,gracias,gran,hace,hazard,hoy,https,https co,icc2018,inglaterra,inter,juego,jugador,jugadores,jugar,kepa,lcfc,league,lfc,liverpool,llega,madrid,manchester,manchestercity,manchesterunited,mancity,manutd,mejor,millones,mina,minutos,mourinho,mufc,nuevo,nuevo jugador,nufc,oficial,partido,partidos,portero,premier,premier league,premierleague,primer,psg,puede,raul,realmadrid,sarri,ser,siempre,solo,spurs,spursofficial,temporada,tiempo,tottenham,tras,united,va,vamos,ver,victoria,vs,watfordfc,westham,westhamutd,wolves,yerry,ynwa
cl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1
0,188.624422,0.000661,121.766312,20.336673,4.189493,26.496008,25.74958,1.0,14.735544,30.835427,3.797183,7.544169,0.033057,0.0,0.052942,0.021868,0.070742,0.023394,0.09851,0.063825,0.047856,0.010578,0.014037,0.063825,0.127803,0.110258,0.089661,0.019275,0.044296,1.147841,0.022123,0.065911,0.065911,0.119158,0.063266,0.011443,0.033871,0.051518,0.063317,0.016427,0.060876,0.033667,0.042262,0.011239,0.027259,0.05223,0.019376,0.044907,1.147841,1.14718,0.016681,0.023242,0.054468,0.017342,0.128363,0.019071,0.014087,0.017851,0.0,0.036973,0.156182,0.079744,0.096221,0.032447,0.028938,0.032599,0.056197,0.054163,0.090576,0.025225,0.04689,0.08366,0.01424,0.020241,0.019122,0.174236,0.083253,0.01602,0.131618,0.063266,0.023242,0.034481,0.033108,0.018308,0.101663,0.028327,0.063724,0.009866,0.00712,0.156182,0.022733,0.025225,0.008798,0.043737,0.018003,0.10624,0.064537,0.017851,0.061384,0.030463,0.023852,0.021563,0.01007,0.041804,0.022123,0.067589,0.018563,0.011901,0.018003,0.01485,0.071708,0.041042
1,134.175766,0.000614,91.776881,12.079939,1.837049,20.049438,20.394321,0.0,14.651475,30.597823,4.036514,7.521088,0.00373,0.0,0.030216,0.021834,0.020649,0.0,0.13155,0.027692,0.008097,0.035153,0.020188,0.01953,0.045006,0.054068,0.180242,0.010928,0.026069,0.0,0.014088,0.0,0.070877,0.115883,0.085052,0.006868,0.008755,0.009896,0.034166,0.027978,0.024906,0.0,0.015689,0.020473,0.0,0.017247,0.015668,0.026705,0.0,0.0,0.0,0.007175,0.021351,0.012332,0.025893,0.022031,0.020671,0.020627,0.0,0.0,0.169622,0.034978,0.0,0.028526,0.003357,0.0,0.0,0.075924,0.154832,0.046037,0.015273,0.015273,0.00994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016545,0.011805,0.0,0.0,0.0,0.0,0.036777,0.020144,0.0,0.068112,0.009896,0.03421,0.023852,0.032103,0.008251,0.048912,0.026551,0.0,0.0,0.004498,0.006824,0.038576,0.023589,0.032081,0.004345,0.031313,0.030172,0.0,0.036909,0.0,0.008514,0.025915
2,175.614979,0.000717,111.737708,19.945451,4.62884,23.624484,22.9485,1.0,14.352738,30.973379,4.381204,7.58249,0.053345,0.023444,0.000794,0.012068,0.025724,0.023598,0.171667,0.000846,0.018807,0.010633,0.030388,5.1e-05,7.7e-05,0.109021,0.07971,0.040047,0.028415,1.183889,0.023034,0.001051,0.052781,0.074381,0.023418,0.030516,0.016859,0.024828,0.035076,0.013452,0.069256,0.051116,0.051346,0.012862,0.026724,0.000922,0.016706,0.049655,1.183146,1.181737,0.031899,0.025186,0.007943,0.01481,0.037587,0.017884,0.010633,0.012452,0.02534,0.032335,0.094263,0.044992,0.002024,0.013093,0.047554,0.056445,0.062056,0.093776,0.085116,0.025135,0.020036,0.000282,0.017833,0.026519,0.049194,0.007917,0.00287,0.026647,0.002947,0.080965,0.021317,0.013067,0.046811,0.029517,0.194138,0.037306,7.7e-05,0.007533,0.020318,7.7e-05,0.018755,0.016654,0.007712,0.000179,0.023316,0.000641,0.060032,0.021471,0.057419,0.026929,0.03359,0.015296,0.011325,0.000641,0.031566,0.0752,0.023495,0.029363,0.030465,0.032514,0.000179,0.000615
3,155.968498,0.000105,106.951939,13.543051,2.347924,24.22255,24.228105,0.0,14.760609,31.208593,4.50712,7.660373,0.011743,0.048061,0.033646,0.022958,0.02665,0.022923,0.122069,0.025595,0.020778,0.037865,0.011567,0.028091,0.030798,0.069648,0.081602,0.028724,0.029427,0.000914,0.015821,0.052948,0.08568,0.088563,0.031853,0.017333,0.014134,0.017157,0.054952,0.021693,0.036353,0.095278,0.034666,0.017192,0.07675,0.028689,0.010794,0.047358,0.0,0.0,0.016243,0.015364,0.024716,0.024013,0.047815,0.02788,0.022571,0.011462,0.036459,0.038604,0.104595,0.061175,0.01881,0.021903,0.033435,0.022361,0.043983,0.069437,0.111978,0.05126,0.015294,0.012024,0.025911,0.036775,0.059206,0.044299,0.009282,0.056851,0.029322,0.151637,0.025595,0.015927,0.069613,0.020919,0.228914,0.045494,0.020321,0.023239,0.048483,0.035474,0.013606,0.037021,0.023907,0.031502,0.020251,0.040959,0.056147,0.057308,0.073621,0.019794,0.040291,0.040045,0.027001,0.035334,0.015751,0.069754,0.020919,0.024048,0.0334,0.082586,0.007629,0.018704


In [58]:
y = y.to_frame()

In [59]:
y["cl"] = X["cl"]

In [60]:
pivot_categories(df=y, cluster_column="cl", categories=["sentiment"]).style.background_gradient(cmap='Blues')

Unnamed: 0_level_0,sentiment,MIXED,NEGATIVE,NEUTRAL,POSITIVE
Unnamed: 0_level_1,cl,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
dum,0,0.00239,0.023292,0.921121,0.053196
dum,1,0.010357,0.123256,0.759501,0.106886
dum,2,0.001435,0.022265,0.925568,0.050732
dum,3,0.010723,0.089477,0.790564,0.109236


In [61]:
dc_cl = {1: "Neutral", 0: "Negativos concretos", 3: "Champions neutral", 2: "Narración de todo"}

In [62]:
vect = vect.get_feature_names() 

In [63]:
pd.to_pickle?

In [64]:
pd.to_pickle(pipe, "./tweets_models/pipe.xz")
pd.to_pickle(sc, "./tweets_models/sc.xz")
pd.to_pickle(pca, "./tweets_models/pca.xz")
pd.to_pickle(gmm, "./tweets_models/gmm.xz")
pd.to_pickle(features, "./tweets_models/features.xz")
pd.to_pickle(vect, "./tweets_models/vect.xz")

FileNotFoundError: [Errno 2] No such file or directory: './tweets_models/pipe.xz'

In [None]:
from pandas import read_pickle, to_datetime, DataFrame
from scipy.sparse import csr_matrix, hstack
from datetime import datetime

In [None]:
def vectorizer(serie, text_features):
    ls_vect = []
    for text in serie:
        dc_word = {x: int(x in text) for x in text_features}
        ls_vect.append(dc_word)
    vect = csr_matrix(DataFrame(ls_vect))
    return vect

In [None]:
def load_objects():
    model = read_pickle("./tweets_models/pipe.xz")
    sc = read_pickle("./tweets_models/sc.xz")
    pca = read_pickle("./tweets_models/pca.xz")
    gmm = read_pickle("./tweets_models/gmm.xz")
    features = read_pickle("./tweets_models/features.xz")
    vect = read_pickle("./tweets_models/vect.xz")
    return model, sc, pca, gmm, features, vect

In [None]:
def etl(X, vect):
    X[f"len_mensaje"] = X["tweet_text"].str.len()
    X[f"n_emojis"] = X["tweet_text"].map(emoji.emoji_count)
    X[f"n_lower"] = X["tweet_text"].map(lambda x:sum(map(str.islower, x)))
    X[f"n_upper"] = X["tweet_text"].map(lambda x:sum(map(str.isupper, x)))
    X[f"n_digit"] = X["tweet_text"].map(lambda x:sum(map(str.isdigit, x)))
    X[f"n_whitespaces"] = X["tweet_text"].map(lambda x:len(re.findall("\s", x)))
    X[f"n_words"] = X["tweet_text"].str.split(" ").str.len()
    X[f"n_urls"] = X["tweet_text"].str.lower().str.contains("http").astype(int)
    X["tweet_date_created"] = to_datetime(X["tweet_date_created"].str[:19])
    X[["year", "week", "day"]] = X["tweet_date_created"].dt.isocalendar()
    X["hour"] = X["tweet_date_created"].dt.hour
    X["month"] = X["tweet_date_created"].dt.month
    X["tweet_text"] = X["tweet_text"].map(clean_text)
    X_features = csr_matrix(X[["len_mensaje", "n_emojis", "n_lower", "n_upper", "n_digit", "n_whitespaces", "n_words", "n_urls", "hour", "week", "day", "month"]].astype(float).to_numpy())
    X_words = vectorizer(X["tweet_text"], vect)
    X = hstack((X_features, X_words))
    return X

In [None]:
def predict(X, model):
    probas = {f"proba_{x.lower()}": y for x, y in zip(model[-1].classes_, model.predict_proba(X)[0])}
    predict = {"class": model.predict(X)[0]}
    return probas, predict

In [None]:
def predict_cluster(X, sc, pca, features, gmm, dc_cl):
    X = DataFrame(data = sc.transform(X.todense()), columns=features)
    Xp = DataFrame(data=pca.transform(X))
    cluster = gmm.predict(Xp)[0]
    return {"cluster": dc_cl[cluster]}

In [None]:
model, sc, pca, gmm, features, vect = load_objects()

In [None]:
df.loc[[0], ["tweet_text", "tweet_date_created"]]

In [None]:
X = etl(df.loc[[0], ["tweet_text", "tweet_date_created"]].astype(str), vect)

In [None]:
X

In [None]:
probas, predict = predict(X, model)

In [None]:
model.predict_proba(X)[0]

In [None]:
list(zip(model.classes_, model.predict_proba(X)[0]))

In [None]:
model.predict_proba(X)

In [None]:
probas

In [None]:
dc_cl = {1: "Neutral", 0: "Negativos concretos", 3: "Champions neutral", 2: "Narración de todo"}

In [None]:
cluster = predict_cluster(X, sc, pca, features, gmm, dc_cl)

In [None]:
cluster

In [None]:
response = {"datetime": datetime.now().strftime("%d/%m/%YT%H:%M:%S"), "team_name": "professor"}

In [None]:
response = {**response, **probas, **predict, **cluster}

In [None]:
response

In [None]:
# url = "https://p0aw9kkmjl.execute-api.us-west-2.amazonaws.com/Prod/hello/"
url ="https://xvmu0vkpz2.execute-api.us-west-2.amazonaws.com/Prod/hello/"
url = "https://uju270qbw1.execute-api.us-west-2.amazonaws.com/Prod/hello/"

In [None]:
import requests

In [None]:
response = requests.post(url, json={"tweet_text": '@IrisCisneros_ @Argentina Mejor que Romero? ???? Posiblemente por continuidad de juego pero no en calidad.'})

In [None]:
response

In [None]:
response.json()

In [None]:
df.loc[0, "tweet_text"]

In [None]:
validate = pd.read_csv("/home/oscar/Escritorio/betsentiment-ES-tweets-sentiment-worldcup.csv", encoding="latin-1")

In [None]:
validate = validate.groupby('sentiment', group_keys=False).apply(lambda x: x.sample(min(len(x), 25), random_state=123)).sample(frac=1).reset_index(drop=True)

In [None]:
validate

In [None]:
validate.loc[1, "tweet_text"]

In [None]:
validate.to_csv("tweets_validate.csv", index=False)