In [1]:
from tweet_sent_predictor.predictor.LanguagePredictor import LanguagePredictor
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv("tweet_sent_predictor/data/train_proper.csv")
X = df["body"]
y = df["opinion"].values

In [3]:
X, y

(0                   20 min line @apple store @short pump.
 1       Nueva tecnología convierte cualquier superfici...
 2       Some people should not post replies in #Google...
 3       I know a few others having same issue RT @Joel...
 4       #Microsoft - We put the ""backwards"" into bac...
                               ...                        
 4168    fuck this see you hoes @ work @WeakTwip @Munnn...
 4169    #Microsoft, #Adobe lose $13.5bn to piracy: Rep...
 4170    I tried to explain why you would do ""The #Twi...
 4171    Installed io5 - fine on ipad but wiped wife's ...
 4172    #microsoft #careers site is giving errors for ...
 Name: body, Length: 4173, dtype: object,
 array(['neu', 'irr', 'neu', ..., 'neu', 'neg', 'neg'], dtype=object))

In [4]:
lp = LanguagePredictor(nbpass=3)

In [5]:
lang_preds = lp.predict(X)

In [6]:
y_true = []
y_pred = []
for i in range(len(lang_preds)):
    if lang_preds[i] != "en":
        y_pred.append("irr")
    else:
        y_pred.append("oth")
        
    if y[i] == "irr":
        y_true.append("irr")
    else:
        y_true.append("oth")
        
y_true = np.array(y_true)
y_pred = np.array(y_pred)

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html

- La case Haut gauche correspond aux messages irrelevant capturés = tweets en langue étrangère détectés + tweets anglais irrelevant filtrés par erreur (mais ça reste compté comme bon)
- Haut droite correspond aux messages irrelevants anglais qu'on a laissé passer (good) + des messages étrangers qu'on aurait laissé passer (bad)
- Bas gauche correspond aux messages non étrangers qu'on a bloqué (bad)
- Bas droite correspond aux messages anglais qu'on a laissé passer (good)

In [7]:
labels=["irr", "oth"]
index=labels
conf = confusion_matrix(y_true, y_pred, labels=labels, normalize='true')
confusion_df = pd.DataFrame(conf, index=labels, columns=labels)

In [8]:
confusion_df

Unnamed: 0,irr,oth
irr,0.792657,0.207343
oth,0.040589,0.959411


Les messages affichés ne sont pas des erreurs s'il s'agit de tweets anglais

In [9]:
fautes_idx = np.where(y_true != y_pred)[0]

In [10]:
fautes_idx.shape

(401,)

In [11]:
for i in fautes_idx:
    print(X[i])
    print(lang_preds[i])

Buenas noches a todos #Twitter off
en
@apple..good ipad
hr
@iley_little we'lll figure it out tm in gym #google
en
Steve Ballmer slams Android #microsoft #steveballmer http://t.co/wfSSexks
da
Android Beam: Compartilhamentovia NFC #icecreamsandwich #galaxynexus #google #samsung
en
http://t.co/Qewndku6 #Nokia workers ask,is CEO Stephen Elop a #Microsoft mole??
en
AsiaClassifiedToday: SE Asia and Singapore lead Asia Pacific region in business travel - Forimmedi... http://t.co/Jdvas9rH #asia #google
en
http://t.co/e5ClGzsI #google
so
Nueva #Consola #Xbox en curso: #LinkedIn http://t.co/VUzmZEzv #Microsoft #Xbox360 #Xbox720 #Twitter via @ZthaeDigital
en
#Adomani #Twitter
en
Sex Games http://t.co/5qtmLHJn #@JaylaStarr #@JCannonDSD #sex #Twitter #video
en
#Samsung, #Google Unveil Phone http://t.co/hOB37hbO
de
sail phoenix next stop----&gt; after quimera rock @apple house
en
#twitter se me hiso vicio
en
Mention #Twitter and get a Pittsburgh HVAC and Electrical systems Checkup for $129 Contact h

On voit que le prédicteur se trompe sur pas mal de tweets:
- Les hashtags semblent unfluencer la langue finale (enlever les tweets ?)
- il est possible que les mentions produisent le même problème
- Les fautes et abbréviations sont mal interprétées (exemple Good night = GN -> pas anglais)

## Test en enlevant les mentions et hashtags

In [13]:
from tweet_sent_predictor.transformer.MentionFilter import MentionFilter
from tweet_sent_predictor.transformer.HashtagFilter import HashtagFilter

In [14]:
pipe = Pipeline([
    ("retirer mention", MentionFilter()),
    ("retirer hash", HashtagFilter())
])

In [15]:
X_clean = pipe.fit_transform(X)

In [16]:
np.where(X_clean == "")

(array([53]),)

In [17]:
lp_clean = LanguagePredictor(nbpass=3)
lang_preds_clean = lp_clean.predict(X_clean)

y_true = []
y_pred = []
for i in range(len(lang_preds)):
    if lang_preds_clean[i] != "en":
        y_pred.append("irr")
    else:
        y_pred.append("oth")
        
    if y[i] == "irr":
        y_true.append("irr")
    else:
        y_true.append("oth")
        
y_true = np.array(y_true)
y_pred = np.array(y_pred)

In [18]:
labels=["irr", "oth"]
index=labels
conf = confusion_matrix(y_true, y_pred, labels=labels, normalize='true')
confusion_df = pd.DataFrame(conf, index=labels, columns=labels)

In [19]:
confusion_df

Unnamed: 0,irr,oth
irr,0.87041,0.12959
oth,0.086566,0.913434


In [20]:
fautes_idx_clean = np.where(y_true != y_pred)[0]

In [21]:
fautes_idx_clean.shape

(421,)

In [22]:
for i in fautes_idx_clean:
    print(X_clean[i])
    print(lang_preds[i])

20 min line  store  pump.
en
..good ipad
hr
 we'lll figure it out tm in gym 
en
Steve Ballmer slams Android   http://t.co/wfSSexks
da
http://t.co/Qewndku6  workers ask,is CEO Stephen Elop a  mole??
en
Surfactants pdf ebook: http://t.co/JlzvsR9f 
af
 '
en
AsiaClassifiedToday: SE Asia and Singapore lead Asia Pacific region in business travel - Forimmedi... http://t.co/Jdvas9rH  
en
http://t.co/e5ClGzsI 
so

en
,  Unveil Phone http://t.co/hOB37hbO
de
sail phoenix next stop----&gt; after quimera rock  house
en
Are you an digital expert? Or an online amateur? - http://t.co/oizGNXGT 
en
bye bye  ;)
en
RT : Steve Ballmer slams Android   http://t.co/GQqyQbaG 
en
I OUT  Till 2mar
en
Mention  and get a Pittsburgh HVAC and Electrical systems Checkup for $129 Contact http://t.co/KI7JJulI
en
    experience  2  insider  2 hacks  test team http://t.co/uk8sW8HZ Halo 2 Hacks
en
RT : MI NOVIA Y COSTUMBRE DE DEJARME DE SEGUIR EN  JAJJAA
en
 Newest Texas Holdem Zynga Poker Hack Cheat Update 18 October 201

### comparaisons

In [23]:
for i in fautes_idx:
    print(X[i])
    print(X_clean[i])
    print(lang_preds[i])
    print(lang_preds_clean[i])
    print()

Buenas noches a todos #Twitter off
Buenas noches a todos  off
en
es

@apple..good ipad
..good ipad
hr
hr

@iley_little we'lll figure it out tm in gym #google
 we'lll figure it out tm in gym 
en
en

Steve Ballmer slams Android #microsoft #steveballmer http://t.co/wfSSexks
Steve Ballmer slams Android   http://t.co/wfSSexks
da
sv

Android Beam: Compartilhamentovia NFC #icecreamsandwich #galaxynexus #google #samsung
Android Beam: Compartilhamentovia NFC    
en
pt

http://t.co/Qewndku6 #Nokia workers ask,is CEO Stephen Elop a #Microsoft mole??
http://t.co/Qewndku6  workers ask,is CEO Stephen Elop a  mole??
en
en

AsiaClassifiedToday: SE Asia and Singapore lead Asia Pacific region in business travel - Forimmedi... http://t.co/Jdvas9rH #asia #google
AsiaClassifiedToday: SE Asia and Singapore lead Asia Pacific region in business travel - Forimmedi... http://t.co/Jdvas9rH  
en
en

http://t.co/e5ClGzsI #google
http://t.co/e5ClGzsI 
so
??

Nueva #Consola #Xbox en curso: #LinkedIn http://t.co/VUzm