# Procesamiento del Lenguaje Natural y Aprendizaje Supervisado

### Importar librerías

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

### Leer los datos

In [2]:
# Load the tweets dataset. The path is relative, so the CSV must be in the
# notebook's working directory.
datos = pd.read_csv('airline_tweets.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
datos.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [None]:
# Print the column dtypes and non-null counts of the loaded frame.
datos.info()

In [None]:
# Tweet counts per airline, split by sentiment label.
sns.countplot(
    x='airline',
    hue='airline_sentiment',
    data=datos,
)

In [None]:
# Tweet counts per stated complaint reason; tick labels are rotated to
# vertical, and the trailing semicolon hides the return value of xticks.
sns.countplot(
    x='negativereason',
    data=datos,
)
plt.xticks(rotation=90);

In [None]:
# Bar chart of how many tweets fall in each sentiment class.
sns.countplot(data=datos,x='airline_sentiment')

In [None]:
# Exact number of tweets in each sentiment class.
datos['airline_sentiment'].value_counts()

## Características y etiquetas

In [None]:
# Keep only the sentiment label and the tweet text columns.
documentos = datos[['airline_sentiment','text']]

In [None]:
# Preview the label/text pairs.
documentos.head()

In [None]:
# Features are the tweet text column; the target is the sentiment label column.
X, y = datos['text'], datos['airline_sentiment']

### División en conjunto de entrenamiento y prueba

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the tweets for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=101
)

### Vectorización

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectorizer configured to drop scikit-learn's built-in English stop words.
tfidf = TfidfVectorizer(stop_words='english')

In [None]:
# Fit the vectorizer on the training split only, so the test split does not
# influence the learned vocabulary or weights.
tfidf.fit(X_train)

In [None]:
# Transform both splits (train first, then test) with the already-fitted vectorizer.
X_train_tfidf, X_test_tfidf = (
    tfidf.transform(parte) for parte in (X_train, X_test)
)

In [None]:
# Show the repr of the transformed training matrix.
X_train_tfidf

### Comparación de Modelos: Naive Bayes, Regresión Logística, LinearSVC

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default settings, trained on the TF-IDF features.
nb = MultinomialNB()
nb.fit(X_train_tfidf,y_train)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression allowed up to 1000 solver iterations, trained on the
# TF-IDF features.
log = LogisticRegression(max_iter = 1000)
log.fit(X_train_tfidf,y_train)

In [None]:
from sklearn.svm import LinearSVC
# Linear support vector classifier with default settings, trained on the
# TF-IDF features.
svc = LinearSVC()
svc.fit(X_train_tfidf,y_train)

### Evaluación

In [None]:
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report

In [None]:
def reporte(modelo):
    preds = modelo.predict(X_test_tfidf)
    print(classification_report(y_test,preds))
    plot_confusion_matrix(modelo,X_test_tfidf,y_test)

In [None]:
# Evaluate the Naive Bayes model on the test split.
print("Modelo Naive Bayes")
reporte(nb)

In [None]:
# Evaluate the logistic regression model on the test split.
print("Regresión Logística")
reporte(log)

In [None]:
# Evaluate the linear support vector classifier on the test split.
print('Clasificador con Vectores de Soporte')
reporte(svc)

### Pipeline para nuevos tweets

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Pipeline that takes raw tweet text and outputs a sentiment label.
# The vectorizer uses the same setting (English stop words removed) as the
# TF-IDF + LinearSVC model evaluated earlier in this notebook, so the reported
# metrics describe the model that is actually used for new tweets.
pipe = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('svc', LinearSVC()),
])

In [None]:
# Fit the pipeline on the full dataset (X and y are the complete columns,
# not the train split).
pipe.fit(X,y)

In [None]:
# The pipeline receives a list of raw strings, so a single tweet is wrapped
# in a one-element list.
nuevo_tweet = ['good flight']
pipe.predict(nuevo_tweet)

In [None]:
# Predict the sentiment of a short sample tweet using the word "bad".
nuevo_tweet = ['bad flight']
pipe.predict(nuevo_tweet)

In [None]:
# Predict the sentiment of a short sample tweet using the word "ok".
nuevo_tweet = ['ok flight']
pipe.predict(nuevo_tweet)