# NLP of tweets 

In [None]:
!pip install openml

In [2]:
import nltk
import pandas as pd
import re
import numpy as np
import math
import openml
import sklearn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense,Activation
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.metrics import classification_report

##Load Data

In [3]:
#https://www.openml.org/search?type=data&status=active&id=43397

data = openml.datasets.get_dataset(43397)
print(
    f"Dataset of '{data.name}'"
)
print(f"URL: {data.url}")



Dataset of 'Airlines-Tweets-Sentiments'
URL: https://api.openml.org/data/v1/download/22102222/Airlines-Tweets-Sentiments.arff


In [4]:
X, y, categorical_indicator, attributes = data.get_data(
    dataset_format="dataframe", target=data.default_target_attribute
)
dataset = pd.DataFrame(X, columns=attributes)
dataset

Unnamed: 0,_id,tweet_text,tweet_lang,tweet_sentiment_value
0,595e60b48fcd022a715f7b7b,this airfrance b777-300er has the oldest ifes ...,en,0
1,595e60de8fcd022a715f7b7d,???? will miss my connection airfrance https:/...,en,0
2,595e61448fcd022a715f7b7f,"airfrance lost luggage in overhead cabin, emai...",en,0
3,595e62748fcd022a715f7b83,"here's a new twist on the ""all airlines hate m...",en,0
4,595e62b28fcd022a715f7b86,airfrance so now i might not have 3 pieces of ...,en,0
...,...,...,...,...
1092,596b9c77976f440300c0f913,airfrance thanks in rome,en,2
1093,596be744976f440300c0f9b7,"rakiichak you are flying the wrong airline, ai...",en,2
1094,596c5d0f976f440300c0fac6,kislanykim airfrance i has a similar experienc...,en,2
1095,596c9962976f440300c0fc90,nothing tops when you check the air france mus...,en,2


In [5]:
Y = X['tweet_sentiment_value']

##Pre processing

In [57]:
X['tweet_text'] = X['tweet_text'].apply(lambda x: ' '.join([i for i in x.split() if len(i)>3]))

In [58]:
texts = X['tweet_text']
texts

0       this airfrance b777-300er oldest ifes i've eve...
1       ???? will miss connection airfrance https://t....
2       airfrance lost luggage overhead cabin, email r...
3       here's twist "all airlines hate musicians" sag...
4       airfrance might have pieces most important gea...
                              ...                        
1092                                airfrance thanks rome
1093    rakiichak flying wrong airline, france sensati...
1094    kislanykim airfrance similar experience with t...
1095    nothing tops when check france music selection...
1096    ??la gastronomie...in afrofoodie ??air france ...
Name: tweet_text, Length: 1097, dtype: object

In [59]:
texts[1]

'???? will miss connection airfrance https://t.co/2olmtwcxyk'

In [60]:
def preprocessing(texts):
  t = re.sub(r'#','',texts)
  t = re.sub(r'hahaha','',texts)
  t = re.sub(r'\n','',texts)
  t = t.lower()
  t = re.sub(r'https:\/\/.*|http:\/\/.*','',texts)
  t = re.sub(r'[\,\+\=\-\_\?\[\]\{\}\@\.\'\$\!\&\>\<\;\"\(\)\:\/\\]+','',t)
  t = re.sub(r'[0-9]+','',t)
  return t

Remove stopwords

In [61]:
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')
stop_list_nltk = nltk.corpus.stopwords.words('english')

new_texts = []
for t in texts:
    if t not in stop_list_nltk:
        new_texts.append(t)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [62]:
vocabulary = {}

for i in range(0,len(new_texts)):
    new_texts[i] = new_texts[i].lower()
    for j in new_texts[i].split():
        if j not in vocabulary:
            vocabulary[j] = 1
        else:
            vocabulary[j]+=1

In [63]:
vocab_size = len(vocabulary)

Tokenization

In [64]:
tokenizer = Tokenizer(num_words = vocab_size,lower=True)
tokenizer.fit_on_texts(new_texts)

train_sequences = tokenizer.texts_to_sequences(new_texts)

In [None]:
train_sequences

Lemmatizing

In [65]:
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer

corpus = []
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()
texts = [lemmatizer.lemmatize(w) for w in texts]

for text in new_texts:
  ptext = preprocessing(text)
  corpus.append(ptext)
corpus

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


['this airfrance ber oldest ifes ive ever seen belongs museum terrible smell isnt helping either',
 ' will miss connection airfrance ',
 'airfrance lost luggage overhead cabin email response phone answers help',
 'heres twist all airlines hate musicians saga gear plane amp airfrance have clue where is',
 'airfrance might have pieces most important gear most important dublin show ive ever done thanks airfrance',
 'airfrance lose gear plane like whats thought process there its literally feet its plane',
 'reminded hate flying airfrance cdg crap information provision amp enough seats gates',
 'airfrance charges  call them weve been hold minutes this normal france',
 'airfrance delayed flight adam still says itll boarding   explain',
 'airfrance made flin horrible condition dont even reply complaint what shame',
 'airfrance made horrible condition your partner aireuropa dont even reply complaint what shame',
 'airfrance made horrible condition your partner aireuropa dont even reply complai

In [66]:
corpus[1]

' will miss connection airfrance '

##Feature extraction

Bag-of-Words (BoW)

In [67]:
bag = " ".join(corpus)

In [68]:
tokens = bag.split()
vocab = sorted(set(tokens))

In [None]:
vocab

In [142]:
count_words = {}

for doc in corpus:
    words= nltk.word_tokenize(doc)
    for i in words:
        if i in count_words.keys():
            count_words[i] += 1
        else:
            count_words[i] = 1

Using Bag-of-Words wee can see which words are more used in the tweets

In [143]:
import heapq
from sklearn import tree

freq_words = heapq.nlargest(50,count_words, key=count_words.get)
freq_words

['airfrance',
 'france',
 'flight',
 'have',
 'with',
 'this',
 'your',
 'from',
 'delta',
 'paris',
 'that',
 'luggage',
 'they',
 'amp',
 'been',
 'service',
 'still',
 'thanks',
 'will',
 'airline',
 'thank',
 'customer',
 'dont',
 'hours',
 'when',
 'flying',
 'were',
 'check',
 'airport',
 'help',
 'what',
 'time',
 'a',
 'never',
 'just',
 'plane',
 'experience',
 'about',
 'lost',
 'there',
 'delayed',
 'business',
 'over',
 'baggage',
 'af',
 'please',
 'after',
 'its',
 'staff',
 'only']

Inverse Document Frequency (TF-IDF)

In [147]:
words_freq = {}
pondered_freq = {}
N = {} 

for i,j in enumerate(corpus):
    words_freq[i] = dict()
    pondered_freq[i] = dict()
    N[i] = len(j.split())
    for word in j.split():
        if word in words_freq[i]:
            words_freq[i][word] += 1
        else:
            words_freq[i][word] = 1

        if word in pondered_freq[i]:
            pondered_freq[i][word] += 1
        else:
            pondered_freq[i][word] = 1

for i in pondered_freq:
    for word in pondered_freq[i]:
        pondered_freq[i][word] /= N[doc]

In [149]:
idf = {}
tf_idf = {}

for word in vocab:
    idf[word] = 0
    for i in words_freq:
        if word in words_freq[i]:
            idf[word] += 1

for word in idf:
      idf[word] = math.log(len(corpus)/idf[word])

for i in pondered_freq:
    tf_idf[i] = dict()
    for word in pondered_freq[i]:
        tf_idf[i][word] = pondered_freq[i][word]*idf[word]

In [150]:
df = pd.DataFrame().from_records(tf_idf).fillna(0).T
df

Unnamed: 0,this,airfrance,ber,oldest,ifes,ive,ever,seen,belongs,museum,...,rakiichak,sensational,kislanykim,laxcdg,tops,yourself,screen,gastronomiein,afrofoodie,philippeloretstudios
0,0.29692,0.037604,0.788398,0.875042,0.875042,0.483105,0.506987,0.631803,0.737715,0.875042,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.00000,0.037604,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.00000,0.037604,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.00000,0.037604,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.00000,0.075209,0.000000,0.000000,0.000000,0.483105,0.506987,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1092,0.00000,0.037604,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1093,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.875042,0.875042,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1094,0.00000,0.075209,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.875042,0.875042,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1095,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.875042,0.875042,0.875042,0.000000,0.000000,0.000000
