## Install and Import

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
!pip install nltk



In [3]:
import nltk

In [5]:
# natural language toolkit
!pip install nltk contractions

Collecting contractions
  Downloading contractions-0.1.68-py2.py3-none-any.whl (8.1 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting anyascii
  Downloading anyascii-0.3.0-py3-none-any.whl (284 kB)
[K     |████████████████████████████████| 284 kB 8.0 MB/s 
[?25hCollecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
[K     |████████████████████████████████| 106 kB 71.4 MB/s 
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.0 contractions-0.1.68 pyahocorasick-1.4.4 textsearch-0.0.21


🔑    :     https://www.nltk.org/api/nltk.tokenize.html

In [6]:
import nltk
import numpy as np
import pandas as pd 
import contractions

### Notebook settings

In [7]:
pd.set_option('display.max_colwidth', None)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

## Tokenization

In [8]:
sample_text= """This is pretty cool. A good quality candy might cost $3.88 in New York. 
                But I don't think we buy it. Mr.Biden said $1,000,000. 2 cars."""

In [10]:
#dir(nltk.tokenize)

In [11]:
from nltk.tokenize import sent_tokenize, wordpunct_tokenize, word_tokenize

#### Sentence Tokenization

In [12]:
# To use tokenziers
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [14]:
sentence_tokens = sent_tokenize(sample_text.lower()) # Sensitive to punctuation '.' vs ','
sentence_tokens

['this is pretty cool.',
 'a good quality candy might cost $3.88 in new york.',
 "but i don't think we buy it.",
 'mr.biden said $1,000,000.',
 '2 cars.']

#### WordPunct Tokenization

In [16]:
wordpunc_tokens = wordpunct_tokenize(sample_text.lower()) # regular-expression based tokenizer, which splits text on whitespace and punctuation
wordpunc_tokens

['this',
 'is',
 'pretty',
 'cool',
 '.',
 'a',
 'good',
 'quality',
 'candy',
 'might',
 'cost',
 '$',
 '3',
 '.',
 '88',
 'in',
 'new',
 'york',
 '.',
 'but',
 'i',
 'don',
 "'",
 't',
 'think',
 'we',
 'buy',
 'it',
 '.',
 'mr',
 '.',
 'biden',
 'said',
 '$',
 '1',
 ',',
 '000',
 ',',
 '000',
 '.',
 '2',
 'cars',
 '.']

#### Word Tokenization

In [17]:
word_tokens = word_tokenize(sample_text.lower())
print(word_tokens)

['this', 'is', 'pretty', 'cool', '.', 'a', 'good', 'quality', 'candy', 'might', 'cost', '$', '3.88', 'in', 'new', 'york', '.', 'but', 'i', 'do', "n't", 'think', 'we', 'buy', 'it', '.', 'mr.biden', 'said', '$', '1,000,000', '.', '2', 'cars', '.']


In [18]:
"I want to go to my hometown".split()

['I', 'want', 'to', 'go', 'to', 'my', 'hometown']

In [20]:
'@Ankara'.isalpha()

False

## Removing Punctuation and Numbers

In [23]:
tokens_without_punc = [w for w in word_tokens if w.isalpha() or 'mr' in w] # .isalnum() for number and object # we are losing mr.biden
print(tokens_without_punc)

['this', 'is', 'pretty', 'cool', 'a', 'good', 'quality', 'candy', 'might', 'cost', 'in', 'new', 'york', 'but', 'i', 'do', 'think', 'we', 'buy', 'it', 'mr.biden', 'said', 'cars']


## Removing Stopwords

In [24]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [25]:
from nltk.corpus import stopwords

In [28]:
stop_words = stopwords.words("english")
print(stop_words)
#print('len stop_words :', len(stop_words))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [29]:
words_to_exclude_from_stopwords = ['not', "n't", 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", "don't", 'hadn', 
                                   "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 
                                   'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', 
                                   "won't", 'wouldn', "wouldn't"]

new_stopwords = [w for w in stop_words if w not in words_to_exclude_from_stopwords]
print('len new_stop_words :', len(new_stopwords))

len new_stop_words : 141


In [30]:
print(tokens_without_punc)

['this', 'is', 'pretty', 'cool', 'a', 'good', 'quality', 'candy', 'might', 'cost', 'in', 'new', 'york', 'but', 'i', 'do', 'think', 'we', 'buy', 'it', 'mr.biden', 'said', 'cars']


In [31]:
token_without_sw = [t for t in tokens_without_punc if t not in new_stopwords] # stop_words
print(token_without_sw)

['pretty', 'cool', 'good', 'quality', 'candy', 'might', 'cost', 'new', 'york', 'think', 'buy', 'mr.biden', 'said', 'cars']


In [32]:
word_tokenize("Ankara'ya gezmeye gittim.")

["Ankara'ya", 'gezmeye', 'gittim', '.']

In [34]:
# tr_stop_words = stopwords.words("turkish")
# tr_stop_words

## Lemmatization

In [35]:
from nltk.stem import WordNetLemmatizer

In [36]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [39]:
WordNetLemmatizer().lemmatize("driving")
WordNetLemmatizer().lemmatize("driver")
WordNetLemmatizer().lemmatize("drivers")
WordNetLemmatizer().lemmatize("drives")
WordNetLemmatizer().lemmatize("drove")

'driving'

'driver'

'driver'

'drive'

'drove'

In [41]:
lem = [WordNetLemmatizer().lemmatize(t) for t in token_without_sw]
print(token_without_sw)
print(lem)

['pretty', 'cool', 'good', 'quality', 'candy', 'might', 'cost', 'new', 'york', 'think', 'buy', 'mr.biden', 'said', 'cars']
['pretty', 'cool', 'good', 'quality', 'candy', 'might', 'cost', 'new', 'york', 'think', 'buy', 'mr.biden', 'said', 'car']


## Stemming

In [42]:
from nltk.stem import PorterStemmer

In [44]:
PorterStemmer().stem("driving")
PorterStemmer().stem("driver")
PorterStemmer().stem("drives")
PorterStemmer().stem("drove")

'drive'

'driver'

'drive'

'drove'

In [46]:
stem = [PorterStemmer().stem(t) for t in token_without_sw]
print('w/o norm :', token_without_sw)
print('stem     :', stem)
print('lemma    :', lem)

w/o norm : ['pretty', 'cool', 'good', 'quality', 'candy', 'might', 'cost', 'new', 'york', 'think', 'buy', 'mr.biden', 'said', 'cars']
stem     : ['pretti', 'cool', 'good', 'qualiti', 'candi', 'might', 'cost', 'new', 'york', 'think', 'buy', 'mr.biden', 'said', 'car']
lemma    : ['pretty', 'cool', 'good', 'quality', 'candy', 'might', 'cost', 'new', 'york', 'think', 'buy', 'mr.biden', 'said', 'car']


## Joining

In [47]:
" ".join(lem)

'pretty cool good quality candy might cost new york think buy mr.biden said car'

### Expanding Contractions

In [48]:
contractions.fix("I won't be there")

'I will not be there'

In [52]:
my_text  = word_tokenize(contractions.fix(sample_text))
print(sample_text)
print(my_text)

This is pretty cool. A good quality candy might cost $3.88 in New York. 
                But I don't think we buy it. Mr.Biden said $1,000,000. 2 cars.
['This', 'is', 'pretty', 'cool', '.', 'A', 'good', 'quality', 'candy', 'might', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'But', 'I', 'do', 'not', 'think', 'we', 'buy', 'it', '.', 'Mr.Biden', 'said', '$', '1,000,000', '.', '2', 'cars', '.']


### Part of Speech Tag


In [53]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [54]:
from nltk import pos_tag

In [55]:
text = """Steven Paul Jobs was an American business magnate, industrial designer, investor, and media proprietor. 
He was the chairman, chief executive officer (CEO), and co-founder of Apple Inc.; the chairman and majority shareholder of Pixar; 
a member of The Walt Disney Company's board of directors following its acquisition of Pixar; and the founder, chairman, and CEO of NeXT."""

In [58]:
tokens = word_tokenize(text)
pos = pos_tag(tokens)
pos

[('Steven', 'NNP'),
 ('Paul', 'NNP'),
 ('Jobs', 'NNP'),
 ('was', 'VBD'),
 ('an', 'DT'),
 ('American', 'JJ'),
 ('business', 'NN'),
 ('magnate', 'NN'),
 (',', ','),
 ('industrial', 'JJ'),
 ('designer', 'NN'),
 (',', ','),
 ('investor', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('media', 'NNS'),
 ('proprietor', 'NN'),
 ('.', '.'),
 ('He', 'PRP'),
 ('was', 'VBD'),
 ('the', 'DT'),
 ('chairman', 'NN'),
 (',', ','),
 ('chief', 'JJ'),
 ('executive', 'NN'),
 ('officer', 'NN'),
 ('(', '('),
 ('CEO', 'NNP'),
 (')', ')'),
 (',', ','),
 ('and', 'CC'),
 ('co-founder', 'NN'),
 ('of', 'IN'),
 ('Apple', 'NNP'),
 ('Inc.', 'NNP'),
 (';', ':'),
 ('the', 'DT'),
 ('chairman', 'NN'),
 ('and', 'CC'),
 ('majority', 'NN'),
 ('shareholder', 'NN'),
 ('of', 'IN'),
 ('Pixar', 'NNP'),
 (';', ':'),
 ('a', 'DT'),
 ('member', 'NN'),
 ('of', 'IN'),
 ('The', 'DT'),
 ('Walt', 'NNP'),
 ('Disney', 'NNP'),
 ('Company', 'NNP'),
 ("'s", 'POS'),
 ('board', 'NN'),
 ('of', 'IN'),
 ('directors', 'NNS'),
 ('following', 'VBG'),
 ('its', 

### NER

In [59]:
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.


True

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [60]:
from nltk import ne_chunk

In [61]:
# import matplotlib as mpl
# import os

print(text, '\n')
for chunk in nltk.ne_chunk(pos):
      if hasattr(chunk, 'label'):
        print(chunk.label(), ' '.join(c[0] for c in chunk))

Steven Paul Jobs was an American business magnate, industrial designer, investor, and media proprietor. 
He was the chairman, chief executive officer (CEO), and co-founder of Apple Inc.; the chairman and majority shareholder of Pixar; 
a member of The Walt Disney Company's board of directors following its acquisition of Pixar; and the founder, chairman, and CEO of NeXT. 

PERSON Steven
PERSON Paul Jobs
GPE American
ORGANIZATION CEO
ORGANIZATION Apple Inc.
GPE Pixar
ORGANIZATION Walt Disney Company
GPE Pixar
ORGANIZATION CEO
ORGANIZATION NeXT


## Cleaning Function

In [63]:
"I don't want to fly with your company.".replace("'", "")

'I dont want to fly with your company.'

In [64]:
def cleaning(data):
    
    #1. Contractions Expension & Tokenize
    #text_tokens = word_tokenize(contractions.fix(data.lower())) 
    text_tokens = word_tokenize(data.replace("'", '').lower())
    
    #2. Remove Puncs
    tokens_without_punc = [w for w in text_tokens if w.isalpha()]
    
    #3. Removing Stopwords
    tokens_without_sw = [t for t in tokens_without_punc if t not in stop_words]
    
    #4. lemma
    text_cleaned = [WordNetLemmatizer().lemmatize(t) for t in tokens_without_sw]
    
    #joining
    return " ".join(text_cleaned)

## CountVectorization and TF-IDF Vectorization

#### Data
🔑 Source: https://www.kaggle.com/crowdflower/twitter-airline-sentiment?select=Tweets.csv

In [65]:
# For Colab
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [66]:
import pandas as pd
df = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/Clarusway_NLP/Clarusway/clarusway-ds-students-7-21-main/3- Classes_Labs/NLP/NLP-1/airline_tweets.csv')
#df = pd.read_csv("airline_tweets.csv")

In [67]:
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials to the experience... tacky.,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I need to take another trip!,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse",,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing about it,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      14640 non-null  int64  
 1   airline_sentiment             14640 non-null  object 
 2   airline_sentiment_confidence  14640 non-null  float64
 3   negativereason                9178 non-null   object 
 4   negativereason_confidence     10522 non-null  float64
 5   airline                       14640 non-null  object 
 6   airline_sentiment_gold        40 non-null     object 
 7   name                          14640 non-null  object 
 8   negativereason_gold           32 non-null     object 
 9   retweet_count                 14640 non-null  int64  
 10  text                          14640 non-null  object 
 11  tweet_coord                   1019 non-null   object 
 12  tweet_created                 14640 non-null  object 
 13  t

In [69]:
df = df[['airline_sentiment','text']]
df.rename(columns={'airline_sentiment':'sentiment'}, inplace=True)
df.sample(20)

Unnamed: 0,sentiment,text
13676,negative,@AmericanAir my mom’s flight tomorrow Cancelled Flighted w/ no notice. Phone system said callback in 2 hours and it’s been 4. I need to go to bed!
12710,positive,"@AmericanAir thanks. I actually made it, my connection flight was delayed. Guess all delays are not a bad thing. http://t.co/XGgCNTco8m"
6976,neutral,@JetBlue two rows
8444,positive,"Hey @JetBlue, that's a sexy tattoo you got there on your left engine. #Jetbae http://t.co/Ox4w6KtsGI"
10543,negative,@USAirways the American Eagle plane you're using for CLT to RDU is disgusting! You should be ashamed! #disgusting #ew http://t.co/B4xhiRuGzV
4870,neutral,@SouthwestAir yes please.my son lives in NJ.
529,negative,@united beginning of Feb I called United they said they would send another voucher by mail. Never got anything. #tiredofwaiting
7573,positive,@JetBlue Thanks for the personalized customer service! #cannedtweet #autoresponse
8783,positive,@JetBlue That's great! Thank you.
588,negative,@united another fail for the United ticket agents in OKC. LESS than helpful and could care less about our problems. American here we come.


In [70]:
df2 = df.copy()

In [71]:
df2["clean_text"] = df2["text"].apply(cleaning)


Unnamed: 0,sentiment,text,clean_text
0,neutral,@VirginAmerica What @dhepburn said.,virginamerica dhepburn said
1,positive,@VirginAmerica plus you've added commercials to the experience... tacky.,virginamerica plus youve added commercial experience tacky
2,neutral,@VirginAmerica I didn't today... Must mean I need to take another trip!,virginamerica didnt today must mean need take another trip
3,negative,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse",virginamerica really aggressive blast obnoxious entertainment guest face amp little recourse
4,negative,@VirginAmerica and it's a really big bad thing about it,virginamerica really big bad thing


In [72]:
df2.sample(10)

Unnamed: 0,sentiment,text,clean_text
11358,negative,"@USAirways that's understandable, my issue is with creating a new flight without the personnel to do it...I changed my plans to accommodate",usairways thats understandable issue creating new flight without personnel changed plan accommodate
77,neutral,@VirginAmerica first time flying you all. do you have a different rate/policy for media Bags? Thanks,virginamerica first time flying different medium bag thanks
11131,negative,@USAirways @AmericanAir Will you be destroying lives in Eastern @GastonCounty during this lively little event? For profits you will.,usairways americanair destroying life eastern gastoncounty lively little event profit
8606,neutral,@JetBlue I sent you an email,jetblue sent email
7585,neutral,@JetBlue whatever your lil #mint heart desires! http://t.co/WmX12F33ZC,jetblue whatever lil mint heart desire http
1464,negative,@united the delay is due to customer service for 20 people? What about the DIS-SERVICE you provided for 100+? #FlightFail #Hour20Delay,united delay due customer service people provided flightfail
12204,negative,"@AmericanAir 1-the lavatory freezes, 2- problem with a nitrogen line 3-a low tire with the inflating equipment malfunctioning #AA2444 and...",americanair lavatory freeze problem nitrogen line low tire inflating equipment malfunctioning
11905,neutral,@AmericanAir I did,americanair
14631,negative,@AmericanAir thx for nothing on getting us out of the country and back to US. Broken plane? Come on. Get another one.,americanair thx nothing getting u country back u broken plane come get another one
992,negative,"@united found a group of people who actually hates people more than ISIS, thats right United Airlines.",united found group people actually hate people isi thats right united airline


In [73]:
# URLs
df2[df2['clean_text'].str.contains('http')].head()

Unnamed: 0,sentiment,text,clean_text
7,neutral,"@VirginAmerica Really missed a prime opportunity for Men Without Hats parody, there. https://t.co/mWpG7grEZP",virginamerica really missed prime opportunity men without hat parody http
13,positive,@VirginAmerica @virginmedia I'm flying your #fabulous #Seductive skies again! U take all the #stress away from travel http://t.co/ahlXHhKiyn,virginamerica virginmedia im flying fabulous seductive sky u take stress away travel http
21,positive,@VirginAmerica I love this graphic. http://t.co/UT5GrRwAaA,virginamerica love graphic http
34,positive,@VirginAmerica this is great news! America could start flights to Hawaii by end of year http://t.co/r8p2Zy3fe4 via @Pacificbiznews,virginamerica great news america could start flight hawaii end year http via pacificbiznews
35,neutral,Nice RT @VirginAmerica: Vibe with the moodlight from takeoff to touchdown. #MoodlitMonday #ScienceBehindTheExperience http://t.co/Y7O0uNxTQP,nice rt virginamerica vibe moodlight takeoff touchdown moodlitmonday sciencebehindtheexperience http


In [74]:
# Tags
df2[df2['text'].str.contains('#')].head(3)

Unnamed: 0,sentiment,text,clean_text
13,positive,@VirginAmerica @virginmedia I'm flying your #fabulous #Seductive skies again! U take all the #stress away from travel http://t.co/ahlXHhKiyn,virginamerica virginmedia im flying fabulous seductive sky u take stress away travel http
16,positive,@VirginAmerica So excited for my first cross country flight LAX to MCO I've heard nothing but great things about Virgin America. #29DaysToGo,virginamerica excited first cross country flight lax mco ive heard nothing great thing virgin america
26,negative,@VirginAmerica What happened 2 ur vegan food options?! At least say on ur site so i know I won't be able 2 eat anything for next 6 hrs #fail,virginamerica happened ur vegan food option least say ur site know wont able eat anything next hr fail


In [75]:
def updated_cleaning(data):

    import re
    
    #1. Removing URLS
    data = re.sub(r'http\S+', '', data)

    #2. Removing Tags
    data = re.sub(r'#\w+', '', data)

    #3. Removing Mentions
    data = re.sub(r'@\w+', '', data)

    #4. Contractions Expension & Tokenize
    #text_tokens = word_tokenize(contractions.fix(data.lower())) 
    text_tokens = word_tokenize(data.replace("'", '').lower())

    #5. Removing mentions
    tokens_without_mention = [w for w in text_tokens if not w.startswith('@')]
    
    #6. Remove Puncs
    tokens_without_punc = [w for w in tokens_without_mention if w.isalpha()]
    
    #7. Removing Stopwords
    tokens_without_sw = [t for t in tokens_without_punc if t not in stop_words]
    
    #8. lemma
    text_cleaned = [WordNetLemmatizer().lemmatize(t) for t in tokens_without_sw]
    
    #joining
    return " ".join(text_cleaned)

In [76]:
df2["clean_text"] = df2["text"].apply(updated_cleaning)

In [77]:
df2[df2['clean_text'].str.contains('http')]
df2[df2['clean_text'].str.contains('#')]
df2[df2['clean_text'].str.contains('@')]

Unnamed: 0,sentiment,text,clean_text


Unnamed: 0,sentiment,text,clean_text


Unnamed: 0,sentiment,text,clean_text


In [78]:
df2.sample(10)

Unnamed: 0,sentiment,text,clean_text
2753,neutral,@united I just had an interview how long does it take before you hear back,interview long take hear back
8243,positive,@JetBlue excellent. you guys are the best,excellent guy best
2717,negative,@United can you let us out of the gate now. UA1157,let u gate
12356,negative,@AmericanAir how do you NOT do maintenance on #MD80 while it sits for two days? Frozen lines found after its boarded? Come on! #faail #mci,maintenance sits two day frozen line found boarded come
5706,neutral,@SouthwestAir what is the status on flight#122 STL-AUS?,status flight
3350,negative,@united question - was given food vouchers but can't use on plane..how come,question given food voucher cant use come
12932,negative,@AmericanAir im tryin to book a flight but cant get ahold of anyone!,im tryin book flight cant get ahold anyone
11963,negative,"@AmericanAir lost my cats, missed their flights, kept them crated 30 hrs for a would-be 5 hr trip. You'll never touch my pets again.",lost cat missed flight kept crated hr hr trip youll never touch pet
4610,negative,@SouthwestAir- is new #MKT strategy to be average like all the rest? #whathappend? RR Points Devalued- AGAIN -http://t.co/mDbDYomrs7,new strategy average like rest rr point
14084,negative,@AmericanAir @cjdjpdx not a valid response in 2015 for a multinational corp whose profits are measured in billions. Stop understaffing!,valid response multinational corp whose profit measured billion stop understaffing


## CountVectorization

In [106]:
X = df2["clean_text"]
y = df2["sentiment"]

In [80]:
from sklearn.model_selection import train_test_split

In [88]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 4299)

In [89]:
X_train.head()

3013     haha clean plane held overnight hangar sound lovely also dont lie screensand say weather
13868                                                                               let seriously
2391               hello flying first class behind people zone pls pas app dept board class first
6979                                                  thanks much talking article chat came great
8721                                                                    claimed happy way treated
Name: clean_text, dtype: object

In [84]:
from sklearn.feature_extraction.text import CountVectorizer

In [90]:
vectorizer1 = CountVectorizer()
X_train_count1 = vectorizer1.fit_transform(X_train)

In [95]:
vectorizer1.get_feature_names()

['aa',
 'aaaand',
 'aadavantage',
 'aadv',
 'aadvantage',
 'aal',
 'aaron',
 'ab',
 'aback',
 'abandon',
 'abandoned',
 'abandonment',
 'abassinet',
 'abbreve',
 'abc',
 'abducted',
 'ability',
 'able',
 'aboard',
 'abounds',
 'abq',
 'abroad',
 'absolute',
 'absolutely',
 'absorb',
 'absorber',
 'absoulutely',
 'absurd',
 'absurdity',
 'abt',
 'abundance',
 'abuse',
 'abused',
 'abysmal',
 'ac',
 'acc',
 'accelerate',
 'accept',
 'acceptable',
 'accepted',
 'accepting',
 'acces',
 'access',
 'accessible',
 'accessing',
 'accident',
 'accidentally',
 'accomidating',
 'accommodate',
 'accommodated',
 'accommodating',
 'accommodation',
 'accompaniment',
 'accomplish',
 'accomplished',
 'according',
 'accordingly',
 'account',
 'accountability',
 'accountable',
 'accrue',
 'acct',
 'accts',
 'accumulation',
 'accurate',
 'accurately',
 'accused',
 'acknowledge',
 'acknowledgement',
 'acknowledgment',
 'acosta',
 'acoustic',
 'acpt',
 'acquire',
 'acquired',
 'acquisition',
 'across',
 'ac

In [94]:
len(vectorizer1.get_feature_names())

7696

In [96]:
df2['clean_text']

0                                                                                                        said
1                                                                plus youve added commercial experience tacky
2                                                                didnt today must mean need take another trip
3                              really aggressive blast obnoxious entertainment guest face amp little recourse
4                                                                                        really big bad thing
                                                         ...                                                 
14635                                                                      thank got different flight chicago
14637                                                                           please bring american airline
14638                                        money change flight dont answer phone suggestion make commitment
14639     

In [97]:
df2['clean_text'].str.split()

0                                                                                                                     [said]
1                                                                        [plus, youve, added, commercial, experience, tacky]
2                                                                      [didnt, today, must, mean, need, take, another, trip]
3                                  [really, aggressive, blast, obnoxious, entertainment, guest, face, amp, little, recourse]
4                                                                                                  [really, big, bad, thing]
                                                                ...                                                         
14635                                                                               [thank, got, different, flight, chicago]
14637                                                                                     [please, bring, american, airline]


In [100]:
X_train

3013     haha clean plane held overnight hangar sound lovely also dont lie screensand say weather
13868                                                                               let seriously
2391               hello flying first class behind people zone pls pas app dept board class first
6979                                                  thanks much talking article chat came great
8721                                                                    claimed happy way treated
                                                   ...                                           
2695                                question interview process one interview last person good bad
9228                            ive hold hour cc mile arent showing mediocre combo cc amp airline
5237     yes total hour hold cancelled flightlations one would think would staff decided cx drive
4022                                                       chicago hometown airline care neighbor
8237                

In [99]:
words = pd.Series(" ".join(X_train).split()).value_counts()

len(words)
print('')
len(vectorizer1.get_feature_names())

7715




7696

In [104]:
words

flight       3608
get          1102
hour          929
thanks        852
cancelled     841
             ... 
mo              1
tim             1
shipping        1
anderson        1
relaxing        1
Length: 7715, dtype: int64

In [101]:
one_chars_from_words = [w for w in words.index if len(w)==1]
one_Chars_from_cv = [w for w in vectorizer1.get_feature_names() if len(w)==1]

In [103]:
one_chars_from_words

['u',
 'w',
 'b',
 'c',
 'r',
 'n',
 'f',
 'p',
 'x',
 'v',
 'l',
 'e',
 'h',
 'j',
 'k',
 'g',
 'q',
 'o',
 'm']

In [102]:
len(one_chars_from_words)

19

#### min_df

In [107]:
X = df2.copy()#['text']

In [108]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 4299)

In [110]:
X_train.head()

Unnamed: 0,sentiment,text,clean_text
3013,negative,@united haha and you have to clean a plane that was held overnight in a hangar. Sounds lovely. Also don't lie on screensand say it's weather,haha clean plane held overnight hangar sound lovely also dont lie screensand say weather
13868,negative,@AmericanAir you have let me down. Seriously. #unhappycustomer,let seriously
2391,negative,@united hello I am flying first class and am behind 20 people on zone 1!!!!! Pls pass on to app dept - you should board 1st class first,hello flying first class behind people zone pls pas app dept board class first
6979,positive,@JetBlue Thanks so much for talking to me! The article about #Twitter chats came out great! http://t.co/rKorHvR9z1 #contentmarketing,thanks much talking article chat came great
8721,negative,"@JetBlue She claimed that she did, but I was not happy with the way I was treated.",claimed happy way treated


In [109]:
vectorizer2 = CountVectorizer(preprocessor=updated_cleaning, min_df=2, ngram_range=(1,2))
X_train_count2 = vectorizer2.fit_transform(X_train['text'])
X_test_count2 = vectorizer2.transform(X_test['text'])

In [111]:
len(vectorizer2.get_feature_names())

13234

In [113]:
vectorizer2.get_feature_names_out()[3000:3050]

array(['disaster', 'disconnect', 'disconnected', 'disconnected call',
       'disconnected customer', 'disconnected please',
       'disconnected time', 'disconnection', 'discontinued', 'discount',
       'discount airfare', 'discovered', 'discrimination', 'discus',
       'discus experience', 'disgrace', 'disgraceful', 'disgusted',
       'disgusting', 'dislike', 'dislike delay', 'disney',
       'disorganization', 'disorganized', 'dispatch', 'dispatcher',
       'display', 'displayed', 'displeased', 'disregard',
       'disregard customer', 'disrespect', 'disrespectful', 'disrupted',
       'disruption', 'dissapointed', 'dissatisfied', 'distance',
       'distribution', 'diversion', 'divert', 'diverted', 'dividend',
       'dividend member', 'dividend mile', 'dl', 'dm', 'dm conf',
       'dm confirmation', 'dm detail'], dtype=object)

In [115]:
X_train_count2.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [118]:
count_df = pd.DataFrame(X_train_count2.toarray(), columns = vectorizer2.get_feature_names())
count_df.iloc[:,500:590]

Unnamed: 0,amp getting,amp give,amp got,amp great,amp half,amp hang,amp help,amp hold,amp ice,amp ill,...,another flight,another gate,another go,another great,another grievance,another hour,another hr,another min,another minute,another night
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11707,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11708,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11709,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11710,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [119]:
# TOP 20 TOKENS
print('Before min_df')
list(zip(vectorizer1.get_feature_names(), X_train_count1.toarray().sum(axis=0)))[:20]
print('\n')
print('After min_df')
list(zip(vectorizer2.get_feature_names(), X_train_count2.toarray().sum(axis=0)))[:20]

Before min_df


[('aa', 167),
 ('aaaand', 1),
 ('aadavantage', 1),
 ('aadv', 2),
 ('aadvantage', 9),
 ('aal', 1),
 ('aaron', 1),
 ('ab', 1),
 ('aback', 1),
 ('abandon', 1),
 ('abandoned', 1),
 ('abandonment', 1),
 ('abassinet', 1),
 ('abbreve', 1),
 ('abc', 6),
 ('abducted', 1),
 ('ability', 4),
 ('able', 93),
 ('aboard', 3),
 ('abounds', 1)]



After min_df


[('aa', 167),
 ('aa agent', 3),
 ('aa amp', 4),
 ('aa customer', 4),
 ('aa dallas', 2),
 ('aa doesnt', 2),
 ('aa email', 2),
 ('aa employee', 4),
 ('aa family', 2),
 ('aa flight', 5),
 ('aa gate', 2),
 ('aa gold', 2),
 ('aa help', 3),
 ('aa mile', 2),
 ('aa monday', 2),
 ('aa number', 2),
 ('aa platinum', 2),
 ('aa possible', 2),
 ('aa usair', 2),
 ('aa would', 2)]

## TF-IDF

🔑 : sklearn TD-IDF
https://towardsdatascience.com/how-sklearns-tf-idf-is-different-from-the-standard-tf-idf-275fa582e73d

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer