## Install and Import

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# natural language toolkit
!pip install nltk contractions

Collecting contractions
  Downloading contractions-0.1.68-py2.py3-none-any.whl (8.1 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
[K     |████████████████████████████████| 106 kB 27.6 MB/s 
[?25hCollecting anyascii
  Downloading anyascii-0.3.0-py3-none-any.whl (284 kB)
[K     |████████████████████████████████| 284 kB 36.2 MB/s 
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.0 contractions-0.1.68 pyahocorasick-1.4.4 textsearch-0.0.21


🔑    :     https://www.nltk.org/api/nltk.tokenize.html

In [None]:
import nltk
import numpy as np
import pandas as pd 
import contractions

### Notebook settings

In [None]:
pd.set_option('display.max_colwidth', None)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

## Tokenization

In [None]:
sample_text= """This is pretty cool. A good quality candy might cost $3.88 in New York. 
                But I don't think we buy it. Mr.Biden said $1,000,000. 2 cars."""

In [None]:
dir(nltk.tokenize)

['BlanklineTokenizer',
 'LineTokenizer',
 'MWETokenizer',
 'PunktSentenceTokenizer',
 'RegexpTokenizer',
 'ReppTokenizer',
 'SExprTokenizer',
 'SpaceTokenizer',
 'StanfordSegmenter',
 'TabTokenizer',
 'TextTilingTokenizer',
 'ToktokTokenizer',
 'TreebankWordTokenizer',
 'TweetTokenizer',
 'WhitespaceTokenizer',
 'WordPunctTokenizer',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_treebank_word_tokenizer',
 'api',
 'blankline_tokenize',
 'casual',
 'casual_tokenize',
 'improved_close_quote_regex',
 'improved_open_quote_regex',
 'improved_punct_regex',
 'line_tokenize',
 'load',
 'mwe',
 'punkt',
 're',
 'regexp',
 'regexp_span_tokenize',
 'regexp_tokenize',
 'repp',
 'sent_tokenize',
 'sexpr',
 'sexpr_tokenize',
 'simple',
 'stanford_segmenter',
 'string_span_tokenize',
 'texttiling',
 'toktok',
 'treebank',
 'util',
 'word_tokenize',
 'wordpunct_tokenize']

In [None]:
from nltk.tokenize import sent_tokenize, wordpunct_tokenize, word_tokenize

#### Sentence Tokenization

In [None]:
# Download the Punkt models required by NLTK's tokenizers
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
sentence_tokens = sent_tokenize(sample_text.lower()) # Sensitive to punctuation '.' vs ','
print(sentence_tokens)

['this is pretty cool.', 'a good quality candy might cost $3.88 in new york.', "but i don't think we buy it.", 'mr.biden said $1,000,000.', '2 cars.']


#### WordPunct Tokenization

In [None]:
wordpunc_tokens = wordpunct_tokenize(sample_text.lower()) # regular-expression based tokenizer, which splits text on whitespace and punctuation
print(wordpunc_tokens)

['this', 'is', 'pretty', 'cool', '.', 'a', 'good', 'quality', 'candy', 'might', 'cost', '$', '3', '.', '88', 'in', 'new', 'york', '.', 'but', 'i', 'don', "'", 't', 'think', 'we', 'buy', 'it', '.', 'mr', '.', 'biden', 'said', '$', '1', ',', '000', ',', '000', '.', '2', 'cars', '.']


#### Word Tokenization

In [None]:
word_tokens = word_tokenize(sample_text.lower())
print(word_tokens)

['this', 'is', 'pretty', 'cool', '.', 'a', 'good', 'quality', 'candy', 'might', 'cost', '$', '3.88', 'in', 'new', 'york', '.', 'but', 'i', 'do', "n't", 'think', 'we', 'buy', 'it', '.', 'mr.biden', 'said', '$', '1,000,000', '.', '2', 'cars', '.']


## Removing Punctuation and Numbers

In [None]:
tokens_without_punc = [w for w in word_tokens if w.isalpha()] # .isalnum() for number and object # we are losing mr.biden
print(tokens_without_punc)

['this', 'is', 'pretty', 'cool', 'a', 'good', 'quality', 'candy', 'might', 'cost', 'in', 'new', 'york', 'but', 'i', 'do', 'think', 'we', 'buy', 'it', 'said', 'cars']


## Removing Stopwords

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords

In [None]:
stop_words = stopwords.words("english")
print(stop_words)
print('len stop_words :', len(stop_words))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
words_to_exclude_from_stopwords = ['not', "n't", 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", "don't", 'hadn', 
                                   "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 
                                   'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', 
                                   "won't", 'wouldn', "wouldn't"]

new_stopwords = [w for w in stop_words if w not in words_to_exclude_from_stopwords]
print('len new_stop_words :', len(new_stopwords))

len new_stop_words : 141


In [None]:
print(tokens_without_punc)

['this', 'is', 'pretty', 'cool', 'a', 'good', 'quality', 'candy', 'might', 'cost', 'in', 'new', 'york', 'but', 'i', 'do', 'think', 'we', 'buy', 'it', 'said', 'cars']


In [None]:
token_without_sw = [t for t in tokens_without_punc if t not in new_stopwords] # stop_words
print(token_without_sw)

['pretty', 'cool', 'good', 'quality', 'candy', 'might', 'cost', 'new', 'york', 'think', 'buy', 'said', 'cars']


## Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
WordNetLemmatizer().lemmatize("driving")
WordNetLemmatizer().lemmatize("driver")
WordNetLemmatizer().lemmatize("drives")

'driving'

'driver'

'drive'

In [None]:
# Lemmatize every remaining token by mapping one lemmatizer's bound method over the list.
lem = list(map(WordNetLemmatizer().lemmatize, token_without_sw))

In [None]:
print(token_without_sw)
print(lem)

['pretty', 'cool', 'good', 'quality', 'candy', 'might', 'cost', 'new', 'york', 'think', 'buy', 'said', 'cars']
['pretty', 'cool', 'good', 'quality', 'candy', 'might', 'cost', 'new', 'york', 'think', 'buy', 'said', 'car']


## Stemming

In [None]:
from nltk.stem import PorterStemmer

In [None]:
PorterStemmer().stem("driving")
PorterStemmer().stem("driver")
PorterStemmer().stem("drives")

'drive'

'driver'

'drive'

In [None]:
# Stem every remaining token by mapping one stemmer's bound method over the list.
stem = list(map(PorterStemmer().stem, token_without_sw))

In [None]:
print('w/o norm :', token_without_sw)
print('stem     :', stem)
print('lemma    :', lem)

w/o norm : ['pretty', 'cool', 'good', 'quality', 'candy', 'might', 'cost', 'new', 'york', 'think', 'buy', 'said', 'cars']
stem     : ['pretti', 'cool', 'good', 'qualiti', 'candi', 'might', 'cost', 'new', 'york', 'think', 'buy', 'said', 'car']
lemma    : ['pretty', 'cool', 'good', 'quality', 'candy', 'might', 'cost', 'new', 'york', 'think', 'buy', 'said', 'car']


## Joining

In [None]:
" ".join(lem)

'pretty cool good quality candy might cost new york think buy said car'

### Expanding Contractions

In [None]:
my_text  = word_tokenize(contractions.fix("I won't be there"))#"I'll go there I've got a book".lower()))
my_text
#[w for w in my_text if w.isalpha()]

['I', 'will', 'not', 'be', 'there']

### Part of Speech Tag


In [None]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
from nltk import pos_tag

In [None]:
text = """Steven Paul Jobs was an American business magnate, industrial designer, investor, and media proprietor. 
He was the chairman, chief executive officer (CEO), and co-founder of Apple Inc.; the chairman and majority shareholder of Pixar; 
a member of The Walt Disney Company's board of directors following its acquisition of Pixar; and the founder, chairman, and CEO of NeXT."""

In [None]:
tokens = word_tokenize(text)
pos = pos_tag(tokens)
pos

[('Steven', 'NNP'),
 ('Paul', 'NNP'),
 ('Jobs', 'NNP'),
 ('was', 'VBD'),
 ('an', 'DT'),
 ('American', 'JJ'),
 ('business', 'NN'),
 ('magnate', 'NN'),
 (',', ','),
 ('industrial', 'JJ'),
 ('designer', 'NN'),
 (',', ','),
 ('investor', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('media', 'NNS'),
 ('proprietor', 'NN'),
 ('.', '.'),
 ('He', 'PRP'),
 ('was', 'VBD'),
 ('the', 'DT'),
 ('chairman', 'NN'),
 (',', ','),
 ('chief', 'JJ'),
 ('executive', 'NN'),
 ('officer', 'NN'),
 ('(', '('),
 ('CEO', 'NNP'),
 (')', ')'),
 (',', ','),
 ('and', 'CC'),
 ('co-founder', 'NN'),
 ('of', 'IN'),
 ('Apple', 'NNP'),
 ('Inc.', 'NNP'),
 (';', ':'),
 ('the', 'DT'),
 ('chairman', 'NN'),
 ('and', 'CC'),
 ('majority', 'NN'),
 ('shareholder', 'NN'),
 ('of', 'IN'),
 ('Pixar', 'NNP'),
 (';', ':'),
 ('a', 'DT'),
 ('member', 'NN'),
 ('of', 'IN'),
 ('The', 'DT'),
 ('Walt', 'NNP'),
 ('Disney', 'NNP'),
 ('Company', 'NNP'),
 ("'s", 'POS'),
 ('board', 'NN'),
 ('of', 'IN'),
 ('directors', 'NNS'),
 ('following', 'VBG'),
 ('its', 

### NER

In [None]:
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [None]:
from nltk import ne_chunk

In [None]:
import matplotlib as mpl
import os

# Show the raw text, then one line per named-entity subtree: "<LABEL> <entity words>".
print(text, '\n')
for node in nltk.ne_chunk(pos):
    # Plain (word, tag) tuples have no `label` attribute; only chunked subtrees do.
    if not hasattr(node, 'label'):
        continue
    entity_words = [leaf[0] for leaf in node]
    print(node.label(), ' '.join(entity_words))

Steven Paul Jobs was an American business magnate, industrial designer, investor, and media proprietor. 
He was the chairman, chief executive officer (CEO), and co-founder of Apple Inc.; the chairman and majority shareholder of Pixar; 
a member of The Walt Disney Company's board of directors following its acquisition of Pixar; and the founder, chairman, and CEO of NeXT. 

PERSON Steven
PERSON Paul Jobs
GPE American
ORGANIZATION CEO
ORGANIZATION Apple Inc.
GPE Pixar
ORGANIZATION Walt Disney Company
GPE Pixar
ORGANIZATION CEO
ORGANIZATION NeXT


## Cleaning Function

In [None]:
##### !!!!!!!!!!
a = "I don't want to fly with your company." # vs 
[token for token in word_tokenize(a) if token not in stop_words] 
b = "I do not want to fly with your company" 
[token for token in word_tokenize(b) if token not in stop_words] 

['I', "n't", 'want', 'fly', 'company', '.']

['I', 'want', 'fly', 'company']

In [None]:
"I don't want to fly with your company.".replace("'", "")

'I dont want to fly with your company.'

In [None]:
def cleaning(data):
    """Normalize one raw text string into a space-joined string of lemmas.

    Steps: drop apostrophes and lower-case, tokenize with ``word_tokenize``,
    keep purely alphabetic tokens, drop tokens found in ``stop_words``, and
    lemmatize what remains with ``WordNetLemmatizer``.

    Apostrophes are stripped rather than expanded (the ``contractions.fix``
    variant is kept below, commented out), so "don't" becomes the single
    token "dont" instead of being split into "do" and "n't".

    NOTE(review): filtering uses the full ``stop_words`` list, so a standalone
    "not" is removed; ``new_stopwords`` (defined earlier in this notebook)
    keeps negations -- confirm which list is intended here.

    Parameters
    ----------
    data : str
        Raw text of a single document.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" if none survive).
    """
    # One lemmatizer for the whole call instead of a new instance per token.
    lemmatizer = WordNetLemmatizer()

    #1. Contractions Expansion & Tokenize
    #text_tokens = word_tokenize(contractions.fix(data.lower()))
    text_tokens = word_tokenize(data.replace("'", '').lower())

    #2. Remove punctuation and numbers (anything that is not purely alphabetic)
    tokens_without_punc = [w for w in text_tokens if w.isalpha()]

    #3. Removing Stopwords
    tokens_without_sw = [t for t in tokens_without_punc if t not in stop_words]

    #4. Lemmatization
    text_cleaned = [lemmatizer.lemmatize(t) for t in tokens_without_sw]

    #joining
    return " ".join(text_cleaned)

## CountVectorization and TF-IDF Vectorization

#### Data
🔑 Source: https://www.kaggle.com/crowdflower/twitter-airline-sentiment?select=Tweets.csv

In [None]:
# For Colab: mount Google Drive so the dataset stored there can be read.
# Outside Colab the `google.colab` package is not installed, so mounting is skipped
# instead of aborting the notebook with an ImportError.
try:
    from google.colab import drive
except ImportError:
    drive = None
else:
    drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Prefer the copy on the mounted Google Drive (Colab); otherwise fall back to a
# CSV placed next to the notebook so the cell also runs outside Colab.
from pathlib import Path

colab_csv = Path('/content/gdrive/MyDrive/Colab Notebooks/Clarusway_NLP/Clarusway/clarusway-ds-students-7-21-main/3- Classes_Labs/NLP/NLP-1/airline_tweets.csv')
local_csv = Path("airline_tweets.csv")
df = pd.read_csv(colab_csv if colab_csv.exists() else local_csv)

In [None]:
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials to the experience... tacky.,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I need to take another trip!,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse",,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing about it,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      14640 non-null  int64  
 1   airline_sentiment             14640 non-null  object 
 2   airline_sentiment_confidence  14640 non-null  float64
 3   negativereason                9178 non-null   object 
 4   negativereason_confidence     10522 non-null  float64
 5   airline                       14640 non-null  object 
 6   airline_sentiment_gold        40 non-null     object 
 7   name                          14640 non-null  object 
 8   negativereason_gold           32 non-null     object 
 9   retweet_count                 14640 non-null  int64  
 10  text                          14640 non-null  object 
 11  tweet_coord                   1019 non-null   object 
 12  tweet_created                 14640 non-null  object 
 13  t

In [None]:
# Keep only the label and the tweet text. The rename is chained onto the selection
# (no `inplace=True`), so no slice of the original frame is mutated in place.
df = df[['airline_sentiment','text']].rename(columns={'airline_sentiment':'sentiment'})
df.sample(20)

Unnamed: 0,sentiment,text
11195,negative,@USAirways another bad experience today. Frozen pipes on 597. Missing connection.
13773,negative,"@AmericanAir Thank you for holding, we apologize for the delay in answering your call. To speak to a representative please continue to hold."
12916,negative,@AmericanAir I guess it is more BS AirportCardio given you cannot have an on time flight
10757,negative,"@USAirways it MIGHT have went through. No confirmation. ""Please wait"" then same form."
12503,positive,@AmericanAir lovely flight back from MIA to LHR - great crew - thanks :-))
737,negative,@lindaSWC @united: We don't like to hear you had a poor experience. Please share details w/our Customer Care team http://t.co/HIsc4NdMgZ.
2218,negative,"@united #customerservice at @Dulles_Airport could not be worse. I get the bad weather, but this is awful."
9625,positive,@USAirways YOU ARE THE BEST!!! YOU ARE AMAZING!!! FOLLOW ME PLEASE;)🙏🙏🙏
5873,positive,@SouthwestAir me &amp; @sammi_jon3s are best friends because of @Imaginedragons. Any chance we could get tickets to #DestinationDragons ?
10505,negative,@USAirways Your Baggage system has hung up on me twice because you have too many callers. I NEED TO FIND MY HUSBAND'S (@SweetingR) BAGS.


In [None]:
df2 = df.copy()

In [None]:
df2["clean_text"] = df2["text"].apply(cleaning)
df2.head()

Unnamed: 0,sentiment,text,clean_text
0,neutral,@VirginAmerica What @dhepburn said.,virginamerica dhepburn said
1,positive,@VirginAmerica plus you've added commercials to the experience... tacky.,virginamerica plus youve added commercial experience tacky
2,neutral,@VirginAmerica I didn't today... Must mean I need to take another trip!,virginamerica didnt today must mean need take another trip
3,negative,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse",virginamerica really aggressive blast obnoxious entertainment guest face amp little recourse
4,negative,@VirginAmerica and it's a really big bad thing about it,virginamerica really big bad thing


In [None]:
# URLs
df2[df2['clean_text'].str.contains('http')].head()

Unnamed: 0,sentiment,text,clean_text
7,neutral,"@VirginAmerica Really missed a prime opportunity for Men Without Hats parody, there. https://t.co/mWpG7grEZP",virginamerica really missed prime opportunity men without hat parody http
13,positive,@VirginAmerica @virginmedia I'm flying your #fabulous #Seductive skies again! U take all the #stress away from travel http://t.co/ahlXHhKiyn,virginamerica virginmedia im flying fabulous seductive sky u take stress away travel http
21,positive,@VirginAmerica I love this graphic. http://t.co/UT5GrRwAaA,virginamerica love graphic http
34,positive,@VirginAmerica this is great news! America could start flights to Hawaii by end of year http://t.co/r8p2Zy3fe4 via @Pacificbiznews,virginamerica great news america could start flight hawaii end year http via pacificbiznews
35,neutral,Nice RT @VirginAmerica: Vibe with the moodlight from takeoff to touchdown. #MoodlitMonday #ScienceBehindTheExperience http://t.co/Y7O0uNxTQP,nice rt virginamerica vibe moodlight takeoff touchdown moodlitmonday sciencebehindtheexperience http


In [None]:
# Tags
df2[df2['text'].str.contains('#')].head(3)

Unnamed: 0,sentiment,text,clean_text
13,positive,@VirginAmerica @virginmedia I'm flying your #fabulous #Seductive skies again! U take all the #stress away from travel http://t.co/ahlXHhKiyn,virginamerica virginmedia im flying fabulous seductive sky u take stress away travel http
16,positive,@VirginAmerica So excited for my first cross country flight LAX to MCO I've heard nothing but great things about Virgin America. #29DaysToGo,virginamerica excited first cross country flight lax mco ive heard nothing great thing virgin america
26,negative,@VirginAmerica What happened 2 ur vegan food options?! At least say on ur site so i know I won't be able 2 eat anything for next 6 hrs #fail,virginamerica happened ur vegan food option least say ur site know wont able eat anything next hr fail


In [None]:
# Mentions
df2[df2['text'].str.contains('@')].head(3)

Unnamed: 0,sentiment,text,clean_text
0,neutral,@VirginAmerica What @dhepburn said.,virginamerica dhepburn said
1,positive,@VirginAmerica plus you've added commercials to the experience... tacky.,virginamerica plus youve added commercial experience tacky
2,neutral,@VirginAmerica I didn't today... Must mean I need to take another trip!,virginamerica didnt today must mean need take another trip


In [None]:
df2.sample(10)

Unnamed: 0,sentiment,text,clean_text
1725,neutral,@united...do you still have flat tire policy. Shuttle broke down on way to ORD. Will probably miss the 425pm to CLE...Help please!!,united still flat tire policy shuttle broke way ord probably miss cle help please
11951,negative,@AmericanAir you guys are killing me. http://t.co/22iPGeIcSm,americanair guy killing http
3299,negative,@united your airline is a joke. 1 person working special services at EWR?!? Line is 15 ppl deep. GROW UP!,united airline joke person working special service ewr line ppl deep grow
7583,negative,@JetBlue That's not what I heard. Weather was fine this morning. Flight 136 was circling for some time.Someone forgot to clean the runway.,jetblue thats heard weather fine morning flight circling forgot clean runway
12301,negative,@AmericanAir over 70 days no contact from a human or apology letter. @AmericanAir is #yucki. Read all about it soon http://t.co/9R9OmzQAVI,americanair day contact human apology letter americanair yucki read soon http
14575,negative,@AmericanAir don't you guys have an email address? Just put me on the next available flight from ohare,americanair dont guy email address put next available flight ohare
2063,negative,@united - you sure missed the mark on tonight's redeye from LAX to Chicago. What a mess! You can do better!,united sure missed mark tonight redeye lax chicago mess better
5942,negative,@SouthwestAir there is a good chance myself nor anyone else in this airport will ever be Flight Booking Problems southwest again..,southwestair good chance anyone else airport ever flight booking problem southwest
13780,negative,"@AmericanAir Hey you Cancelled Flightled my flight, and I can't get someone on the phone to rebook (2 hour wait minimum). Can I get some service?",americanair hey cancelled flightled flight cant get someone phone rebook hour wait minimum get service
2177,negative,@united Sandra at ur international checkin counter was rude and offensive. She commented she didn't care if I complained cuz she had 25yrs,united sandra ur international checkin counter rude offensive commented didnt care complained cuz


In [None]:
# df = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/Clarusway_NLP/Clarusway/clarusway-ds-students-7-21-main/3- Classes_Labs/NLP/NLP-1/airline_tweets.csv')
# #df = pd.read_csv("airline_tweets.csv")

# df = df[['airline_sentiment','text']]
# df.rename(columns={'airline_sentiment':'sentiment'}, inplace=True)

In [None]:
def updated_cleaning(data):
    """Clean one raw tweet: strip URLs, hashtags and mentions, then normalize.

    After the three regex removals the text goes through the same steps as
    ``cleaning``: apostrophes dropped and text lower-cased, tokenized with
    ``word_tokenize``, non-alphabetic tokens removed, ``stop_words`` removed,
    and the remainder lemmatized with ``WordNetLemmatizer``.

    Parameters
    ----------
    data : str
        Raw text of a single tweet.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" if none survive).
    """
    import re

    # One lemmatizer for the whole call instead of a new instance per token.
    lemmatizer = WordNetLemmatizer()

    #1. Removing URLs (everything from "http" up to the next whitespace)
    data = re.sub(r'http\S+', '', data)

    #2. Removing Tags (the '#' together with the tagged word)
    data = re.sub(r'#\w+', '', data)

    #3. Removing Mentions (the '@' together with the user name)
    data = re.sub(r'@\w+', '', data)

    #4. Contractions Expansion & Tokenize
    #text_tokens = word_tokenize(contractions.fix(data.lower()))
    text_tokens = word_tokenize(data.replace("'", '').lower())

    #5. Remove punctuation and numbers. A token starting with '@' is never purely
    #   alphabetic, so this filter also covers any leftover mention marker and
    #   no separate startswith('@') pass is needed.
    tokens_without_punc = [w for w in text_tokens if w.isalpha()]

    #6. Removing Stopwords
    tokens_without_sw = [t for t in tokens_without_punc if t not in stop_words]

    #7. Lemmatization
    text_cleaned = [lemmatizer.lemmatize(t) for t in tokens_without_sw]

    #joining
    return " ".join(text_cleaned)

In [None]:
df2["clean_text"] = df2["text"].apply(updated_cleaning)

In [None]:
df2[df2['clean_text'].str.contains('http')]
df2[df2['clean_text'].str.contains('#')]
df2[df2['clean_text'].str.contains('@')]

Unnamed: 0,sentiment,text,clean_text


Unnamed: 0,sentiment,text,clean_text


Unnamed: 0,sentiment,text,clean_text


In [None]:
df2.sample(10)

Unnamed: 0,sentiment,text,clean_text
5805,positive,@SouthwestAir thanks for adding straight flights from Columbus to Oakland!,thanks adding straight flight columbus oakland
13150,negative,@AmericanAir @EdPlotts don't bother trying to get anywhere with their customer service team either as take 2+ months and counting to reply,dont bother trying get anywhere customer service team either take month counting reply
5463,positive,@SouthwestAir @FortuneMagazine \ngreat news.,great news
1432,negative,@united quick (serious) question - any resources/ratings showing the quality of service is better than 8th place? http://t.co/deWIthPeW2,quick serious question showing quality service better place
13074,negative,"@AmericanAir No. Had to Cancelled Flight my trip. Instead of a $25 future trip voucher, a $25 drink coupon would've been better! #WakingInMemphis",cancelled flight trip instead future trip voucher drink coupon wouldve better
12512,negative,@AmericanAir worst experience of my life avoid at all costs they will lose your belongings and have no humanity to even offer compensation,worst experience life avoid cost lose belonging humanity even offer compensation
11837,negative,@USAirways to arrive the plane I'm sitting on needs to take off...wish someone would tell us what the holdup is,arrive plane im sitting need take wish someone would tell u holdup
7014,neutral,“@JetBlue: Our fleet's on fleek. http://t.co/pa7dCjXlzL”\n\nC'mon fam😭😭 just. No. Ok?,fleet fleek cmon ok
8177,negative,"@JetBlue the one man in baggage office says ""I dunno"". HELP",one man baggage office say dunno help
2153,negative,@united our 1 layover itinerary was swapped for a 3 layover itinerary - really?!?,layover itinerary swapped layover itinerary really


## CountVectorization

In [None]:
X = df2[["clean_text"]] # as a dataframe
y = df2["sentiment"]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 4299)

In [None]:
X_train.head()

Unnamed: 0,clean_text
3013,haha clean plane held overnight hangar sound lovely also dont lie screensand say weather
13868,let seriously
2391,hello flying first class behind people zone pls pas app dept board class first
6979,thanks much talking article chat came great
8721,claimed happy way treated


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
### DON'T FEED A DATAFRAME (deliberate counter-example)
# Iterating over a DataFrame yields its column labels, not its rows, so the
# vectorizer below is fitted on the single string 'clean_text'; the next cell
# shows the resulting one-entry vocabulary.

vectorizer1 = CountVectorizer()
X_train_count1 = vectorizer1.fit_transform(X_train) # INPUT: should be a list or pd.Series of strings, not a DataFrame
X_test_count1 = vectorizer1.transform(X_test['clean_text'])

In [None]:
# Inspect the vocabulary learned by the fit above and its size.
# `get_feature_names()` is deprecated (removed in newer scikit-learn releases);
# `get_feature_names_out()` is its replacement.
vectorizer1.get_feature_names_out()
len(vectorizer1.get_feature_names_out())

['clean_text']

1

In [None]:
vectorizer1 = CountVectorizer()
X_train_count1 = vectorizer1.fit_transform(X_train['clean_text']) # INPUT: should be a list or pd.Series
X_test_count1 = vectorizer1.transform(X_test['clean_text'])

In [None]:
# First 20 vocabulary entries (deprecated `get_feature_names()` replaced by `get_feature_names_out()`).
vectorizer1.get_feature_names_out()[:20]

['aa',
 'aaaand',
 'aadavantage',
 'aadv',
 'aadvantage',
 'aal',
 'aaron',
 'ab',
 'aback',
 'abandon',
 'abandoned',
 'abandonment',
 'abassinet',
 'abbreve',
 'abc',
 'abducted',
 'ability',
 'able',
 'aboard',
 'abounds']

In [None]:
df2['clean_text'].str.split()

0                                                                                                                     [said]
1                                                                        [plus, youve, added, commercial, experience, tacky]
2                                                                      [didnt, today, must, mean, need, take, another, trip]
3                                  [really, aggressive, blast, obnoxious, entertainment, guest, face, amp, little, recourse]
4                                                                                                  [really, big, bad, thing]
                                                                ...                                                         
14635                                                                               [thank, got, different, flight, chicago]
14637                                                                                     [please, bring, american, airline]


In [None]:
# Vocabulary size from a plain whitespace split of the training text ...
words = pd.Series(" ".join(X_train["clean_text"]).split()).value_counts()

len(words)
print('')
# ... versus the vocabulary learned by CountVectorizer, whose default
# token_pattern only keeps tokens of two or more characters. The cells below
# count the single-character words that account for the gap.
len(vectorizer1.get_feature_names_out())

7715




7696

In [None]:
# Single-character tokens in the whitespace-split vocabulary and in the
# CountVectorizer vocabulary respectively.
one_chars_from_words = [w for w in words.index if len(w)==1]
one_Chars_from_cv = [w for w in vectorizer1.get_feature_names_out() if len(w)==1]

In [None]:
len(one_chars_from_words)

19

In [None]:
vectorizer1.get_feature_names_out()[:20] # DEPRECATION: get_feature_names()

array(['aa', 'aaaand', 'aadavantage', 'aadv', 'aadvantage', 'aal',
       'aaron', 'ab', 'aback', 'abandon', 'abandoned', 'abandonment',
       'abassinet', 'abbreve', 'abc', 'abducted', 'ability', 'able',
       'aboard', 'abounds'], dtype=object)

#### min_df

In [None]:
X = df2.copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 4299)

In [None]:
#https://stackoverflow.com/questions/36572221/how-to-find-ngram-frequency-of-a-column-in-a-pandas-dataframe
# WHITE BOARD: https://whiteboard.office.com/me/whiteboards/13947786-4746-4a0e-ad23-4c720b363d5c
# The raw tweet text is passed in and cleaned by `updated_cleaning` via `preprocessor`.
# min_df=2 drops terms seen in fewer than two documents; ngram_range=(1,2) builds
# both unigrams and bigrams.
vectorizer2 = CountVectorizer(preprocessor=updated_cleaning, min_df=2, ngram_range=(1,2))
X_train_count2 = vectorizer2.fit_transform(X_train['text'])
X_test_count2 = vectorizer2.transform(X_test['text'])

# `get_feature_names_out()` replaces the deprecated `get_feature_names()`.
len(vectorizer2.get_feature_names_out())

13217

In [None]:
vectorizer2.get_feature_names_out()[:20]

array(['aa', 'aa agent', 'aa amp', 'aa customer', 'aa dallas',
       'aa doesnt', 'aa email', 'aa employee', 'aa family', 'aa flight',
       'aa gate', 'aa gold', 'aa help', 'aa mile', 'aa monday',
       'aa number', 'aa platinum', 'aa possible', 'aa usair', 'aa would'],
      dtype=object)

In [None]:
X_train_count2.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Document-term counts as a labelled frame (one column per vocabulary entry).
# NOTE: `.toarray()` materialises the full dense matrix, which is memory-hungry.
count_df = pd.DataFrame(X_train_count2.toarray(), columns = vectorizer2.get_feature_names_out())
count_df.iloc[:,100:130]

Unnamed: 0,ad,adam,add,add aa,add child,add companion,add fee,add flight,add grr,add insult,add ktn,add lap,add new,add passbook,added,adding,adding flight,addition,additional,additional fee,address,address issue,address please,address send,addressed,addressing,addtl,adjacent,adjustment,admiral
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11707,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
11708,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
11709,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
11710,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# FIRST 20 TOKENS (in vocabulary order, not by frequency) WITH THEIR TOTAL TRAINING COUNTS
# Column sums are taken on the sparse matrices directly, so nothing is densified.
print('Before min_df')
list(zip(vectorizer1.get_feature_names_out(), np.asarray(X_train_count1.sum(axis=0)).ravel()))[:20]
print('\n')
print('After min_df')
list(zip(vectorizer2.get_feature_names_out(), np.asarray(X_train_count2.sum(axis=0)).ravel()))[:20]

Before min_df


[('aa', 167),
 ('aaaand', 1),
 ('aadavantage', 1),
 ('aadv', 2),
 ('aadvantage', 9),
 ('aal', 1),
 ('aaron', 1),
 ('ab', 1),
 ('aback', 1),
 ('abandon', 1),
 ('abandoned', 1),
 ('abandonment', 1),
 ('abassinet', 1),
 ('abbreve', 1),
 ('abc', 6),
 ('abducted', 1),
 ('ability', 4),
 ('able', 93),
 ('aboard', 3),
 ('abounds', 1)]



After min_df


[('aa', 167),
 ('aa agent', 3),
 ('aa amp', 4),
 ('aa customer', 4),
 ('aa dallas', 2),
 ('aa doesnt', 2),
 ('aa email', 2),
 ('aa employee', 4),
 ('aa family', 2),
 ('aa flight', 5),
 ('aa gate', 2),
 ('aa gold', 2),
 ('aa help', 3),
 ('aa mile', 2),
 ('aa monday', 2),
 ('aa number', 2),
 ('aa platinum', 2),
 ('aa possible', 2),
 ('aa usair', 2),
 ('aa would', 2)]

In [None]:
X_train.loc[2, :]

clean_text    didnt today must mean need take another trip
Name: 2, dtype: object

## TF-IDF

🔑 : sklearn TF-IDF
https://towardsdatascience.com/how-sklearns-tf-idf-is-different-from-the-standard-tf-idf-275fa582e73d

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Learn the vocabulary and IDF weights from the training text only, then
# reuse the fitted vectorizer to project the test text into the same space.
tf_idf_vectorizer = TfidfVectorizer()
X_train_tf_idf, X_test_tf_idf = (
    tf_idf_vectorizer.fit_transform(X_train['clean_text']),  # evaluated first: fits
    tf_idf_vectorizer.transform(X_test['clean_text']),       # evaluated second: no refit
)

In [None]:
# First 25 vocabulary terms learned by the TF-IDF vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() returns an ndarray, so convert the slice to a list
# to keep the plain-list display.
tf_idf_vectorizer.get_feature_names_out()[:25].tolist()

['aa',
 'aaaand',
 'aadavantage',
 'aadv',
 'aadvantage',
 'aal',
 'aaron',
 'ab',
 'aback',
 'abandon',
 'abandoned',
 'abandonment',
 'abassinet',
 'abbreve',
 'abc',
 'abducted',
 'ability',
 'able',
 'aboard',
 'abounds',
 'abq',
 'abroad',
 'absolute',
 'absolutely',
 'absorb']

In [None]:
# Peek at the first few rows only: densifying the whole matrix allocates a
# float for every document x vocabulary cell just to print a truncated repr.
X_train_tf_idf[:5].toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# TF-IDF weights of the first 10 training documents, one column per term.
# Slice the sparse matrix BEFORE densifying so only 10 rows are materialised,
# and use get_feature_names_out() (get_feature_names() was removed in
# scikit-learn 1.2).
pd.DataFrame(
    X_train_tf_idf[:10].toarray(),
    columns=tf_idf_vectorizer.get_feature_names_out(),
)

Unnamed: 0,aa,aaaand,aadavantage,aadv,aadvantage,aal,aaron,ab,aback,abandon,abandoned,abandonment,abassinet,abbreve,abc,abducted,ability,able,aboard,abounds,abq,abroad,absolute,absolutely,absorb,absorber,absoulutely,absurd,absurdity,abt,abundance,abuse,abused,abysmal,ac,acc,accelerate,accept,acceptable,accepted,...,yest,yesterday,yet,yield,yikes,yo,yoga,yogurt,york,youd,youll,young,younger,youre,yout,youth,youve,yow,ypu,yr,ystday,ystrdy,yuck,yuma,yummy,yup,yvonne,yvr,yxu,yyc,yyz,zabsonre,zambia,zero,zip,zipper,zone,zoom,zukes,zurich
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.312499,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.350284,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Cleaned text of the training document whose index label is 5
# (label-based lookup, all columns).
X_train.loc[5, :]

clean_text    seriously would pay flight seat didnt playing really bad thing flying va
Name: 5, dtype: object

In [None]:
# 30 highest TF-IDF weights for the training document at row position 1.
# Only that single sparse row is densified instead of the whole matrix, and
# get_feature_names_out() replaces get_feature_names() (removed in
# scikit-learn 1.2).
# NOTE(review): row 1 here is positional (the second row of the matrix),
# whereas X_train.loc[5] is a label-based lookup - these appear to be
# different documents; confirm which one this cell is meant to inspect.
pd.Series(
    X_train_tf_idf[1].toarray().ravel(),
    index=tf_idf_vectorizer.get_feature_names_out(),
    name=1,
).sort_values(ascending=False).head(30)

seriously       0.774457
let             0.632627
fit             0.000000
finish          0.000000
finnair         0.000000
fire            0.000000
fired           0.000000
firefighter     0.000000
firefox         0.000000
firing          0.000000
firm            0.000000
first           0.000000
fiscal          0.000000
fist            0.000000
fistfight       0.000000
fitz            0.000000
finest          0.000000
five            0.000000
fix             0.000000
fixed           0.000000
fixing          0.000000
fjbfsc          0.000000
fl              0.000000
flagging        0.000000
flailing        0.000000
flamethrower    0.000000
flat            0.000000
flattering      0.000000
flavor          0.000000
finger          0.000000
Name: 1, dtype: float64