# Challenge 1

--> Install and test NLTK

In [1]:
import nltk

In [2]:
from nltk.corpus import brown
nltk.download('brown')

[nltk_data] Downloading package brown to /home/paula/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [3]:
print(brown.words()[0:10])

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of']


In [4]:
print(brown.tagged_words()[0:10])

[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN')]


In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/paula/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
text = 'Ironhack is a Global Tech School ranked num 2 worldwide. Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do. This ideology is reflected in our teaching practices, which consist of a nine-weeks immersive programming, UX/UI design or Data Analytics course as well as a one-week hiring fair aimed at helping our students change their career and get a job straight after the course. We are present in 8 countries and have campuses in 9 locations - Madrid, Barcelona, Miami, Paris, Mexico City,  Berlin, Amsterdam, Sao Paulo and Lisbon.'

from nltk import sent_tokenize, word_tokenize

print(sent_tokenize(text))

['Ironhack is a Global Tech School ranked num 2 worldwide.', 'Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do.', 'This ideology is reflected in our teaching practices, which consist of a nine-weeks immersive programming, UX/UI design or Data Analytics course as well as a one-week hiring fair aimed at helping our students change their career and get a job straight after the course.', 'We are present in 8 countries and have campuses in 9 locations - Madrid, Barcelona, Miami, Paris, Mexico City,  Berlin, Amsterdam, Sao Paulo and Lisbon.']


In [7]:
print(word_tokenize(text))

['Ironhack', 'is', 'a', 'Global', 'Tech', 'School', 'ranked', 'num', '2', 'worldwide', '.', 'Our', 'mission', 'is', 'to', 'help', 'people', 'transform', 'their', 'careers', 'and', 'join', 'a', 'thriving', 'community', 'of', 'tech', 'professionals', 'that', 'love', 'what', 'they', 'do', '.', 'This', 'ideology', 'is', 'reflected', 'in', 'our', 'teaching', 'practices', ',', 'which', 'consist', 'of', 'a', 'nine-weeks', 'immersive', 'programming', ',', 'UX/UI', 'design', 'or', 'Data', 'Analytics', 'course', 'as', 'well', 'as', 'a', 'one-week', 'hiring', 'fair', 'aimed', 'at', 'helping', 'our', 'students', 'change', 'their', 'career', 'and', 'get', 'a', 'job', 'straight', 'after', 'the', 'course', '.', 'We', 'are', 'present', 'in', '8', 'countries', 'and', 'have', 'campuses', 'in', '9', 'locations', '-', 'Madrid', ',', 'Barcelona', ',', 'Miami', ',', 'Paris', ',', 'Mexico', 'City', ',', 'Berlin', ',', 'Amsterdam', ',', 'Sao', 'Paulo', 'and', 'Lisbon', '.']


# Challenge 2

--> Define functions: text cleaning, tokenization, stemming, lemmatization, and stop words removal

In [8]:
# TEXT CLEANING: special characters, numbers, and URLs

# INPUT: @Ironhack's-#Q website 776-is http://ironhack.com [(2018)]")
# OUTPUT: ironhack s  q website  is

import re

def clean_up(s):
    s = re.sub('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', '', s)
    s = s.lower()
    s = re.sub('[^a-z]+', ' ', s)
    return s.strip()

In [9]:
s = "@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\")"

clean_up(s)

'ironhack s q website is'

In [10]:
# TOKENIZATION

def tokenize(s):
    s = word_tokenize(s)
    return s

In [11]:
s = clean_up(s)

tokenize(s)

['ironhack', 's', 'q', 'website', 'is']

In [12]:
#STEMMING AND LEMMATIZATION

from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

def stem_and_lemmatize(l):
    new_l = []
    ps = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    for word in l:
        word = lemmatizer.lemmatize(ps.stem(word))
        new_l.append(word)
    return new_l     

[nltk_data] Downloading package wordnet to /home/paula/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
l= tokenize(s)

stem_and_lemmatize(l)

['ironhack', 's', 'q', 'websit', 'is']

In [14]:
# STOP WORDS REMOVAL
from nltk.corpus import stopwords
nltk.download('stopwords')

def remove_stopwords(l):
    stop = set(stopwords.words('english'))
    for word in l:
        if word in stop:
            l.remove(word)
    return l

[nltk_data] Downloading package stopwords to /home/paula/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
l = stem_and_lemmatize(l)

remove_stopwords(l)

['ironhack', 'q', 'websit']

# Challenge 3

--> Sentiment Analysis

In [16]:
# TRY PACKAGE

from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
txt = "Ironhack is a Global Tech School ranked num 2 worldwide. Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do."
analyzer = SentimentIntensityAnalyzer()
analyzer.polarity_scores(txt)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/paula/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


{'compound': 0.8442, 'neg': 0.0, 'neu': 0.741, 'pos': 0.259}

In [19]:
# Loading and Exploring Data
import pandas as pd

data= pd.read_csv('Sentiment140.csv')

data_sample = data.sample(n=20000, random_state=1)

In [22]:
display(data.shape)
display(data_sample.shape)
display(data_sample.dtypes)
display(data_sample.head())

(1600000, 6)

(20000, 6)

target     int64
id         int64
date      object
flag      object
user      object
text      object
dtype: object

Unnamed: 0,target,id,date,flag,user,text
514293,0,2190584004,Tue Jun 16 03:08:48 PDT 2009,NO_QUERY,Vicki_Gee,i miss nikki nu nu already shes always there ...
142282,0,1881451988,Fri May 22 04:42:15 PDT 2009,NO_QUERY,PatCashin,So I had a dream last night. I remember a sig...
403727,0,2058252964,Sat Jun 06 14:34:17 PDT 2009,NO_QUERY,deelectable,@girlyghost ohh poor sickly you (((hugs)) ho...
649503,0,2237307600,Fri Jun 19 05:34:22 PDT 2009,NO_QUERY,justinekepa,it is raining again
610789,0,2224301193,Thu Jun 18 09:20:06 PDT 2009,NO_QUERY,cmatt007,@MissKeriBaby wish I was in LA right now


In [24]:
# Prepare Textual Data for Sentiment Analysis

def clean_f(x): 
    functions = [clean_up, tokenize, stem_and_lemmatize, remove_stopwords]
    for f in functions: 
        x = f(x)
    return x
    
data_sample['text_processed']=data_sample.text.apply(clean_f)

data_sample.head()

Unnamed: 0,target,id,date,flag,user,text,text_processed
514293,0,2190584004,Tue Jun 16 03:08:48 PDT 2009,NO_QUERY,Vicki_Gee,i miss nikki nu nu already shes always there ...,"[miss, nikki, nu, nu, alreadi, alway, when, ne..."
142282,0,1881451988,Fri May 22 04:42:15 PDT 2009,NO_QUERY,PatCashin,So I had a dream last night. I remember a sig...,"[dream, last, night, rememb, sign, clearli, to..."
403727,0,2058252964,Sat Jun 06 14:34:17 PDT 2009,NO_QUERY,deelectable,@girlyghost ohh poor sickly you (((hugs)) ho...,"[girlyghost, ohh, poor, sickli, hug, hope, fee..."
649503,0,2237307600,Fri Jun 19 05:34:22 PDT 2009,NO_QUERY,justinekepa,it is raining again,"[is, rain]"
610789,0,2224301193,Thu Jun 18 09:20:06 PDT 2009,NO_QUERY,cmatt007,@MissKeriBaby wish I was in LA right now,"[misskeribabi, wish, wa, la, right]"


In [35]:
# Creating Bag of Words: top 5,000 words
from nltk.probability import FreqDist

words_list = []

for row in data_sample['text_processed']:
    for word in row:
        words_list.append(word)
    
fdist = FreqDist(words_list)
voc = fdist.most_common(5000)
top = [x[0] for x in voc]

print(top)  # las primeras 9 palabras deberia haberlas eliminado alguno de los filtros anteiores??

['the', 'i', 'a', 'go', 'my', 'get', 's', 'm', 'wa', 'day', 't', 'thi', 'good', 'like', 'love', 'you', 'work', 'to', 'it', 'have', 'got', 'u', 'quot', 'time', 'today', 'miss', 'want', 'lol', 'be', 'back', 'thank', 'one', 'realli', 'know', 'im', 'think', 'amp', 'see', 'feel', 'watch', 'need', 'still', 'well', 'night', 'make', 'hope', 'oh', 'can', 'home', 'look', 'new', 'na', 'ha', 'is', 'me', 'that', 'come', 'twitter', 'much', 'do', 'just', 'last', 'in', 'am', 'not', 'so', 'wish', 'morn', 'great', 'wait', 'and', 'are', 'll', 'all', 'sad', 'tomorrow', 'would', 'haha', 'sleep', 'right', 'whi', 'fun', 'thing', 'follow', 'tonight', 'onli', 'happi', 'friend', 'on', 'week', 'nice', 'tri', 'bad', 'veri', 'sorri', 'hi', 'don', 'take', 'say', 'for', 'way', 'better', 'school', 'now', 'had', 'could', 're', 'yeah', 'hate', 'will', 'bed', 'peopl', 'start', 'tweet', 'gon', 'your', 'of', 'though', 'hour', 'show', 'even', 'guy', 've', 'weekend', 'play', 'too', 'everyon', 'let', 'littl', 'final', 'lt', 

In [37]:
# Building Features

def find_features(document):
    words = set(document)
    features = {}
    for w in top:
        features[w] = (w in words)
    s = SentimentIntensityAnalyzer().polarity_scores(" ".join(document))
    if s["pos"] > 0.2:
        s = True
    else:
        s = False
    return (features, s)

feature = list(data_sample.text_processed.apply(find_features))

In [39]:
# Building and Traininng Naive Bayes Model

from nltk.corpus import names

train_set, test_set = feature[500:], feature[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

classifier.show_most_informative_features()

Most Informative Features
                    love = True             True : False  =     50.4 : 1.0
                      xd = True             True : False  =     48.3 : 1.0
                   great = True             True : False  =     45.4 : 1.0
                    best = True             True : False  =     41.3 : 1.0
                     wow = True             True : False  =     33.6 : 1.0
                    xoxo = True             True : False  =     33.4 : 1.0
                   super = True             True : False  =     32.0 : 1.0
                   proud = True             True : False  =     27.4 : 1.0
                   bless = True             True : False  =     25.4 : 1.0
                     woo = True             True : False  =     25.4 : 1.0


In [41]:
print ('The accuracy of the model Naive Bayes is: {}'.format(nltk.classify.accuracy(classifier, test_set)))

The accuracy of the model Naive Bayes is: 0.856
