# Imports

In [1]:
import re
import numpy as np
import pandas as pd
from nltk.tokenize import TweetTokenizer

%run -i ../functions.py

Imports and functions loaded.


# Task 1
Build a tokanizer that works like the nltk `TweetTokenizer`.

## Regex library
To make things a bit easier

In [2]:
# imported from: https://www.nlpnotebook.com/python/tokenization/nlp/normalization/project/2020/01/06/lets-make-a-tokenizer-part-3.html 
ALPHA = '[A-Z]+'
DIGITS = '[0-9]'
BOS = '^'
EOS = '$'
PLUS = '+'
STAR = "*"
PERIOD = r'\.'
INITIAL_PUNCTUATION = '[\'"]'
FINAL_PUNCTUATION = '[\',!?":.]'
CURRENCY_SYMBOL = '[$£¥€]'
QUESTION_MARK = '?'

In [3]:
# defined by me
NOTALPHA = r'[^a-zA-Z\s]' # also excludes spaces
HASHTAGS = r'#\w+'
WORD = r'\w+'

## Open file

In [4]:
with open('../datasets/sentiment/train_text.txt', 'r') as f:
    tweets = [line.strip() for line in f]

## Functions

In [5]:
def find_hashtags(string):
    return re.findall(HASHTAGS, string)

def find_nonwords(string):
    return re.findall(NOTALPHA, string)

In [6]:
# the simplest tokenizer possible - split only by space
def split_space(string):
    return string.split(' ')

## Tokenizer

In [7]:
tweet_split = []
for tweet in tweets:
    tweet_split.append(split_space(tweet))

In [8]:
tweets[0]

'"QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin"'

In [129]:
def hashtag_only_tokenizer(listofstrings):
    tokens = []
    unmatchables = []

    for string in listofstrings:
        hashtags = re.findall(r'#(\w+)', string)
        tokens.append(hashtags)

        not_hashtags = []
        all_words = re.findall(r'\w+', string)
        for word in all_words:
            if word not in hashtags:
                not_hashtags.append(word)
        unmatchables.append(not_hashtags)

    return (tokens, unmatchables)

    # /^((?!#(\w+)).)*$/

In [130]:
test_hashtag = hashtag_only_tokenizer(tweets)
print(test_hashtag[0][0])
print(test_hashtag[1][0])
print(tweets[0])

['HappyBirthdayRemusLupin']
['QT', 'user', 'In', 'the', 'original', 'draft', 'of', 'the', '7th', 'book', 'Remus', 'Lupin', 'survived', 'the', 'Battle', 'of', 'Hogwarts']
"QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin"


In [11]:
def nonword_only_tokenizer(listofstrings):
    tokens = []
    unmatchables = []

    for string in listofstrings:
        tokens.append(re.findall(r'[^a-zA-Z\s]', string))
        unmatchables.append(re.findall(r'[\w]+', string))

    return (tokens, unmatchables)

In [12]:
test_nonword = nonword_only_tokenizer(tweets)
print(test_nonword[0][0])
print(test_nonword[1][0])

['"', '@', '7', ',', '.', '#', '"']
['QT', 'user', 'In', 'the', 'original', 'draft', 'of', 'the', '7th', 'book', 'Remus', 'Lupin', 'survived', 'the', 'Battle', 'of', 'Hogwarts', 'HappyBirthdayRemusLupin']


### Testing functions

In [13]:
# find all hashtags
print(find_hashtags(tweets[0]))

['#HappyBirthdayRemusLupin']


In [14]:
find_nonwords(tweets[0])

['"', '@', '7', ',', '.', '#', '"']

In [15]:
# baseline tokenizer
TweetTokenizer().tokenize(tweets[0])

['"',
 'QT',
 '@user',
 'In',
 'the',
 'original',
 'draft',
 'of',
 'the',
 '7th',
 'book',
 ',',
 'Remus',
 'Lupin',
 'survived',
 'the',
 'Battle',
 'of',
 'Hogwarts',
 '.',
 '#HappyBirthdayRemusLupin',
 '"']

## Working with dataframes

In [16]:
sentiment_train = pd.read_csv("../datasets/sentiment/train_text.txt", sep="\t", names=["tweets"], quoting=3)

sentiment_test = pd.read_csv("../datasets/sentiment/test_text.txt", sep="\t", names=["tweets"], quoting=3)

sentiment_val = pd.read_csv("../datasets/sentiment/val_text.txt", sep="\t", names=["tweets"], quoting=3)

In [17]:
# https://www.geeksforgeeks.org/python-pandas-series-str-extractall/
# https://stackoverflow.com/questions/42379389/finding-all-regex-matches-from-a-pandas-dataframe-column

sentiment_train.tweets.str.extract(r'([^a-zA-Z\s])', expand=False)

0        "
1        "
2        .
3        '
4        @
        ..
45610    @
45611    9
45612    1
45613    @
45614    (
Name: tweets, Length: 45615, dtype: object

In [18]:
type(sentiment_train['tweets'])

pandas.core.series.Series

In [19]:
# the fuction still works with dataframes, just feed in the column as series
print(type(sentiment_train['tweets']))
nonword_only_tokenizer(sentiment_train['tweets'])[0][0]

<class 'pandas.core.series.Series'>


['"', '@', '7', ',', '.', '#', '"']

In [20]:
# Rasmus function
def tokenize_ideal(line):
    tokens = []
    unmatchables = []
    
    for word in line.split():
        if re.findall(r"\w+-\w+|\w+|[.&?%!#…]", word) != []:
            x = re.findall(r"\w+-\w+|\w+|[.&?%!#…]+", word)
            for element in x:
                tokens.append(element)

        if re.findall(r"\w+-\w+|\w+|[.&?%!#…]", word) != [word] and re.findall(r"[^\w|.&!?%#…]+", word) != []:
            unmatchables.append(re.findall(r"[^\w|.!#?%…&]+", word)[0])


    return (tokens, unmatchables)

## Difference between tokenizers

In [21]:
import difflib # how to use this to compare 2 lists??

In [22]:
sentiment_train.tweets[0]

'"QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin" '

In [23]:
baseline = TweetTokenizer().tokenize(tweets[0])
ideal_test = tokenize_ideal(sentiment_train.tweets[0])

In [24]:
# diff = difflib.Differ().compare(baseline, ideal_test)
# print('\n'.join(diff))

In [25]:
baseline

['"',
 'QT',
 '@user',
 'In',
 'the',
 'original',
 'draft',
 'of',
 'the',
 '7th',
 'book',
 ',',
 'Remus',
 'Lupin',
 'survived',
 'the',
 'Battle',
 'of',
 'Hogwarts',
 '.',
 '#HappyBirthdayRemusLupin',
 '"']

In [26]:
ideal_test[0]

['QT',
 'user',
 'In',
 'the',
 'original',
 'draft',
 'of',
 'the',
 '7th',
 'book',
 'Remus',
 'Lupin',
 'survived',
 'the',
 'Battle',
 'of',
 'Hogwarts',
 '.',
 '#',
 'HappyBirthdayRemusLupin']

# Exercise 2

In [27]:
tokenized_train = []

for tweet in sentiment_train["tweets"]:
    tokenized_train.append(tokenize_ideal(tweet)[0])

In [28]:
tokenized_test = []

for tweet in sentiment_test["tweets"]:
    tokenized_test.append(tokenize_ideal(tweet)[0])

In [29]:
tokenized_val = []

for tweet in sentiment_val["tweets"]:
    tokenized_val.append(tokenize_ideal(tweet)[0])

In [30]:
text = tokenized_train 
# test = tokenized_test

The following was done as per: http://www.nltk.org/api/nltk.lm.html#module-nltk.lm

In [31]:
from nltk.util import bigrams # to train a bigram model
from nltk.util import pad_sequence # pad sentences to know where each sentence beging and end, especially as they will be flattened later
from nltk.lm.preprocessing import pad_both_ends # not sure how this differs from the above
from nltk.util import everygrams
from nltk.lm.preprocessing import flatten # instead of separating each sentence by having them in lists, the dataset is flattened with <s> and </s> indicating the start end end of sentences

# pre processing
from nltk.lm.preprocessing import padded_everygram_pipeline # generates train data and vocabulary automatically

# train models
from nltk.lm import MLE # train a Maximum Likelihood Estimator


In [32]:
list(bigrams(text[0]));

In [33]:
list(pad_sequence(text[0], 
    pad_left=True,
    left_pad_symbol="<s>",
    pad_right=True,
    right_pad_symbol="</s>",
    n=2))

['<s>',
 'QT',
 'user',
 'In',
 'the',
 'original',
 'draft',
 'of',
 'the',
 '7th',
 'book',
 'Remus',
 'Lupin',
 'survived',
 'the',
 'Battle',
 'of',
 'Hogwarts',
 '.',
 '#',
 'HappyBirthdayRemusLupin',
 '</s>']

In [34]:
list(pad_both_ends(text[0], n=2))

['<s>',
 'QT',
 'user',
 'In',
 'the',
 'original',
 'draft',
 'of',
 'the',
 '7th',
 'book',
 'Remus',
 'Lupin',
 'survived',
 'the',
 'Battle',
 'of',
 'Hogwarts',
 '.',
 '#',
 'HappyBirthdayRemusLupin',
 '</s>']

In [35]:
list(bigrams(pad_both_ends(text[0], n=2)))

[('<s>', 'QT'),
 ('QT', 'user'),
 ('user', 'In'),
 ('In', 'the'),
 ('the', 'original'),
 ('original', 'draft'),
 ('draft', 'of'),
 ('of', 'the'),
 ('the', '7th'),
 ('7th', 'book'),
 ('book', 'Remus'),
 ('Remus', 'Lupin'),
 ('Lupin', 'survived'),
 ('survived', 'the'),
 ('the', 'Battle'),
 ('Battle', 'of'),
 ('of', 'Hogwarts'),
 ('Hogwarts', '.'),
 ('.', '#'),
 ('#', 'HappyBirthdayRemusLupin'),
 ('HappyBirthdayRemusLupin', '</s>')]

In [36]:
padded_bigrams = list(pad_both_ends(text[0], n=2))
list(everygrams(padded_bigrams, max_len=2))

[('<s>',),
 ('QT',),
 ('user',),
 ('In',),
 ('the',),
 ('original',),
 ('draft',),
 ('of',),
 ('the',),
 ('7th',),
 ('book',),
 ('Remus',),
 ('Lupin',),
 ('survived',),
 ('the',),
 ('Battle',),
 ('of',),
 ('Hogwarts',),
 ('.',),
 ('#',),
 ('HappyBirthdayRemusLupin',),
 ('</s>',),
 ('<s>', 'QT'),
 ('QT', 'user'),
 ('user', 'In'),
 ('In', 'the'),
 ('the', 'original'),
 ('original', 'draft'),
 ('draft', 'of'),
 ('of', 'the'),
 ('the', '7th'),
 ('7th', 'book'),
 ('book', 'Remus'),
 ('Remus', 'Lupin'),
 ('Lupin', 'survived'),
 ('survived', 'the'),
 ('the', 'Battle'),
 ('Battle', 'of'),
 ('of', 'Hogwarts'),
 ('Hogwarts', '.'),
 ('.', '#'),
 ('#', 'HappyBirthdayRemusLupin'),
 ('HappyBirthdayRemusLupin', '</s>')]

In [37]:
list(flatten(pad_both_ends(sent, n=2) for sent in text));

In [38]:
# In most cases we want to use the same text as the source for both vocabulary and ngram counts. Now that we understand what this means for our preprocessing, we can simply import a function that does everything for us.

train, vocab = padded_everygram_pipeline(2, text)

In [39]:
# Having prepared our data we are ready to start training a model. As a simple example, let us train a Maximum Likelihood Estimator (MLE). We only need to specify the highest ngram order to instantiate it.
lm = MLE(2) 
# This automatically creates an empty vocabulary
len(lm.vocab)

0

In [40]:
# fit the model, whatever that means
lm.fit(train, vocab)
print(lm.vocab) 
#The vocabulary helps us handle words that have not occurred during training.
len(lm.vocab)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 57964 items>


57964

In [41]:
print(lm.counts)

<NgramCounter with 2 ngram orders and 2096573 ngrams>


In [42]:
lm.counts['Remus']

7

In [43]:
lm.counts[['#']]['Happy']

3

In [44]:
lm.score('the')

0.03255269845597118

In [45]:
lm.score("<UNK>") == lm.score("jdfhasjdhaskj")
# True -> No occurences of "jdfhasjdhaskj" in the dataset

True

In [46]:
# get the score for a word given some preceding context. For example we want to know what is the chance that “t” is preceded by “h”.
lm.score("t", ["h"])

0.0625

In [47]:
# to avoid underflow when working with small score values, use logscore method
print(lm.logscore("Remus"))
print(lm.logscore("the"))

-17.223298744892467
-4.941079049219879


In [48]:
# test[2]

In [49]:
# lm.score(test[2])

In [50]:
lm.generate(5)

['please', '.', '#', 'Broncos', 'Peyton']

In [51]:
test = list(bigrams(text[0]))
lm.entropy(test) # wtf is this

5.609879131064775

In [52]:
lm.perplexity(test) # wtf is this

48.8362030201234

## Counting ngrams
Whatever this is

In [53]:
from nltk.util import ngrams
text_bigrams = [ngrams(sent, 2) for sent in text]
text_unigrams = [ngrams(sent, 1) for sent in text]

In [54]:
from nltk.lm import NgramCounter
ngram_counts = NgramCounter(text_bigrams + text_unigrams)

In [55]:
ngram_counts['the']

34867

In [56]:
ngram_counts['Remus']

7

In [57]:
mle_perplexity(sentiment_train['tweets'], 2)

[48.8362030201234,
 110.80707253157334,
 84.28509337005167,
 45.91584481487316,
 109.0441935906818,
 45.08854905757948]

## Smoothing
Need to figure out how to do Kneser-Ney smoothing.  
Not sure what the input is for FreqDist and KneserNeyProbDist.

In [58]:
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

In [59]:
alist = tweet_to_list('../datasets/irony/train_text.txt')

In [60]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sabri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [99]:
token = tokenize_ideal(alist[0])
token0 = list(ngrams(token[0], 3))
token0

[('seeing', 'ppl', 'walking'),
 ('ppl', 'walking', 'w'),
 ('walking', 'w', 'crutches'),
 ('w', 'crutches', 'makes'),
 ('crutches', 'makes', 'me'),
 ('makes', 'me', 'really'),
 ('me', 'really', 'excited'),
 ('really', 'excited', 'for'),
 ('excited', 'for', 'the'),
 ('for', 'the', 'next'),
 ('the', 'next', '3'),
 ('next', '3', 'weeks'),
 ('3', 'weeks', 'of'),
 ('weeks', 'of', 'my'),
 ('of', 'my', 'life')]

In [107]:
tokens_sent = [tokenize_ideal(item) for item in alist]
trigrams = [ngrams(sent, 3) for sent in tokens_sent]
fdist = FreqDist([item for l in trigrams for item in l])
# fdists = [FreqDist(token) for token in tokens]

In [101]:
fdist

FreqDist({})

In [64]:
kneser_ney = nltk.KneserNeyProbDist(fdist)

In [65]:
kneser_ney.prob(('I', 'can', 't'))

0.0

# Exercise 3

In [66]:
from  nltk.metrics import agreement

In [67]:
og = pd.read_csv("../datasets/annotations/iaa_labels.txt", names=["og"])
sab = pd.read_csv("../datasets/annotations/annotations_sabrina.csv", names=["sab"])
ida = pd.read_csv("../datasets/annotations/annotations_ida.csv", names=["ida"])
rub = pd.read_csv("../datasets/annotations/annotations_ruben.csv", names=["rub"])
ras = pd.read_csv("../datasets/annotations/annotations_rasmus.csv", names=["ras"])
mag = pd.read_csv("../datasets/annotations/annotations_magnus.csv", names=["mag"])

In [68]:
# create a dictonary
us = [sab, ida, rub, ras, mag]
og = [og, sab, ida, rub, ras, mag]

us_df = pd.concat(us, axis = 1)
og_df = pd.concat(og, axis = 1)

In [69]:
us_list = ['sab', 'ida', 'rub', 'ras', 'mag']
og_list = ['og', 'sab', 'ida', 'rub', 'ras', 'mag']

In [70]:
def annotation_task(annotators_list, df):
    ls = []
    for annotator in annotators_list:
        for i in range(len(df)):
            ls.append((annotator, i, df[annotator][i]))
    return ls

In [71]:
us_data = annotation_task(us_list, us_df)
og_data = annotation_task(og_list, og_df)

In [72]:
us_task = agreement.AnnotationTask(data=us_data)
print("Average observed agreement:", us_task.avg_Ao())
print("Scott's Pi:", us_task.pi())
print("Cohen's Kappa:", us_task.kappa())
print("Fleiss's Kappa:", us_task.multi_kappa())
print("Krippendorf's Alpha:", us_task.alpha())

Average observed agreement: 0.6016666666666668
Scott's Pi: 0.19416471857579604
Cohen's Kappa: 0.2445540955840891
Fleiss's Kappa: 0.2251999135508971
Krippendorf's Alpha: 0.19550777737816938


In [73]:
og_task = agreement.AnnotationTask(data=og_data)
print("Average observed agreement:", og_task.avg_Ao())
print("Scott's Pi:", og_task.pi())
print("Cohen's Kappa:", og_task.kappa())
print("Fleiss's Kappa:", og_task.multi_kappa())
print("Krippendorf's Alpha:", og_task.alpha())

Average observed agreement: 0.6327777777777777
Scott's Pi: 0.25771459319509304
Cohen's Kappa: 0.29228296799333725
Fleiss's Kappa: 0.27701618783724646
Krippendorf's Alpha: 0.2587455451489895


In [98]:
og_task

<nltk.metrics.agreement.AnnotationTask at 0x28e27a82d60>

# Exercise 4

In [74]:
irony_train = pd.read_csv("../datasets/irony/train_text.txt", sep="\t", names=["tweets"], quoting=3)
irony_test = pd.read_csv("../datasets/irony/test_text.txt", sep="\t", names=["tweets"], quoting=3)
irony_val = pd.read_csv("../datasets/irony/val_text.txt", sep="\t", names=["tweets"], quoting=3)

irony_train_labels = pd.read_csv("../datasets/irony/train_labels.txt", sep="\t", names=["tweets"], quoting=3).values.flatten()
irony_test_labels = pd.read_csv("../datasets/irony/test_labels.txt", sep="\t", names=["tweets"], quoting=3).values.flatten()
irony_val_labels = pd.read_csv("../datasets/irony/val_labels.txt", sep="\t", names=["tweets"], quoting=3).values.flatten()

In [75]:
len(irony_train_labels)

2862

In [76]:
sentences_train_irony = []

for tweet in irony_train["tweets"]:
    sentences_train_irony.append(tweet)

In [77]:
sentences_test_irony = []

for tweet in irony_test["tweets"]:
    sentences_test_irony.append(tweet)

In [78]:
print(len(sentences_train_irony))
print(len(sentences_test_irony))

2862
784


In [79]:
# https://www.mygreatlearning.com/blog/bag-of-words/

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

CountVec = CountVectorizer(ngram_range=(1,1), # to use bigrams ngram_range=(2,2)
                           stop_words='english')
#transform
Count_data = CountVec.fit_transform(sentences_train_irony)
 
#create dataframe
cv_dataframe = pd.DataFrame(Count_data.toarray(), columns = CountVec.get_feature_names())

In [80]:
cv_dataframe

Unnamed: 0,00,000,01273276865,034i,07,0_0,10,100,1000,1000sms,...,zebras,zen,zero,zimbabwe,zuckerberg,zzzz,ëœå,งวย,你就算5隔格我都知你讲me,ｆｏｌｌｏｗ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2857,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2858,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2859,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2860,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [81]:
# https://www.mygreatlearning.com/blog/bag-of-words/

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# without smooth IDF
# define tf-idf
tf_idf_vec = TfidfVectorizer(use_idf=True, 
                        smooth_idf=False,  
                        ngram_range=(1,1),stop_words='english') # to use only  bigrams ngram_range=(2,2)
# transform
tf_idf_data = tf_idf_vec.fit_transform(sentences_train_irony)
 
# create dataframe
tf_idf_dataframe = pd.DataFrame(tf_idf_data.toarray(),columns=tf_idf_vec.get_feature_names())
 
# with smooth IDF
tf_idf_vec_smooth = TfidfVectorizer(use_idf=True,  
                        smooth_idf=True,  
                        ngram_range=(1,1),stop_words='english')
 
 
tf_idf_data_smooth = tf_idf_vec_smooth.fit_transform(sentences_train_irony)
tf_idf_dataframe_smooth = pd.DataFrame(tf_idf_data_smooth.toarray(), columns = tf_idf_vec_smooth.get_feature_names())

In [82]:
# without smooth IDF
tf_idf_dataframe

Unnamed: 0,00,000,01273276865,034i,07,0_0,10,100,1000,1000sms,...,zebras,zen,zero,zimbabwe,zuckerberg,zzzz,ëœå,งวย,你就算5隔格我都知你讲me,ｆｏｌｌｏｗ
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2859,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2860,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [83]:
# with smooth IDF
tf_idf_dataframe_smooth

Unnamed: 0,00,000,01273276865,034i,07,0_0,10,100,1000,1000sms,...,zebras,zen,zero,zimbabwe,zuckerberg,zzzz,ëœå,งวย,你就算5隔格我都知你讲me,ｆｏｌｌｏｗ
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2859,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2860,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Scikit Tutorial
https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [84]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(sentences_train_irony)
X_train_counts.shape

(2862, 7991)

In [85]:
count_vect.vocabulary_.get(u'apple')

496

In [86]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(2862, 7991)

In [87]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2862, 7991)

### MultinomialNB

In [88]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, irony_train_labels)

In [89]:
categories = ['0', '1']

In [90]:
X_new_counts = count_vect.transform(sentences_test_irony)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

# create a dictonary
data = {"tweets": pd.Series(sentences_test_irony),
        "actual": pd.Series(irony_test_labels),
        "predicted": pd.Series(predicted)}

# dataframe of test data and predicted label
df = pd.concat(data, axis = 1)

In [91]:
df

Unnamed: 0,tweets,actual,predicted
0,@user Can U Help?||More conservatives needed o...,0,0
1,"Just walked in to #Starbucks and asked for a ""...",1,1
2,#NOT GONNA WIN,0,0
3,@user He is exactly that sort of person. Weirdo!,0,0
4,So much #sarcasm at work mate 10/10 #boring 10...,1,1
...,...,...,...
779,"If you drag yesterday into today, your tomorro...",0,0
780,Congrats to my fav @user & her team & my birth...,0,1
781,@user Jessica sheds tears at her fan signing e...,0,1
782,#Irony: al jazeera is pro Anti - #GamerGate be...,1,1


In [92]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
    ])

In [93]:
text_clf.fit(sentences_train_irony, irony_train_labels)

predicted = text_clf.predict(sentences_test_irony)
np.mean(predicted == irony_test_labels)

0.6543367346938775

### SGDClassifier

In [94]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='log', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)),
    ])

text_clf.fit(sentences_train_irony, irony_train_labels)

predicted = text_clf.predict(sentences_test_irony)
np.mean(predicted == irony_test_labels)

0.6492346938775511

In [95]:
from sklearn import metrics
print(metrics.classification_report(irony_test_labels, predicted, target_names=['not ironic', 'ironic']))

              precision    recall  f1-score   support

  not ironic       0.73      0.66      0.69       473
      ironic       0.55      0.63      0.59       311

    accuracy                           0.65       784
   macro avg       0.64      0.65      0.64       784
weighted avg       0.66      0.65      0.65       784



In [96]:
metrics.confusion_matrix(irony_test_labels, predicted)

array([[313, 160],
       [115, 196]], dtype=int64)

In [97]:
train_hate = file_to_array('../datasets/hate/test_labels.txt')

NameError: name 'file_to_array' is not defined

In [352]:
train_hate

array([1, 1, 1, ..., 0, 1, 1], dtype=int64)

In [353]:
irony_train = pd.read_csv("../datasets/irony/train_text.txt", sep="\t", quoting=3, names=["tweets"])
irony_test = pd.read_csv("../datasets/irony/test_text.txt", sep="\t", quoting=3, names=["tweets"])
irony_val = pd.read_csv("../datasets/irony/val_text.txt", sep="\t", quoting=3, names=["tweets"])

In [354]:
irony_test

Unnamed: 0,tweets
0,@user Can U Help?||More conservatives needed o...
1,"Just walked in to #Starbucks and asked for a ""..."
2,#NOT GONNA WIN
3,@user He is exactly that sort of person. Weirdo!
4,So much #sarcasm at work mate 10/10 #boring 10...
...,...
779,"If you drag yesterday into today, your tomorro..."
780,Congrats to my fav @user & her team & my birth...
781,@user Jessica sheds tears at her fan signing e...
782,#Irony: al jazeera is pro Anti - #GamerGate be...


# Accuracy of prediction in diffent datasets

In [355]:
SGD_accuracy('offensive', 'log')

(0.7530211480362538, 0.7988372093023256)

In [356]:
SGD_accuracy('irony', 'log')

(0.6219895287958115, 0.6377551020408163)

In [357]:
SGD_accuracy('stance/abortion', 'log')

(0.6212121212121212, 0.6428571428571429)

In [358]:
SGD_accuracy('stance/atheism', 'log')

(0.7115384615384616, 0.65)

In [359]:
SGD_accuracy('stance/climate', 'log')

(0.725, 0.7100591715976331)

In [360]:
SGD_accuracy('stance/feminist', 'log')

(0.5970149253731343, 0.5473684210526316)

In [361]:
SGD_accuracy('stance/hillary', 'log')

(0.6666666666666666, 0.6067796610169491)