#### Loading required libraries

In [None]:
import re
import string

import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords 

In [None]:
# Show tweet text untruncated in DataFrame output. `None` means "no limit";
# the older `-1` sentinel is deprecated and rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

  """Entry point for launching an IPython kernel.


In [None]:
# Fetch the NLTK data packages this notebook relies on.
# Stop-word corpus: read via stopwords.words('english') when building STOP_WORDS.
nltk.download('stopwords')
# Tokenizer models (presumably what word_tokenize loads — confirm).
nltk.download('punkt')
# WordNet corpus (presumably what WordNetLemmatizer looks words up in — confirm).
nltk.download('wordnet')
# Part-of-speech tagger model (presumably what pos_tag loads — confirm).
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

### Loading data

In [None]:
# Load the labelled tweets; the path is relative, so `train.csv` must sit in
# the kernel's working directory.
df = pd.read_csv('train.csv')
# Preview the first ten rows.
df.head(10)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,2,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦
4,5,0,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. #allshowandnogo
6,7,0,@user camping tomorrow @user @user @user @user @user @user @user dannyâ¦
7,8,0,the next school year is the year for exams.ð¯ can't think about that ð­ #school #exams #hate #imagine #actorslife #revolutionschool #girl
8,9,0,we won!!! love the land!!! #allin #cavs #champions #cleveland #clevelandcavaliers â¦
9,10,0,@user @user welcome here ! i'm it's so #gr8 !


In [None]:
# The `id` column is not used by anything further down in the notebook.
df = df.drop(columns=['id'])

In [None]:
df.tail(10)

Unnamed: 0,label,tweet
31952,0,@user you went too far with @user
31953,0,good morning #instagram #shower #water #berlin #berlincitygirl #girl #newyork #zÃ¼rich #genf #bern
31954,0,#holiday bull up: you will dominate your bull and you will direct it whatever you want it to do. when you
31955,0,less than 2 weeks ððð¼ð¹ððµ @user #ibiza#bringiton#mallorca#holidays#summer
31956,0,off fishing tomorrow @user carnt wait first time in 2 years
31957,0,ate @user isz that youuu?ðððððððððâ¤ï¸
31958,0,to see nina turner on the airwaves trying to wrap herself in the mantle of a genuine hero like shirley chisolm. #shame #imwithher
31959,0,listening to sad songs on a monday morning otw to work is sad
31960,1,"@user #sikh #temple vandalised in in #calgary, #wso condemns act"
31961,0,thank you @user for you follow


In [None]:
# Class balance check: count how many tweets carry each label. In the captured
# output, label 0 heavily outnumbers label 1.
df['label'].value_counts()

0    29720
1    2242 
Name: label, dtype: int64

### Cleaning and Tokenization

In [None]:
# Tokens discarded after tokenisation: English stop words, single punctuation
# characters and the AT_USER / URL placeholder tokens. The placeholders are
# also stored lower-cased because clean_text() lower-cases every token before
# looking it up, so the upper-case spellings alone could never match.
STOP_WORDS = (
    set(stopwords.words('english'))
    | set(string.punctuation)
    | {'AT_USER', 'URL', 'at_user', 'url'}
)


def clean_text(text):
    """Turn one raw tweet into a list of lower-cased content tokens.

    Processing order: strip @mentions, '#' signs, retweet markers and
    hyperlinks; delete the remaining punctuation; drop non-ASCII characters;
    tokenise with ``word_tokenize``; lower-case and filter out STOP_WORDS.

    :param text: raw tweet text (``str``).
    :return: list of lower-cased tokens that are not in ``STOP_WORDS``.
    """
    # The character class used to contain an en dash ("0–9"), which matches
    # only '0', '–' and '9'. A plain hyphen makes it a real digit range, and
    # '_' is included because handles may contain underscores.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'#', '', text)  # keep the hashtag word, drop the sign
    # \b keeps the retweet marker from eating the tail of words such as "ART ".
    text = re.sub(r'\bRT\s+', '', text)
    text = re.sub(r'https?://\S+', '', text)  # hyperlinks
    text = re.sub(r'[^\w\s]', '', text)  # remaining punctuation
    # Drop every character that has no ASCII representation (emoji, mojibake).
    text = text.encode('ascii', 'ignore').decode('ascii')

    lowered_tokens = (token.lower() for token in word_tokenize(text))
    return [token for token in lowered_tokens if token not in STOP_WORDS]

In [None]:
# Replace each raw tweet string with its cleaned token list. Not re-runnable:
# a second run would hand lists to clean_text(), whose re.sub calls need a str.
df['tweet'] = df['tweet'].apply(clean_text)

In [None]:
df.head(10)

Unnamed: 0,label,tweet
0,0,"[father, dysfunctional, selfish, drags, kids, dysfunction, run]"
1,0,"[thanks, lyft, credit, cant, use, cause, dont, offer, wheelchair, vans, pdx, disapointed, getthanked]"
2,0,"[bihday, majesty]"
3,0,"[model, love, u, take, u, time, ur]"
4,0,"[factsguide, society, motivation]"
5,0,"[22, huge, fan, fare, big, talking, leave, chaos, pay, disputes, get, allshowandnogo]"
6,0,"[camping, tomorrow, danny]"
7,0,"[next, school, year, year, exams, cant, think, school, exams, hate, imagine, actorslife, revolutionschool, girl]"
8,0,"[love, land, allin, cavs, champions, cleveland, clevelandcavaliers]"
9,0,"[welcome, im, gr8]"


### Lemmatization

In [None]:
from nltk.tag import pos_tag

In [None]:
help(pos_tag)

Help on function pos_tag in module nltk.tag:

pos_tag(tokens, tagset=None, lang='eng')
    Use NLTK's currently recommended part of speech tagger to
    tag the given list of tokens.
    
        >>> from nltk.tag import pos_tag
        >>> from nltk.tokenize import word_tokenize
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."))
        [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
        ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal')
        [('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ('idea', 'NOUN'), ('is', 'VERB'),
        ("n't", 'ADV'), ('all', 'DET'), ('that', 'DET'), ('bad', 'ADJ'), ('.', '.')]
    
    NB. Use `pos_tag_sents()` for efficient tagging of more than one sentence.
    
    :param tokens: Sequence of tokens to be tagged
    :type tokens: list(str)
    :param tagset: the tagset to be u

In [None]:
pos_tag(df['tweet'][0])

[('father', 'RBR'),
 ('dysfunctional', 'JJ'),
 ('selfish', 'JJ'),
 ('drags', 'NNS'),
 ('kids', 'NNS'),
 ('dysfunction', 'NN'),
 ('run', 'VBP')]

In [None]:
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer

def _wordnet_pos(tag):
    """Map a tag produced by ``pos_tag`` to the WordNet POS code for lemmatizing."""
    if tag.startswith('NN'):
        return 'n'
    if tag.startswith('VB'):
        return 'v'
    if tag.startswith('RB'):
        # Adverbs have their own WordNet category; they used to fall through
        # to the adjective branch below.
        return 'r'
    # Every other tag keeps the original fallback of "adjective".
    return 'a'


def lemmatize_sentence(tokens):
    """Lemmatize a token list, using each token's part-of-speech tag.

    The tokens are tagged with ``pos_tag``; each tag is reduced to a WordNet
    POS code (noun, verb, adverb, otherwise adjective) and passed to
    ``WordNetLemmatizer.lemmatize`` together with the word.

    :param tokens: list of string tokens (one tweet).
    :return: new list with one lemma per input token, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word, _wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]

In [None]:
# Overwrite each token list in the `tweet` column with its lemmatized version.
df['tweet'] = df['tweet'].apply(lemmatize_sentence)

In [None]:
df.head(10)

Unnamed: 0,label,tweet
0,0,"[father, dysfunctional, selfish, drag, kid, dysfunction, run]"
1,0,"[thanks, lyft, credit, cant, use, cause, dont, offer, wheelchair, van, pdx, disapointed, getthanked]"
2,0,"[bihday, majesty]"
3,0,"[model, love, u, take, u, time, ur]"
4,0,"[factsguide, society, motivation]"
5,0,"[22, huge, fan, fare, big, talk, leave, chaos, pay, dispute, get, allshowandnogo]"
6,0,"[camp, tomorrow, danny]"
7,0,"[next, school, year, year, exams, cant, think, school, exam, hate, imagine, actorslife, revolutionschool, girl]"
8,0,"[love, land, allin, cavs, champion, cleveland, clevelandcavaliers]"
9,0,"[welcome, im, gr8]"


### Data preparation

In [None]:
# Token lists of every tweet labelled 0, selected with one .loc call
# (row mask and column together).
df.loc[df['label'] == 0, 'tweet']

0        [father, dysfunctional, selfish, drag, kid, dysfunction, run]                                           
1        [thanks, lyft, credit, cant, use, cause, dont, offer, wheelchair, van, pdx, disapointed, getthanked]    
2        [bihday, majesty]                                                                                       
3        [model, love, u, take, u, time, ur]                                                                     
4        [factsguide, society, motivation]                                                                       
                       ...                                                                                       
31956    [fishing, tomorrow, carnt, wait, first, time, 2, year]                                                  
31957    [ate, isz, youuu]                                                                                       
31958    [see, nina, turner, airwave, try, wrap, mantle, genuine, hero, like, shirley, c

In [None]:
def get_tweets_for_model(cleaned_tokens_list):
    """Lazily convert token lists into feature dicts of the form ``{token: True}``.

    :param cleaned_tokens_list: iterable of token iterables (one per tweet).
    :return: generator yielding one dict per tweet; each distinct token
        becomes a key whose value is ``True``.
    """
    for tokens in cleaned_tokens_list:
        # Duplicate tokens collapse into a single key.
        yield dict.fromkeys(tokens, True)

In [None]:
import random

# Fixed seed so the shuffle, and therefore the train/test split and the
# accuracy reported further down, is the same on every run.
SEED = 42
# Share of the shuffled dataset used for training; the remainder is held out.
# A fixed cut at 7000 rows trained on only about 22% of the 31,962 tweets.
TRAIN_FRACTION = 0.7

# Label 0 is tagged "Positive" and label 1 "Negative" for the classifier.
positive_tokens_for_model = get_tweets_for_model(df.loc[df['label'] == 0, 'tweet'])
negative_tokens_for_model = get_tweets_for_model(df.loc[df['label'] == 1, 'tweet'])

positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                     for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

# A private Random instance keeps the shuffle reproducible without touching
# the global random state.
random.Random(SEED).shuffle(dataset)

split_at = int(len(dataset) * TRAIN_FRACTION)
train_data = dataset[:split_at]
test_data = dataset[split_at:]

# Every example must land in exactly one of the two partitions.
assert len(train_data) + len(test_data) == len(dataset)

In [None]:
train_data[0]

({'chuffed': True, 'get': True, 'never': True, 'one': True, 'school': True},
 'Positive')

In [None]:
from nltk import classify
from nltk import NaiveBayesClassifier

# Fit a Naive Bayes model on the training split.
classifier = NaiveBayesClassifier.train(train_data)

# Accuracy on the held-out split.
print("Accuracy is:", classify.accuracy(classifier, test_data))

# show_most_informative_features() prints its table itself and returns None,
# so wrapping it in print() only added a stray "None" line.
classifier.show_most_informative_features(20)

Accuracy is: 0.8325054082204951
Most Informative Features
                   bigot = True           Negati : Positi =    118.1 : 1.0
                  racist = True           Negati : Positi =     77.5 : 1.0
                  israel = True           Negati : Positi =     56.8 : 1.0
                  temple = True           Negati : Positi =     56.8 : 1.0
                shepherd = True           Negati : Positi =     56.8 : 1.0
        blacklivesmatter = True           Negati : Positi =     56.8 : 1.0
             southafrica = True           Negati : Positi =     48.1 : 1.0
                   tampa = True           Negati : Positi =     48.1 : 1.0
                  racism = True           Negati : Positi =     43.1 : 1.0
                    rape = True           Negati : Positi =     39.4 : 1.0
                 protest = True           Negati : Positi =     39.4 : 1.0
                    tcot = True           Negati : Positi =     39.4 : 1.0
                 factory = True           

AttributeError: ignored

### Rough work

In [None]:
nltk.download('names')

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.


True

In [None]:
import random
from nltk.corpus import names

# (name, gender) pairs: every entry of male.txt first, then every entry of
# female.txt; the combined list is then shuffled in place.
labeled_names = [
    (name, gender)
    for gender, corpus_file in (('male', 'male.txt'), ('female', 'female.txt'))
    for name in names.words(corpus_file)
]
random.shuffle(labeled_names)

In [None]:
labeled_names[0]

('Danna', 'female')

In [None]:
def gender_features(word):
    """Build the one-entry feature dict for a name.

    :param word: non-empty string (indexing an empty string raises IndexError).
    :return: ``{'last_letter': <final character of word>}``.
    """
    final_char = word[-1]
    return dict(last_letter=final_char)

In [None]:
# Pair each name's feature dict with its gender label, in labeled_names order.
featuresets = [
    (gender_features(person_name), label)
    for person_name, label in labeled_names
]

In [None]:
# Preview a few entries only; echoing the whole list floods the cell output
# with one line per name.
featuresets[:10]

[({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'l'}, 'female'),
 ({'last_letter': 'r'}, 'male'),
 ({'last_letter': 'y'}, 'female'),
 ({'last_letter': 's'}, 'male'),
 ({'last_letter': 'e'}, 'male'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'e'}, 'male'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'd'}, 'male'),
 ({'last_letter': 'n'}, 'male'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'e'}, 'male'),
 ({'last_letter': 's'}, 'female'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'n'}, 'female'),
 ({'last_letter': 'n'}, 'male'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 's'}, 'male'),
 ({'last_letter': 'n'}, 'male'),
 ({'last_letter': 'a'}, 'female'),
 ({'l