In [1]:
import pandas as pd
import numpy as np

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# set env
MODEL_VERSION = 'a'
PATH ='/content/drive/MyDrive/Colab Notebooks/clean_train_tweets.csv'
train_tweets=pd.read_csv(PATH)
train_tweets

Unnamed: 0,id,label,tweet,length,count
0,1,0,father dysfunctional selfish drags kids dysfun...,55,7
1,2,0,thanks lyft credit use cause offer wheelchair ...,77,11
2,3,0,bihday majesty,14,2
3,4,0,model love u take u time ur,27,7
4,5,0,factsguide society motivation,29,3
...,...,...,...,...,...
31925,31958,0,ate isz youuu,13,3
31926,31959,0,see nina turner airwaves trying wrap mantle ge...,93,14
31927,31960,0,listening sad songs monday morning otw work sad,47,8
31928,31961,1,sikh temple vandalised calgary wso condemns act,47,7


### Splitting the train dataset into train and development

In [4]:
#to test perfomarnce against the development set, we can split the training dataset into train and dev

from sklearn.model_selection import train_test_split

In [5]:
#15% of train_tweets will be in dev
train, dev= train_test_split(train_tweets, test_size=0.15, random_state=42)

### CountVectorizer

In [6]:
#using scikit-learn to transform text into token count vector

from sklearn.feature_extraction.text import CountVectorizer

count_vector = CountVectorizer(
    token_pattern = r"[a-z]+", 
    ngram_range = (1,1), #ngram_range (1,1)= only unigrams, (1,2)=unigrams and bigrams, (2,2)=bigrams
    lowercase = True,
    min_df = 1, #min_df=1 is the default, means ignore terms that appear in less than 1 document/text.
    max_df = 1.0 #max_df=1.0 is the default, means ignore terms that appear in more than 100% of the documents/texts.
)

In [7]:
X_train = train['tweet']
X_train_vect = count_vector.fit_transform(X_train) #fitting CountVectorizer, transforms trainging data into 
                                                    #matrix representing token counts 
X_train_vect

<27140x34039 sparse matrix of type '<class 'numpy.int64'>'
	with 200746 stored elements in Compressed Sparse Row format>

In [8]:
X_train

5561     excited saturday fake festival sister bihday s...
5500     folks repoing violence france euro 2016 though...
25832                         remaster remaster everywhere
4480                            happy positive affirmation
18879    inspired grateful aists painters creative peop...
                               ...                        
29802    ahhh hea breaks really seemed like keeper bach...
5390     bread x one million loaves bread math carbs li...
860      black professor makes assumptions entire race ...
15795                launch new restaurant tuesday buzzing
23654    war sucks deathbattle fightfor 15 bombers kill...
Name: tweet, Length: 27140, dtype: object

In [9]:
y_train = train['label']

In [10]:
X_dev = dev['tweet']
X_dev_vect = count_vector.transform(X_dev) # Note that the vectorizer is already fit, so we only use the transform method.
y_dev = dev['label']

#setting up a PredefinedSplit

X_train = train['tweet']
y_train = train['label']

X_dev = dev['tweet']
y_dev = dev['label']

X = np.hstack([X_train, X_dev])
y = np.hstack([y_train, y_dev])

#assign 0 to items that are in dev and -1 for the rest
split_train_dev= np.zeros(shape=y.shape)
split_train_dev[:y_train.shape[0]] = -1
pd.value_counts(split_train_dev)

In [11]:
X_train=pd.DataFrame(X_train)
X_train

Unnamed: 0,tweet
5561,excited saturday fake festival sister bihday s...
5500,folks repoing violence france euro 2016 though...
25832,remaster remaster everywhere
4480,happy positive affirmation
18879,inspired grateful aists painters creative peop...
...,...
29802,ahhh hea breaks really seemed like keeper bach...
5390,bread x one million loaves bread math carbs li...
860,black professor makes assumptions entire race ...
15795,launch new restaurant tuesday buzzing


In [12]:
from nltk.tokenize import word_tokenize, regexp_tokenize

#tokenizing tweets in X_train
X_train=[[x for x in regexp_tokenize(sentence.lower(), pattern = r"[a-z]+")] for sentence in X_train['tweet'].tolist()]

In [13]:
X_train

[['excited',
  'saturday',
  'fake',
  'festival',
  'sister',
  'bihday',
  'sis',
  'mates',
  'mate',
  'mom'],
 ['folks',
  'repoing',
  'violence',
  'france',
  'euro',
  'though',
  'caused',
  'cameron',
  'kinsmen',
  'may',
  'even',
  'staed'],
 ['remaster', 'remaster', 'everywhere'],
 ['happy', 'positive', 'affirmation'],
 ['inspired', 'grateful', 'aists', 'painters', 'creative', 'people', 'meme'],
 ['break',
  'bad',
  'habit',
  'success',
  'courage',
  'education',
  'choices',
  'cope'],
 ['day',
  'america',
  'sad',
  'lost',
  'lives',
  'sad',
  'families',
  'sad',
  'obama',
  'happen',
  'islam',
  'orlando',
  'tcot'],
 ['motherfucker',
  'k',
  'tweets',
  'one',
  'month',
  'even',
  'real',
  'person',
  'one'],
 ['saturday',
  'dance',
  'classes',
  'today',
  'amazing',
  'made',
  'drink',
  'mug',
  'danceteacher',
  'muscles',
  'ache'],
 ['versailles', 'men', 'look', 'like', 'early', 's', 'throwbacks', 'days'],
 ['couple', 'knew', 'together', 'foreve

In [14]:
tokens = sorted(set([x for sentence in X_train for x in sentence]))

In [15]:
word_token = {word:i + 1 for i, word in enumerate(tokens)}
token_word = {i + 1:word for i, word in enumerate(tokens)}

vocab_size = len(word_token) + 1

In [16]:
import tensorflow as tf

In [17]:
X_train = [[word_token[word] for word in sentence] for sentence in X_train]
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen = 100)

In [18]:
X_train

array([[    0,     0,     0, ..., 18518, 18508, 19360],
       [    0,     0,     0, ..., 18557,  9596, 28207],
       [    0,     0,     0, ..., 24910, 24910,  9639],
       ...,
       [    0,     0,     0, ..., 20568, 32608, 15651],
       [    0,     0,     0, ..., 25098, 31089,  4331],
       [    0,     0,     0, ..., 33510,  3190, 22362]], dtype=int32)

In [19]:
X_dev=pd.DataFrame(X_dev)
X_dev

Unnamed: 0,tweet
29583,tried wedding suit 1st time today much
2153,sent father day gifts today fathersdaygiftidea...
18511,bangkok got 7 shopping cantwaittoseegot 7 omg
1668,lamp head see fragile saw li
10092,3 weeks till interrailing
...,...
22226,kwon soon young born day
18515,realitycheck policing america cop audiblechann...
4749,hard believe live world 700 pa hillary thread ...
31290,father day love u papa


In [20]:
X_dev=[[x for x in regexp_tokenize(sentence.lower(), pattern = r"[a-z]+")] for sentence in X_dev['tweet'].tolist()]

In [21]:
X_dev

[['tried', 'wedding', 'suit', 'st', 'time', 'today', 'much'],
 ['sent',
  'father',
  'day',
  'gifts',
  'today',
  'fathersdaygiftideas',
  'mydadmyhero',
  'fathersday'],
 ['bangkok', 'got', 'shopping', 'cantwaittoseegot', 'omg'],
 ['lamp', 'head', 'see', 'fragile', 'saw', 'li'],
 ['weeks', 'till', 'interrailing'],
 ['ugh', 'day', 'things', 'going', 'feel', 'easier'],
 ['day', 'nutella', 'shit', 'girls', 'instapic'],
 ['doubt',
  'always',
  'see',
  'famous',
  'actor',
  'robe',
  'pattinson',
  'person',
  'prsucks'],
 ['people',
  'like',
  'already',
  'forgot',
  'nah',
  'new',
  'names',
  'made',
  'hashtag',
  'weekly',
  'basis'],
 ['dont',
  'want',
  'world',
  'like',
  'homophonia',
  'phobie',
  'asshole',
  'praygay',
  'orlando',
  'prayfororlando'],
 ['sister', 'never', 'love', 'friends', 'bestfriends', 'cute', 'girl'],
 ['kill',
  'xd',
  'snapchat',
  'snap',
  'dog',
  'thebestoftheday',
  'friki',
  'otaku',
  'dumb',
  'friends'],
 ['taiwan',
  'president',
 

In [22]:
tokens = sorted(set([x for sentence in X_dev for x in sentence]))

In [23]:
word_token = {word:i + 1 for i, word in enumerate(tokens)}
token_word = {i + 1:word for i, word in enumerate(tokens)}

vocab_size = len(word_token) + 1

In [24]:
X_dev = [[word_token[word] for word in sentence] for sentence in X_dev]
X_dev = tf.keras.preprocessing.sequence.pad_sequences(X_dev, maxlen = 100)

X_dev

array([[    0,     0,     0, ...,  9741,  9774,  6399],
       [    0,     0,     0, ...,  3342,  6448,  3341],
       [    0,     0,     0, ...,  8625,  1481,  6879],
       ...,
       [    0,     0,     0, ...,  9919,  9799,  4301],
       [    0,     0,     0, ...,  5743, 10049,  7071],
       [    0,     0,     0, ..., 10879,  2379,  6231]], dtype=int32)

### Decision Trees

In [25]:
from sklearn.tree import DecisionTreeClassifier
clf=DecisionTreeClassifier()
clf=clf.fit(X_train, y_train)

In [26]:
from sklearn.utils import class_weight 

<module 'sklearn.utils.class_weight' from '/usr/local/lib/python3.7/dist-packages/sklearn/utils/class_weight.py'>

In [27]:
clf.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [28]:
X_dev

array([[    0,     0,     0, ...,  9741,  9774,  6399],
       [    0,     0,     0, ...,  3342,  6448,  3341],
       [    0,     0,     0, ...,  8625,  1481,  6879],
       ...,
       [    0,     0,     0, ...,  9919,  9799,  4301],
       [    0,     0,     0, ...,  5743, 10049,  7071],
       [    0,     0,     0, ..., 10879,  2379,  6231]], dtype=int32)

In [29]:
predictions=clf.predict(X_dev)
predictions

array([0, 0, 1, ..., 0, 0, 0])

In [30]:
#probablity for each class

#[1., 0.]/[0.95, 0.04] means that, we predict the tweet is class 0 with 1/.95 out of 1 chance, and class 1 with 0/0.04 out of 1 chance
clf.predict_proba(X_dev)

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [31]:
from sklearn.metrics import accuracy_score 

accuracy_score(y_dev, predictions)

0.8870563674321503

In [32]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_dev, predictions, labels=[0,1])

array([[4229,  198],
       [ 343,   20]])

In [33]:
from sklearn.metrics import precision_score
precision_score(y_dev, predictions)

0.09174311926605505

In [34]:
from sklearn.metrics import recall_score

recall_score(y_dev, predictions)

0.05509641873278237

In [35]:
from sklearn.metrics import classification_report 

print(classification_report(y_dev, predictions, target_names=['non_offensive', 'offensive']))

               precision    recall  f1-score   support

non_offensive       0.92      0.96      0.94      4427
    offensive       0.09      0.06      0.07       363

     accuracy                           0.89      4790
    macro avg       0.51      0.51      0.50      4790
 weighted avg       0.86      0.89      0.87      4790



In [39]:
clf.feature_importances_

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     