# Data Mining Final Assignment
### Sotiria Pantazi, 1115201700241

In [93]:
import os
import pandas as pd
import string
import nltk
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer 
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from wordcloud import STOPWORDS
from sklearn.feature_extraction import text

In [2]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/impermium_verification_set.csv')
labeled_test = pd.read_csv('./data/impermium_verification_labels.csv')

In [3]:
train['Comment'] = train['Comment'].str.lower()
test['Comment'] = test['Comment'].str.lower()

# Data Preprocessing

*Applying on the train dataset*

In [4]:
# source for this code: stackoverflow.com and medium.com
train.Comment = train.Comment.apply(lambda x: re.sub(r'(\\x..)|(\\n)|(\\u....)', ' ', x))   # removing chars like \n etc
train.Comment = train.Comment.apply(lambda x: re.sub(r'(http/http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', x))    # removing urls
train.Comment = train.Comment.apply(lambda x: re.sub(r'[^a-z0-9]+', ' ', x))    # removing any useless chars left
train

Unnamed: 0,Insult,Date,Comment
0,1,20120618192155Z,you fuck your dad
1,0,20120528192215Z,i really don t understand your point it seems...
2,0,,a majority of canadians can and has been wron...
3,0,,listen if you dont wanna get married to a man...
4,0,20120619094753Z,c c b n xu ng ng bi u t nh 2011 c n ho kh ng ...
...,...,...,...
3942,1,20120502172717Z,you are both morons and that is never happening
3943,0,20120528164814Z,many toolbars include spell check like yahoo ...
3944,0,20120620142813Z,lambeauorwrigley k moss sioux falls s d i tol...
3945,0,20120528205648Z,how about felix he is sure turning into one h...


*Applying on the test dataset*

In [5]:
# source for this code: stackoverflow.com and medium.com
test.Comment = train.Comment.apply(lambda x: re.sub(r'(\\x..)|(\\n)|(\\u....)', ' ', x))   # removing chars like \n etc
test.Comment = train.Comment.apply(lambda x: re.sub(r'(http/http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', ' ', x))    # removing urls
test.Comment = train.Comment.apply(lambda x: re.sub(r'[^a-z0-9]+', ' ', x))    # removing any useless chars left
test

Unnamed: 0,id,Insult,Date,Comment,Usage
0,1,,20120603163526Z,you fuck your dad,PrivateTest
1,2,,20120531215447Z,i really don t understand your point it seems...,PrivateTest
2,3,,20120823164228Z,a majority of canadians can and has been wron...,PrivateTest
3,4,,20120826010752Z,listen if you dont wanna get married to a man...,PrivateTest
4,5,,20120602223825Z,c c b n xu ng ng bi u t nh 2011 c n ho kh ng ...,PrivateTest
...,...,...,...,...,...
2230,2231,,20120528100303Z,you is shit,PrivateTest
2231,2232,,20120531185813Z,even if the bucks won the nba they wouldnt ca...,PrivateTest
2232,2233,,20120529130822Z,damn straight my friend if it s not the judge...,PrivateTest
2233,2234,,20120531045826Z,i d rather have an old straight white guy tha...,PrivateTest


# Classification

## Method: Naive Bayes

In [6]:
validation_metrics = ['f1', 'accuracy']

### *Using CountVectorizer - basic*

In [7]:
vect = CountVectorizer() # initializing vectorizer 

In [8]:
basic_train = vect.fit_transform(train.Comment.values)
basic_train = basic_train.toarray()     # creating ndarray with features 

In [9]:
basic_test = vect.transform(test.Comment.values)
basic_test = basic_test.toarray()   # applying the same as above to the test set

In [10]:
basicGNB = GaussianNB()
basicGNB.fit(basic_train, train.Insult.values)    # y is the label (0 or 1) of insults

GaussianNB(priors=None, var_smoothing=1e-09)

In [11]:
basic_scores = cross_validate(estimator = basicGNB, X = basic_test, y = labeled_test.Insult.values, scoring = validation_metrics, cv = 10, n_jobs = 6)
print('F1 mean score:', basic_scores['test_f1'].mean())
print('Accuracy mean score:', basic_scores['test_accuracy'].mean())

F1 mean score: 0.37347561009810304
Accuracy mean score: 0.5011010570147342


### Optimization

*With Lemmatization*

In [12]:
def Lemmatizer(tweets):
    tweetTok = TweetTokenizer()
    wnlemm = WordNetLemmatizer()
    tokenized = []
    for t in tweets:
        tokenized.append(tweetTok.tokenize(t))
    lemmatized = []
    for tok in tokenized:
        lemm_i = []
        for word in tok:
            lemm_i.append(wnlemm.lemmatize(word))
        lemm_i = ' '.join(lemm_i)
        lemmatized.append(lemm_i)
    return lemmatized

In [13]:
vect = CountVectorizer() # initializing vectorizer

In [14]:
lemmatized_train = Lemmatizer(train.Comment.values)

In [15]:
opt1_train = vect.fit_transform(lemmatized_train)
opt1_train = opt1_train.toarray()     # creating ndarray with features 

In [16]:
lemmatized_test = Lemmatizer(test.Comment.values)

In [17]:
opt1_test = vect.transform(lemmatized_test)
opt1_test = opt1_test.toarray()   # applying the same as above to the test set

In [18]:
opt1GNB = GaussianNB()
opt1GNB.fit(opt1_train, train.Insult.values)    # y is the label (0 or 1) of insults

GaussianNB(priors=None, var_smoothing=1e-09)

In [19]:
opt1_scores = cross_validate(estimator = opt1GNB, X = opt1_test, y = labeled_test.Insult.values, scoring = validation_metrics, cv = 10, n_jobs = 6)
print('F1 mean score:', opt1_scores['test_f1'].mean())
print('Accuracy mean score:', opt1_scores['test_accuracy'].mean())

F1 mean score: 0.37110409764149693
Accuracy mean score: 0.504234064702114


*With Stopwords*

In [20]:
vect = CountVectorizer(stop_words = text.ENGLISH_STOP_WORDS) # initializing vectorizer 

In [21]:
opt2_train = vect.fit_transform(train.Comment.values)
opt2_train = opt2_train.toarray()     # creating ndarray with features 

In [22]:
opt2_test = vect.transform(test.Comment.values)
opt2_test = opt2_test.toarray()   # applying the same as above to the test set

In [23]:
opt2GNB = GaussianNB()
opt2GNB.fit(opt2_train, train.Insult.values)    # y is the label (0 or 1) of insults

GaussianNB(priors=None, var_smoothing=1e-09)

In [24]:
opt2_scores = cross_validate(estimator = opt2GNB, X = opt2_test, y = labeled_test.Insult.values, scoring = validation_metrics, cv = 10, n_jobs = 6)
print('F1 mean score:', opt2_scores['test_f1'].mean())
print('Accuracy mean score:', opt2_scores['test_accuracy'].mean())

F1 mean score: 0.36869566998427594
Accuracy mean score: 0.49797005124919924


*With Bigrams*

In [25]:
vect = CountVectorizer(ngram_range = (2,2)) # initializing vectorizer 

In [26]:
opt3_train = vect.fit_transform(train.Comment.values)
opt3_train = opt3_train.toarray()     # creating ndarray with features 

In [27]:
opt3_test = vect.transform(test.Comment.values)
opt3_test = opt3_test.toarray()   # applying the same as above to the test set

In [28]:
opt3GNB = GaussianNB()
opt3GNB.fit(opt3_train, train.Insult.values)    # y is the label (0 or 1) of insults

GaussianNB(priors=None, var_smoothing=1e-09)

In [29]:
opt3_scores = cross_validate(estimator = opt3GNB, X = opt3_test, y = labeled_test.Insult.values, scoring = validation_metrics, cv = 10, n_jobs = 6)
print('F1 mean score:', opt3_scores['test_f1'].mean())
print('Accuracy mean score:', opt3_scores['test_accuracy'].mean())

F1 mean score: 0.4406924085305258
Accuracy mean score: 0.5158872517616914


*With Laplace Smoothing*

In [30]:
vect = CountVectorizer() # initializing vectorizer 

In [39]:
#MultinomialNB will be used since it provides the coefficient alpha for smoothing
opt4MNB = MultinomialNB(alpha = 0.5)   # the basic alpha here will be 0.5
opt4MNB.fit(basic_train, train.Insult.values)    # y is the label (0 or 1) of insults

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [40]:
opt4_scores = cross_validate(estimator = opt4MNB, X = basic_test, y = labeled_test.Insult.values, scoring = validation_metrics, cv = 10, n_jobs = 6)
print('F1 mean score:', opt4_scores['test_f1'].mean())
print('Accuracy mean score:', opt4_scores['test_accuracy'].mean())

F1 mean score: 0.4521485869077558
Accuracy mean score: 0.49664678090967324


### *Using TF-IDF and PoS*

*Part-of-Speech based features*

In [71]:
# the source of this process's algorithm was found on stackoveflow.com
def PoS_processing(tweets):  
    tokzer = TweetTokenizer()
    new_tweets = []
    for tweet in tweets:
        text = tokzer.tokenize(tweet)
        tagged = nltk.pos_tag(text)
        processed_tagged = []
        for tag in tagged:
            processed_tagged.append(tag[0] + "/" + tag[1])
        new_tweets.append(' '.join(processed_tagged))
    return new_tweets

*TF-IDF applied on extracted PoS features*

In [72]:
pos_train = PoS_processing(train.Comment.values)
pos_test = PoS_processing(test.Comment.values)

In [73]:
tfidf = TfidfVectorizer()   
tf_train = tfidf.fit_transform(pos_train)
tf_train = tf_train.toarray()

In [75]:
tf_test = tfidf.transform(pos_test)
tf_test = tf_test.toarray()

## Method: Naive Bayes

In [77]:
comboGNB = GaussianNB()
comboGNB.fit(tf_train, train.Insult.values)    # y is the label (0 or 1) of insults

GaussianNB(priors=None, var_smoothing=1e-09)

In [78]:
combo_scores = cross_validate(estimator = comboGNB, X = tf_test, y = labeled_test.Insult.values, scoring = validation_metrics, cv = 10, n_jobs = 6)
print('F1 mean score:', combo_scores['test_f1'].mean())
print('Accuracy mean score:', combo_scores['test_accuracy'].mean())

F1 mean score: 0.42276774672458506
Accuracy mean score: 0.48990831197950035


## Method: Random Decision Forest

In [91]:
forest = RandomForestClassifier(max_depth=16,n_jobs=4)
forest.fit(tf_train, y = train.Insult.values)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=16, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=4,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [92]:
forest_scores = cross_validate(estimator = forest, X = tf_test, y = labeled_test.Insult.values, scoring = validation_metrics, cv = 10, n_jobs = 6)
print('F1 mean score:', forest_scores['test_f1'].mean())
print('Accuracy mean score:', forest_scores['test_accuracy'].mean())

F1 mean score: 0.3158875539792419
Accuracy mean score: 0.5123178251121077


## Method: Support Vector Machine

In [94]:
clf = SVC(kernel = 'rbf', C = 1)
clf.fit(tf_train, y = train.Insult.values)

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [95]:
svm_scores = cross_validate(estimator = clf, X = tf_test, y = labeled_test.Insult.values, scoring = validation_metrics, cv = 10, n_jobs = 6)
print('F1 mean score:', svm_scores['test_f1'].mean())
print('Accuracy mean score:', svm_scores['test_accuracy'].mean())

F1 mean score: 0.37165013398480234
Accuracy mean score: 0.482361066623959
