## Data Modeling 
Import .csv file in order for modelling to happen.

In [7]:
import pandas as pd

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

In [19]:
data = pd.read_csv('train_engineered.csv')
data.head()

# remove NaN comments from dataframe
data = data.dropna(how = 'any')

### Split into Testing & Training

Split data after all features 'required' have been added as additional columns.

In [36]:
# Split into train & validation sets
# For word embeddings, explicitly provide x = comments, target = label

# only consider 'toxic' as the traget variable for now
train_t = data.drop(['severe_toxic','obscene', 'threat', 'insult', 'identity_hate'], axis = 1)

train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train_t['comment_text'], train_t['toxic'])

# label encode the target variable (to feed into the algorithm)
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [37]:
train_t.head()

Unnamed: 0,id,comment_text,toxic,word_count,avg_word,mentions,uppercase,stop_words,sentiment
0,0000997932d777bf,explanation edits made username hardcore metal...,0,42,5,0,2,14,0.136364
1,000103f0d9cfb60f,daww match background colour im seemingly stuc...,0,18,5,0,1,1,0.2
2,000113f07ec002fd,hey man im really trying edit war guy constant...,0,42,4,0,0,18,0.15
3,0001b41b1c6bb37e,cant make real suggestion improvement wondered...,0,112,4,0,5,49,0.183333
4,0001d958c54c6e35,sir hero chance remember page thats,0,13,4,0,0,4,0.0


### Feature Engineering

First focus is to wrod-embed our dataset. By doing so, we are able to convert out text to 'number' equivalents.
(Map word to Vectors).

This will be explored by Frequency X, broken down as follows:
- Count Vector
- TF-IDF Vector
- (Co-Occurance)

In [38]:
# Count Vector

count_vector = CountVectorizer(analyzer = 'word', token_pattern = r'\w{1,}')
count_vector.fit(data['comment_text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vector.transform(train_x)
xvalid_count =  count_vector.transform(valid_x)

In [39]:
count_vector

CountVectorizer(analyzer='word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\w{1,}', tokenizer=None,
        vocabulary=None)

In [40]:
xtrain_count

<119664x237008 sparse matrix of type '<type 'numpy.int64'>'
	with 3316977 stored elements in Compressed Sparse Row format>

In [42]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer= 'word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(data['comment_text'])

# transform the training and validation data using count vectorizer object
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

tfidf_vect

TfidfVectorizer(analyzer='word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=True,
        vocabulary=None)

In [43]:
xtrain_tfidf

# 2 elements doesn't seem right?

<119664x5000 sparse matrix of type '<type 'numpy.float64'>'
	with 2716659 stored elements in Compressed Sparse Row format>

### Model Building

In [33]:
# In an effort to build a utility function that will train a model based on inputs.

def train_model(classifier, train_set, target, valid_set):
    # fit the training data on classifier
    classifier.fit(train_set, target)
    
    # predict on validation set
    predictions = classifier.predict(valid_set)
    
    # predict on test set & save results (csv)
    
    return metrics.accuracy_score(predictions, valid_y)

#### Naive Bayes

In [44]:
# on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print "Naive Bayes - Count Vector: ", accuracy

# on TF-IDF vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print "Naive Bayes - TF-IDF: ", accuracy

Naive Bayes - Count Vector:  0.942388688327
Naive Bayes - TF-IDF:  0.951940433213


#### Logistic Regression

In [46]:
# on Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count)
print "Logistic Regression - Count Vector: ", accuracy

accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
print "Logistic Regression - TF-IDF: ", accuracy

Logistic Regression - Count Vector:  0.95326915363
Logistic Regression - TF-IDF:  0.958558965102


#### Normal Logistic Regression

In [69]:
data_vars = data.drop(['id','comment_text','toxic','severe_toxic','obscene', 'threat', 'insult', 'identity_hate'], axis = 1)
target = data['toxic']

x_train, x_test, y_train, y_test = model_selection.train_test_split(data_vars, target, test_size=0.2)

logit = linear_model.LogisticRegression()
logit.fit(x_train, y_train)

predictions = logit.predict(x_test)

In [73]:
predictions

accuracy = logit.score(x_test, y_test)
print "Logistic Regression: ", accuracy

Logistic Regression:  0.907430039798


In [76]:
# Help here?

pd.concat([data['id'], pd.DataFrame(predictions)])

Unnamed: 0,0
0,0000997932d777bf
1,000103f0d9cfb60f
2,000113f07ec002fd
3,0001b41b1c6bb37e
4,0001d958c54c6e35
5,00025465d4725e87
6,0002bcb3da6cb337
7,00031b1e95af7921
8,00037261f536c51d
9,00040093b2687caa
