In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.externals import joblib
from my_measures import BinaryClassificationPerformance
%matplotlib inline

### read and summarize data

In [3]:
f = './toxiccomments_sample_submission.csv'
toxic_data = pd.read_csv(f)

In [4]:
print("toxic_data is:", type(toxic_data))
print("toxic_data has", toxic_data.shape[0], "rows and", toxic_data.shape[1], "columns", "\n")
print("the data types for each of the columns in toxic_data:")
print(toxic_data.dtypes, "\n")
print("the first 10 rows in toxic_data:")
print(toxic_data.head(5))

toxic_data is: <class 'pandas.core.frame.DataFrame'>
toxic_data has 153164 rows and 2 columns 

the data types for each of the columns in toxic_data:
id            object
prediction      bool
dtype: object 

the first 10 rows in toxic_data:
                 id  prediction
0  00001cee341fdb12        True
1  0000247867823ef7        True
2  00013b17ad220c46       False
3  00017563c3f7919a        True
4  00017695ad8997eb       False


In [5]:
print("The rate of toxic comments in the dataset: ")
print(toxic_data['toxic'].mean())

The rate of toxic comments in the dataset: 


KeyError: 'toxic'

### Feature extraction on natural language data

In [6]:
# # http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer()
# corpus = toxic_data.comment_text.as_matrix()
# X_bag_of_words = vectorizer.fit_transform(corpus)
# print(X_bag_of_words.toarray())

In [7]:
# vectorize Bag of Words from review text; as sparse matrix
from sklearn.feature_extraction.text import HashingVectorizer
hv = HashingVectorizer(n_features=2 ** 17, non_negative=True)
X_hv = hv.fit_transform(toxic_data.comment_text)
print(X_hv.shape)

AttributeError: 'DataFrame' object has no attribute 'comment_text'

In [8]:
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
X_tfidf = transformer.fit_transform(X_hv)

NameError: name 'X_hv' is not defined

In [9]:
print(type(X_tfidf))

NameError: name 'X_tfidf' is not defined

### Create additional quantitative features

In [10]:
# features from Amazon.csv to add to feature set
toxic_data['word_count'] = toxic_data['comment_text'].str.split(' ').str.len()
toxic_data['punc_count'] = toxic_data['comment_text'].str.count("\.")


X_quant_features = toxic_data[["word_count", "punc_count"]]
print(X_quant_features.head(10))
print(type(X_quant_features))

KeyError: 'comment_text'

### Combine all quantitative features into a single sparse matrix

In [11]:
from scipy.sparse import csr_matrix, hstack
X_quant_features_csr = csr_matrix(X_quant_features)
X_combined = hstack([X_tfidf, X_quant_features_csr])
X_matrix = csr_matrix(X_combined) # convert to sparse matrix
print(X_matrix.shape)

NameError: name 'X_quant_features' is not defined

In [12]:
# look at an example of a "row" of a sparse matrix
print(X_matrix[1234])

NameError: name 'X_matrix' is not defined

### Create `X`, scaled matrix of features

In [13]:
# feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean=False)
X = sc.fit_transform(X_matrix)
print(X.shape)

NameError: name 'X_matrix' is not defined

In [14]:
# look at an example of a "row" of a sparse matrix, after scaling
print(X[1234])

NameError: name 'X' is not defined

# Create Training and Test Sets

In [15]:
# create training and test sets
from sklearn.model_selection import train_test_split

# enter an integer for the random_state parameter; any integer will work
X_train, X_test, y_train, y_test, X_raw_train, X_raw_test = train_test_split(X, toxic_data['toxic'], toxic_data, test_size= 0.2, random_state=)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
print(X_raw_train.shape)
print(X_raw_test.shape)

SyntaxError: invalid syntax (<ipython-input-15-4a064e8a9880>, line 5)

# Fit Models

### MODEL: ordinary least squares

In [16]:
from sklearn import linear_model
ols = linear_model.SGDClassifier(loss="squared_loss")
ols.fit(X_train, y_train)

ols_performance_train = BinaryClassificationPerformance(ols.predict(X_train), y_train, 'ols_train')
ols_performance_train.compute_measures()
print(ols_performance_train.performance_measures)

NameError: name 'X_train' is not defined

### MODEL: SVM, linear

In [17]:
from sklearn import linear_model
svm = linear_model.SGDClassifier()
svm.fit(X_train, y_train)

svm_performance_train = BinaryClassificationPerformance(svm.predict(X_train), y_train, 'svm_train')
svm_performance_train.compute_measures()
print(svm_performance_train.performance_measures)

NameError: name 'X_train' is not defined

### MODEL: logistic regression

In [18]:
from sklearn import linear_model
lgs = linear_model.SGDClassifier(loss='log', n_iter=50, alpha=0.00001)
lgs.fit(X_train, y_train)

lgs_performance_train = BinaryClassificationPerformance(lgs.predict(X_train), y_train, 'lgs_train')
lgs_performance_train.compute_measures()
print(lgs_performance_train.performance_measures)

NameError: name 'X_train' is not defined

### MODEL: Naive Bayes

In [19]:
from sklearn.naive_bayes import MultinomialNB
nbs = MultinomialNB()
nbs.fit(X_train, y_train)

nbs_performance_train = BinaryClassificationPerformance(nbs.predict(X_train), y_train, 'nbs_train')
nbs_performance_train.compute_measures()
print(nbs_performance_train.performance_measures)

NameError: name 'X_train' is not defined

### MODEL: Perceptron

In [20]:
from sklearn import linear_model
prc = linear_model.SGDClassifier(loss='perceptron')
prc.fit(X_train, y_train)

prc_performance_train = BinaryClassificationPerformance(prc.predict(X_train), y_train, 'prc_train')
prc_performance_train.compute_measures()
print(prc_performance_train.performance_measures)

NameError: name 'X_train' is not defined

### MODEL: Ridge Regression Classifier

In [21]:
from sklearn import linear_model
rdg = linear_model.RidgeClassifier()
rdg.fit(X_train, y_train)

rdg_performance_train = BinaryClassificationPerformance(rdg.predict(X_train), y_train, 'rdg_train')
rdg_performance_train.compute_measures()
print(rdg_performance_train.performance_measures)

NameError: name 'X_train' is not defined

### ROC plot to compare performance of various models and fits

In [22]:
fits = [ols_performance_train, svm_performance_train, lgs_performance_train, nbs_performance_train, prc_performance_train, rdg_performance_train]

for fit in fits:
    plt.plot(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
             fit.performance_measures['TP'] / fit.performance_measures['Pos'], 'bo')
    plt.text(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
             fit.performance_measures['TP'] / fit.performance_measures['Pos'], fit.desc)
plt.axis([0, 1, 0, 1])
plt.title('ROC plot: test set')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.show()

NameError: name 'ols_performance_train' is not defined

### looking at reviews based on their classification

Let's say we decide that Ridge Regression is the best model for generalization. Let's take a look at some of the reviews and try to make a (subjective) determination of whether it's generalizing well. 

In [23]:
toxic_data.loc[0, "comment_text"]

KeyError: 'the label [comment_text] is not in the [index]'

In [24]:
ridge_predictions = rdg.predict(X_train)

NameError: name 'X_train' is not defined

In [25]:
ridge_predictions.shape

NameError: name 'ridge_predictions' is not defined

In [26]:
# false positives

print("Examples of false positives:")

import random, time

for i in range(0, len(ridge_predictions)):
    if (ridge_predictions[i] == True):
        if (toxic_data.loc[i, "toxic"] == False):
            if (random.uniform(0, 1) < 0.05):
                print(i)
                print(toxic_data.loc[i, "comment_text"])
                print('* * * * * * * * * ')

Examples of false positives:


NameError: name 'ridge_predictions' is not defined

---

# <span style="color:red">WARNING: Don't look at test set performance too much!</span>

---

The following cells show performance on your test set. Do not look at this too often! 

# Look at performance on the test set

### MODEL: ordinary least squares

In [27]:
ols_performance_test = BinaryClassificationPerformance(ols.predict(X_test), y_test, 'ols_test')
ols_performance_test.compute_measures()
print(ols_performance_test.performance_measures)

NameError: name 'X_test' is not defined

### MODEL: SVM, linear

In [28]:
svm_performance_test = BinaryClassificationPerformance(svm.predict(X_test), y_test, 'svm_test')
svm_performance_test.compute_measures()
print(svm_performance_test.performance_measures)

NameError: name 'X_test' is not defined

### MODEL: logistic regression

In [29]:
lgs_performance_test = BinaryClassificationPerformance(lgs.predict(X_test), y_test, 'lgs_test')
lgs_performance_test.compute_measures()
print(lgs_performance_test.performance_measures)

NameError: name 'X_test' is not defined

### MODEL: Naive Bayes

In [30]:
nbs_performance_test = BinaryClassificationPerformance(nbs.predict(X_test), y_test, 'nbs_test')
nbs_performance_test.compute_measures()
print(nbs_performance_test.performance_measures)

NameError: name 'X_test' is not defined

### MODEL: Perceptron

In [31]:
prc_performance_test = BinaryClassificationPerformance(prc.predict(X_test), y_test, 'prc_test')
prc_performance_test.compute_measures()
print(prc_performance_test.performance_measures)

NameError: name 'X_test' is not defined

### MODEL: Ridge Regression Classifier

In [32]:
rdg_performance_test = BinaryClassificationPerformance(rdg.predict(X_test), y_test, 'rdg_test')
rdg_performance_test.compute_measures()
print(rdg_performance_test.performance_measures)

NameError: name 'X_test' is not defined

### ROC plot to compare performance of various models and fits

In [33]:
fits = [ols_performance_test, svm_performance_test, lgs_performance_test, nbs_performance_test, prc_performance_test, rdg_performance_test]

for fit in fits:
    plt.plot(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
             fit.performance_measures['TP'] / fit.performance_measures['Pos'], 'bo')
    plt.text(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
             fit.performance_measures['TP'] / fit.performance_measures['Pos'], fit.desc)
plt.axis([0, 1, 0, 1])
plt.title('ROC plot: test set')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.show()

NameError: name 'ols_performance_test' is not defined