# YouTube Comments NLP Notebook

In [4]:
# basics
import pandas as pd
import os
import csv
import numpy as np

## sklearn
import sklearn # machine learning
from sklearn.feature_extraction.text import CountVectorizer # frequency counts matrix
from sklearn.model_selection import train_test_split # splitting up data
from sklearn import metrics # for accuracy/ precision
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier # Support Vector Machine Classifier
# multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification) 

## Read in & Clean Data
Note that we have to use encoding = "latin-1" instead of UTF-8 because we have foreign languages present
The different encodings treat characters differently (in latin-1 each character is only one byte long whereas in utf-8 it can me more than one byte in length). Typically utf-8 captures more types of characters, so it was surprising that we had to use latin-8. 
* for later: look into this more: http://www.unicode.org/reports/tr10/

In [5]:
#os.chdir('/Users/andiedonovan/myProjects/Youtube/') # change directory
df = pd.read_csv('labeledCom.csv', delimiter=";", skiprows=2, encoding='latin-1', engine='python') # read in the data

In [6]:
# rename the columns
df.columns = [
  'label',
  'comment', 
  'column3'
]
df = df.drop('column3', axis = 1).dropna() # drop column 3 and missing values
print(df.head(5))

   label                                            comment
0   -1.0  Everyone knows brand's papers from.\nBut -No o...
1    0.0       ñYour paper cut balance is: \n-£25279102771î
2    1.0  OH SHIT WHEN I SAW THIS ON MY FRONT PAGE.........
3    1.0                          Blowing my mind yet again
4    0.0               Should have gone with Dunder Mifflin


## Split into Training and Test Data
Using a pre-defined train-test-split function, we randomly split the data into training data (75%) and test data (25%). We set the x variable for both to the comments, since these are the attributes we will use for classificationand the y variable to the label, as this is what we are trying to predict. The random_state paramteter is simply for reproducability (otherwise the function would produce a different split every time we ran it). 

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(
                                        df["comment"], df["label"], 
                                        test_size=0.25, 
                                       random_state=42)

Let's make sure all of the data looks good:

In [8]:
print('lengths training variables: ', len(X_train),",", len(Y_train))
print('lengths testing variables: ', len(X_test),",", len(Y_test), '\n')

print('Are there any missing values?', 
      '\n * Training:', pd.isnull(X_train).values.any(), ',', pd.isnull(Y_train).values.any(), 
      '\n * Testing: ', pd.isnull(X_test).values.any(), ",", pd.isnull(Y_test).values.any())

lengths training variables:  820 , 820
lengths testing variables:  274 , 274 

Are there any missing values? 
 * Training: False , False 
 * Testing:  False , False


In [9]:
type(X_test) # we have a pandas core Series; we just want the comments in an array without numbering
# help(X_test) # use values attribute
# we will want to use X_test.values(), Y_train.values(), .... to just access the data in list format

pandas.core.series.Series

## Building a Model
Documentation: [Scikit-Learn Documentation]('http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#exercise-2-sentiment-analysis-on-movie-reviews')

We want to initialize a Count Vectorizer, which will convert the comments to a matrix of token (word) counts. This produces a sparse representation of the counts
We then fit the model using our training data

In [10]:
cv = CountVectorizer() 
x_train_counts = cv.fit_transform(X_train.values) # fit_transform to counts

In [11]:
type(x_train_counts) # scipy.sparse.csr.csr_matrix 

scipy.sparse.csr.csr_matrix

In [118]:
# tf_transformer = TfidfTransformer(use_idf=False).fit(x_train_counts) # term frequency inverse document frequency matrix
# I think this is a different way of transforming our data to counts, instead of just using 'transform'

### Transform test values as well:

In [12]:
x_test_counts = cv.transform(X_test.values) # transform test data as well (but we don't need to train it since its test data!)

### Initializing the Classifier: 

In [15]:
mnb = MultinomialNB()
mnb.fit(x_train_counts, Y_train) # fit the model on the training data word counts and training data lables

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

###  Making the predictions:

In [16]:
predicted_labels = mnb.predict(x_test_counts) # make our y predictions (labels) on the comment test data

In [17]:
for i in predicted_labels[:10]:
    print (i)

-1.0
0.0
0.0
1.0
1.0
1.0
0.0
-1.0
1.0
-1.0


In [112]:
# pd.DataFrame(predicted_labels).groupby(['-1.0','0.0','1.0']).size()

### How well did we do?? (AKA accuracy metrics)

In [18]:
acc = metrics.accuracy_score(Y_test, predicted_labels)

In [19]:
print('We obtained ', round(acc, 4), '% accuracy for the model!')

We obtained  0.5949 % accuracy for the model!


In [128]:
# np.mean(predicted_labels == Y_test)  # same scoring method

In [20]:
print('Here is the Classification Report: \n')
print(metrics.classification_report(Y_test, predicted_labels))

Here is the Classification Report: 

             precision    recall  f1-score   support

       -1.0       0.59      0.46      0.52        83
        0.0       0.57      0.47      0.51        91
        1.0       0.61      0.82      0.70       100

avg / total       0.59      0.59      0.58       274



In [21]:
print('Here is the Confusion Matrix: \n')
metrics.confusion_matrix(Y_test, predicted_labels)

Here is the Confusion Matrix: 



array([[38, 19, 26],
       [22, 43, 26],
       [ 4, 14, 82]])

## Up Next: SVM Classifier, TF-IDF transformations, etc.

In [45]:
from sklearn.pipeline import Pipeline
from sklearn import svm

### SVM

In [53]:
svm_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', svm.SVC())
                   ])
svm_clf.fit(X_train, Y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [54]:
svm_predicted = svm_clf.predict(X_test)
np.mean(svm_predicted == Y_test)

0.36496350364963503

### SVM with Stochastic Gradient Descent

In [46]:
SGD_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                          alpha=1e-3, random_state=42,
                                          max_iter=5, tol=None))])

SGD_clf.fit(X_train, Y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False))])

In [47]:
SGD_predicted = SGD_clf.predict(X_test)
np.mean(SGD_predicted == Y_test)

0.62408759124087587

### Neural Network - Multi-layer Perceptron classifier

In [55]:
NN_clf = Pipeline([('vect', CountVectorizer()),
                            ('tfidf', TfidfTransformer()),
                            ('chi2', SelectKBest(chi2, k = 'all')),
                            ('clf', MLPClassifier(
                                    hidden_layer_sizes=(100,), 
                                    max_iter=10, 
                                    alpha=1e-4,
                                    solver='sgd', 
                                    verbose=10, 
                                    tol=1e-4, 
                                    random_state=1,
                                    learning_rate_init=.1)),
                            ])

NN_clf.fit(X_train, Y_train)

NameError: name 'SelectKBest' is not defined