In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

# Working with Text Data

<img src="figures/bag_of_words.svg" width=100%>

In [None]:
import pandas as pd
import os

# Load the training set from data/train.csv (path is relative to the working directory).
data = pd.read_csv(os.path.join("data", "train.csv"))

In [None]:
# Number of rows in the training set.
len(data)

In [None]:
# Display the loaded DataFrame to see which columns are available.
data

In [None]:
# Target labels: the "Insult" column converted to a NumPy array.
y_train = np.array(data.Insult)

In [None]:
# Inspect the label array.
y_train

In [None]:
# Raw input texts: the "Comment" column as a plain Python list, one entry per row.
text_train = data.Comment.tolist()

In [None]:
# Look at one example comment (index 6).
text_train[6]

In [None]:
# Load the test set from data/test_with_solutions.csv (path is relative to the working directory).
data_test = pd.read_csv(os.path.join("data", "test_with_solutions.csv"))

In [None]:
# Split the test frame the same way as the training frame:
# comments as a Python list, labels as a NumPy array.
text_test = data_test.Comment.tolist()
y_test = np.array(data_test.Insult)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words encoder with default settings.
cv = CountVectorizer()
# Learn the vocabulary from the training comments only.
cv.fit(text_train)

In [None]:
# Vocabulary size: number of distinct terms learned during fit.
len(cv.vocabulary_)

In [None]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); prefer the new method and fall back on old releases.
get_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
# Build the vocabulary list once instead of once per print. list() keeps the
# printed form the same whether the method returns a list or an ndarray.
feature_names = list(get_names())
# First and last 50 terms of the learned vocabulary.
print(feature_names[:50])
print(feature_names[-50:])

In [None]:
# Encode the training comments as term counts over the learned vocabulary.
X_train = cv.transform(text_train)

In [None]:
# Inspect the encoded matrix (CountVectorizer.transform returns a SciPy sparse matrix).
X_train

In [None]:
# The example comment at index 6 again, for comparison with its encoded row below.
text_train[6]

In [None]:
# Column indices of the non-zero entries in row 6, i.e. the vocabulary
# indices of the terms that occur in that comment.
X_train[6, :].nonzero()[1]

In [None]:
# Encode the test comments with the vocabulary fitted on the training comments.
X_test = cv.transform(text_test)

In [None]:
from sklearn.svm import LinearSVC
# Linear support vector classifier with default settings.
svm = LinearSVC()

In [None]:
# Train the classifier on the encoded training comments and their labels.
svm.fit(X_train, y_train)

In [None]:
# Mean accuracy on the data the model was trained on.
svm.score(X_train, y_train)

In [None]:
# Mean accuracy on the test set, which was not used for fitting.
svm.score(X_test, y_test)

In [None]:
def visualize_coefficients(classifier, feature_names, n_top_features=25):
    """Bar-plot the most negative and most positive coefficients of a model.

    Parameters
    ----------
    classifier : object
        Fitted estimator exposing a ``coef_`` array; it is flattened with
        ``ravel()`` before ranking.
    feature_names : sequence
        Label of each feature, indexed like the flattened ``coef_``.
    n_top_features : int, default=25
        How many coefficients to take from each end of the sorted order.

    Returns
    -------
    None
        The plot is drawn on a newly created pyplot figure.
    """
    coef = classifier.coef_.ravel()
    # Sort once and take both tails: smallest values first, largest values last.
    order = np.argsort(coef)
    negative_coefficients = order[:n_top_features]
    positive_coefficients = order[-n_top_features:]
    interesting_coefficients = np.hstack([negative_coefficients, positive_coefficients])
    # Derive the bar count from the selection instead of hard-coding 50, so the
    # plot also works when n_top_features != 25 or the model has few features.
    n_bars = len(interesting_coefficients)
    selected = coef[interesting_coefficients]

    # plot them
    plt.figure(figsize=(15, 5))
    colors = ["red" if c < 0 else "blue" for c in selected]
    plt.bar(np.arange(n_bars), selected, color=colors)
    feature_names = np.array(feature_names)
    # Tick positions keep the original layout: shifted by one relative to the
    # bar positions, with right-aligned, rotated labels.
    plt.xticks(np.arange(1, 1 + n_bars), feature_names[interesting_coefficients],
               rotation=60, ha="right")


In [None]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); prefer the new method and fall back on old releases.
get_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
# Plot the terms with the largest negative and positive weights in the fitted SVM.
visualize_coefficients(svm, get_names())

# Exercises
* Create a pipeline using the count vectorizer and SVM (see 07). Train and score using the pipeline.
* Vary the `ngram_range` parameter of the count vectorizer and visualize the changed coefficients.
* Grid search the C in the LinearSVC using the pipeline.
* Grid search the C in the LinearSVC together with the `ngram_range` of the count vectorizer (try (1, 1), (1, 2), (2, 2)).

In [None]:
# %load solutions/text_pipeline.py
