In [0]:
# NLTK Question Extraction Test
import nltk
import json
import random
from pprint import pprint
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from google.colab import files

# Ask the user for the AWS Transcribe job output. The payload is parsed as
# JSON below, so the prompt names the .json file rather than a .csv.
print("Upload AWS Transcription job output (.json) file.")
uploaded = files.upload()
if not uploaded:
    # Without this guard, next(iter(...)) below would raise a bare
    # StopIteration that says nothing about the cause.
    raise ValueError("No file was uploaded; expected a transcription .json file.")
# Only the first uploaded file is used; any additional files are ignored.
transcription = next(iter(uploaded.values()))
d = json.loads(transcription.decode())
# Keep only the text of the first transcript entry; the rest of the job
# output is discarded.
d = d["results"]["transcripts"][0]["transcript"]

# Fetch the NLTK resources this script relies on: the NPS Chat corpus
# (training data), the Punkt tokenizer models and the POS tagger model.
for resource in ('nps_chat', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Posts from the NPS Chat corpus; each one is turned into a training example
# by transform().
posts = nltk.corpus.nps_chat.xml_posts()

def transform(post):
    """Turn a corpus post into a ``(features, is_question)`` training pair.

    The feature string is produced by ``extract_features`` so that training
    and prediction always use exactly the same serialization.

    Args:
        post: A corpus post exposing its text as ``post.text`` and its
            label through ``post.get('class')``.

    Returns:
        A tuple ``(text, is_question)`` where ``text`` is the serialized
        feature string and ``is_question`` is True when the post's class
        label contains the substring ``'Question'``.
    """
    # A post without a 'class' attribute is treated as a non-question
    # instead of raising TypeError on ``in None``.
    label = post.get('class') or ''
    return extract_features(post.text), 'Question' in label

def extract_features(post):
    """Serialize raw text into the feature string fed to the classifier.

    The text is word-tokenized and POS-tagged, then each token is written as
    ``word_TAG`` and the pieces are joined with single spaces. Finally every
    ``?`` in the joined string is replaced by `` QQ``. The replacement runs
    on the serialized string, so the tag suffix of a ``?`` token stays
    attached to the marker.

    Args:
        post: The raw text to featurize.

    Returns:
        The serialized feature string.
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(post))
    joined = ' '.join(f'{word}_{tag}' for word, tag in tagged)
    return joined.replace('?', ' QQ')

def train_classifier():
    """Train and evaluate the question detector on the module-level ``posts``.

    Every post is converted with ``transform``, the examples are shuffled,
    and the first 10% are held out for evaluation while the remainder is
    used for fitting. Accuracy and a classification report for the held-out
    examples are printed.

    Returns:
        The fitted ``Pipeline`` (hashing vectorizer -> TF-IDF -> SGD
        classifier).
    """
    examples = [transform(entry) for entry in posts]
    random.shuffle(examples)

    # The leading tenth of the shuffled examples is the held-out test set.
    holdout = int(len(examples) * .1)
    test_set = examples[:holdout]
    train_set = examples[holdout:]

    steps = [
        ('vect', HashingVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier()),
    ]
    pipeline = Pipeline(steps)

    train_texts, train_labels = zip(*train_set)
    pipeline.fit(train_texts, train_labels)

    test_texts, test_labels = zip(*test_set)
    predicted = pipeline.predict(test_texts)

    print('accuracy %f' % pipeline.score(test_texts, test_labels))
    print(classification_report(test_labels, predicted))

    return pipeline




# Train the question detector, then classify each sentence of the transcript.
classifier = train_classifier()
# Offline sample for trying the classifier without an uploaded transcript:
# data = nltk.sent_tokenize("Is this how it's done? hi this is Jim how are you? is everything ok? what do you mean? why. what time do we start. im not sure ok? that's what I said, we use AWS and serverless")
data = nltk.sent_tokenize(d)
print("Classifier predictions on data:")

# Materialize the features as a list: a bare map() object is single-use and
# cannot be checked for emptiness.
transformed_data = [extract_features(sentence) for sentence in data]
# An empty transcript produces no sentences; skip prediction in that case
# rather than calling predict() on an empty input.
preds = classifier.predict(transformed_data) if transformed_data else []
print("Input:", data)
# Echo every sentence, prefixing the ones classified as questions.
for sentence, is_question in zip(data, preds):
    if is_question:
        print("QQ:", sentence)
    else:
        print(sentence)




Upload AWS Transcription job output (.csv) file.


Saving transcript2.json to transcript2.json
[nltk_data] Downloading package nps_chat to /root/nltk_data...
[nltk_data]   Package nps_chat is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
accuracy 0.972538
              precision    recall  f1-score   support

       False       0.98      0.99      0.98       939
        True       0.90      0.85      0.87       117

    accuracy                           0.97      1056
   macro avg       0.94      0.92      0.93      1056
weighted avg       0.97      0.97      0.97      1056

Classifier predictions on data:
Input: ['Well, thank you so much for comment.', "Let's start.", 'Tell us where you grew up.', "Tell us about Munich's this amazing city, right?", 'Or is it not?', 