In [None]:
import io
import re

import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 500)

# Read the JSON Lines export as text. The context manager closes the handle
# (a bare open(...).read() leaves that to the garbage collector) and the
# explicit encoding keeps the read independent of the platform default.
# NOTE(review): this absolute path is machine-specific; consider a configurable data directory.
with open('/Users/praneeth/Downloads/admin.jsonl', encoding='utf-8') as jsonl_handle:
    input_file = jsonl_handle.read().strip()

# Parse one JSON record per line. lines=True replaces wrapping the text in
# "[" ... "]", which is only valid JSON when the file holds a single record
# (newline-separated records have no commas between them). Going through
# StringIO avoids handing read_json a literal JSON string, which recent
# pandas releases deprecate.
rawData_df = pd.read_json(io.StringIO(input_file), lines=True)
labels = rawData_df['label']
texts = rawData_df['text']
# print(labels)
# print(texts)

In [None]:
from IPython.display import display, HTML
# Inject a CSS rule that forces the `.container` element to 100% width.
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

from pprint import pprint

def label_text(label_metadata, text):
    """Extract every labelled span of ``text`` described by ``label_metadata``.

    Args:
        label_metadata: Iterable of span entries, each indexed as
            ``entry[0]`` (start offset), ``entry[1]`` (end offset, treated as
            inclusive) and ``entry[2]`` (label). A non-iterable value such as
            ``None`` or a float NaN is reported and treated as "no spans"
            instead of raising.
        text: The string the offsets index into.

    Returns:
        A list of ``{'text': <substring>, 'label': <label>}`` dicts, one per
        entry that could be sliced. Entries that raise ``TypeError`` are
        reported on stdout and skipped.

    Side effects:
        Pretty-prints the resulting list before returning it.
    """
    try:
        spans = list(label_metadata)
    except TypeError:
        # e.g. None / NaN where a list of spans was expected.
        print(f'TypeError detected for: label: {label_metadata} text: {text}')
        spans = []

    all_labels = []
    for span in spans:
        try:
            # NOTE(review): the +1 treats the end offset as inclusive; if the
            # annotation export uses exclusive ends this captures one extra
            # character — confirm against the export format.
            substr = text[span[0]:span[1] + 1]
            label = span[2]
            all_labels.append({'text': substr, 'label': label})
        except TypeError:
            print(f'TypeError detected for: label: {span} text: {text}')

    pprint(all_labels)
    return all_labels
# Pair each record's span annotations with its text and extract the labelled substrings.
data_with_labels = [
    label_text(span_entries, document)
    for span_entries, document in zip(rawData_df['label'], rawData_df['text'])
]
data_with_labels

In [None]:
# Build the span frame (one row per {'text', 'label'} dict) from the first record.
# NOTE(review): only data_with_labels[0] is used; spans from any later records
# are ignored — confirm this is intended.
first_record_spans = data_with_labels[0]
data_with_labels_df = pd.DataFrame.from_dict(first_record_spans)

# Length of each span's text with blank spaces excluded.
data_with_labels_df['length'] = data_with_labels_df['text'].apply(
    lambda span_text: len(span_text.replace(" ", ""))
)

In [None]:
import string

def count_punct(text):
    """Return the percentage of non-space characters in ``text`` that are punctuation.

    Args:
        text: String to inspect.

    Returns:
        The share of ``string.punctuation`` characters among the characters
        that are not the space character ``" "``, expressed as a percentage
        rounded to one decimal place. Returns ``0`` when the text has no
        non-space characters.
    """
    non_space_total = len(text) - text.count(" ")
    if non_space_total == 0:
        # Empty or all-space text: nothing to divide by.
        return 0
    punct_total = sum(1 for char in text if char in string.punctuation)
    # Scale to percent *before* rounding: round(ratio, 3) * 100 reintroduces
    # float noise (e.g. 33.300000000000004 instead of 33.3).
    return round(100 * punct_total / non_space_total, 1)

# Punctuation percentage of every span's text, computed by count_punct.
data_with_labels_df['punctuation%'] = data_with_labels_df['text'].apply(count_punct)
# data_with_labels_df

In [None]:
from matplotlib import pyplot
import numpy as np
%matplotlib inline

In [None]:
# Scatter of span length against label; the title is set on the axes the plot call returns.
length_ax = data_with_labels_df.plot.scatter(x='label', y='length')
length_ax.set_title('Length of the text per label distribution')
# Rotate the x tick labels by 90 degrees.
pyplot.xticks(rotation=90)
pyplot.show()

In [None]:
# Scatter of punctuation percentage against label; the title is set on the axes the plot call returns.
punct_ax = data_with_labels_df.plot.scatter(x='label', y='punctuation%')
punct_ax.set_title('Punctuation in the text per label distribution')
# Rotate the x tick labels by 90 degrees.
pyplot.xticks(rotation=90)
pyplot.show()

In [None]:
import nltk
import re

# Download the NLTK resources used below: the stop-word lists and WordNet.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# English stop-word list and the lemmatizer that clean_text relies on.
stopwords = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()
def clean_text(text):
    """Turn raw text into a list of lemmatized, stop-word-free tokens.

    Steps:
      1. remove every ``string.punctuation`` character
      2. tokenize on runs of non-word characters
      3. drop empty tokens and stop words, then lemmatize what is left

    This is passed to the vectorizers as their ``analyzer``.

    Args:
        text: The string to clean.

    Returns:
        A list of lemmatized tokens.
    """
    without_punct = "".join(char for char in text if char not in string.punctuation)
    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    tokens = re.split(r'\W+', without_punct)
    # re.split yields '' at the edges when the text starts or ends with a
    # separator (or is empty); skip those so '' never becomes a feature.
    # NOTE(review): the stop-word test is case-sensitive and the text is not
    # lowercased here — confirm whether callers should lowercase first.
    return [wn.lemmatize(token) for token in tokens if token and token not in stopwords]

# data_with_labels_df['text_clean'] = data_with_labels_df['text'].apply(lambda x: clean_text(x.lower()))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Build two document-term representations of the span text (TF-IDF and raw
# counts) so the two weightings can be compared. Each is placed next to the
# hand-crafted 'length' and 'punctuation%' columns.
span_text = data_with_labels_df['text']
handcrafted = data_with_labels_df[['length', 'punctuation%']]

# TF-IDF
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(span_text)
X_tfidf_feat = pd.concat([handcrafted, pd.DataFrame(X_tfidf.toarray())], axis=1)
# The term columns are integer positions; cast every column name to str.
X_tfidf_feat.columns = X_tfidf_feat.columns.astype(str)

# CountVectorizer
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(span_text)
X_count_feat = pd.concat([handcrafted, pd.DataFrame(X_count.toarray())], axis=1)
X_count_feat.columns = X_count_feat.columns.astype(str)

data_with_labels_df.columns = data_with_labels_df.columns.astype(str)

In [None]:
# RandomForestClassifier with TF_IDF vectorized data
# A fixed random_state seeds the forest so the grid-search ranking is
# repeatable across kernel restarts.
rf = RandomForestClassifier(random_state=42)
param = {
    'n_estimators': [10, 150, 300],
    'max_depth': [30, 60, 90, None],
}

# 5-fold cross-validated search over every parameter combination (n_jobs=-1).
gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_tfidf_feat, data_with_labels_df['label'])

# Five best combinations by mean cross-validated test score.
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head(5)

In [None]:
# RandomForestClassifier with CountVectorizer vectorized data
# A fixed random_state seeds the forest so the grid-search ranking is
# repeatable across kernel restarts.
rf = RandomForestClassifier(random_state=42)
param = {
    'n_estimators': [10, 150, 300],
    'max_depth': [30, 60, 90, None],
}

# 5-fold cross-validated search over every parameter combination (n_jobs=-1).
gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_count_feat, data_with_labels_df['label'])

# Five best combinations by mean cross-validated test score.
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head(5)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the spans for testing. random_state pins the shuffle so the
# split (and every metric computed from it) is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    data_with_labels_df[['text', 'length', 'punctuation%']],
    data_with_labels_df['label'],
    test_size=0.2,
    random_state=42,
)

# Fit the vectorizer on the training text only, then apply it to both splits.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train['text'])

tfidf_train = tfidf_vect_fit.transform(X_train['text'])
tfidf_test = tfidf_vect_fit.transform(X_test['text'])

# reset_index(drop=True) gives the hand-crafted columns a fresh 0..n-1 index so
# they align row-for-row with the newly built TF-IDF frames in the concat.
train_meta = X_train[['length', 'punctuation%']].reset_index(drop=True)
X_train_vect = pd.concat([train_meta, pd.DataFrame(tfidf_train.toarray())], axis=1)
X_train_vect.columns = X_train_vect.columns.astype(str)

test_meta = X_test[['length', 'punctuation%']].reset_index(drop=True)
X_test_vect = pd.concat([test_meta, pd.DataFrame(tfidf_test.toarray())], axis=1)
X_test_vect.columns = X_test_vect.columns.astype(str)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score

# Random forest on the TF-IDF + hand-crafted features. random_state seeds the
# forest so the reported metrics are repeatable.
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1, random_state=42)

rf_model = rf.fit(X_train_vect, y_train)
y_pred = rf_model.predict(X_test_vect)

# Weighted-average precision/recall/F-score over every label seen in the full frame.
# NOTE(review): with average= set, scikit-learn documents the support value as
# None — the train_support name is kept only so the unpacking stays unchanged.
precision, recall, fscore, train_support = score(
    y_test, y_pred, labels=data_with_labels_df['label'].unique(), average='weighted')

# Accuracy is the fraction of test rows whose prediction matches the true label.
accuracy = (y_pred == y_test).sum() / len(y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(accuracy, 3)))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting on the same features. random_state seeds the model so the
# reported metrics are repeatable.
gb = GradientBoostingClassifier(n_estimators=150, max_depth=11, random_state=42)

gb_model = gb.fit(X_train_vect, y_train)
y_pred = gb_model.predict(X_test_vect)

# Weighted-average precision/recall/F-score over every label seen in the full frame.
# NOTE(review): with average= set, scikit-learn documents the support value as
# None — the train_support name is kept only so the unpacking stays unchanged.
precision, recall, fscore, train_support = score(
    y_test, y_pred, labels=data_with_labels_df['label'].unique(), average='weighted')

# Accuracy is the fraction of test rows whose prediction matches the true label.
accuracy = (y_pred == y_test).sum() / len(y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(accuracy, 3)))