<p style='text-align: center;'><span style="color: #000508; font-family: Segoe UI; font-size: 2.6em; font-weight: 300;">Coleridge Initiative - Show US the Data</span></p>
<p style='text-align: center;'><span style="color: #000508; font-family: Segoe UI; font-size: 2.6em; font-weight: 300;">Let's See the Data🔥</span></p>

![](https://cusp.nyu.edu/wp-content/uploads/2018/09/CI_horizontal.png)

<span style="color: #0087e4; font-family: Segoe UI; font-size: 2.3em; font-weight: 300;">Import Packages</span>

In [None]:
import os
import re
import json
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import plotly.graph_objects as go
import plotly.express as px

from wordcloud import WordCloud
from collections import Counter
from functools import partial

from sklearn.feature_extraction.text import CountVectorizer

from nltk.corpus import stopwords
stoplist = stopwords.words('english')

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [None]:
# Directories holding the per-publication JSON files; `read_append_return` opens
# `<directory>/<Id>.json` from whichever of these it is given.
TRAIN_DIR = "../input/coleridgeinitiative-show-us-the-data/train"
TEST_DIR = "../input/coleridgeinitiative-show-us-the-data/test"

<span style="color: #0087e4; font-family: Segoe UI; font-size: 2.3em; font-weight: 300;">Load the Dataframe</span>

In [None]:
# Derive the CSV locations from TRAIN_DIR so the competition root directory is
# spelled out in exactly one place (the constants cell).
DATA_ROOT = os.path.dirname(TRAIN_DIR)
train = pd.read_csv(os.path.join(DATA_ROOT, "train.csv"))
sample_sub = pd.read_csv(os.path.join(DATA_ROOT, "sample_submission.csv"))
train.head()

<span style="color: #0087e4; font-family: Segoe UI; font-size: 2.3em; font-weight: 300;">Basic Exploration</span>

In [None]:
# Column dtypes and non-null counts for the training frame.
train.info()

In [None]:
# Distinct values in the `pub_title` column.
n_pub_titles = train['pub_title'].nunique()
print(f"Number of Unique Publication Titles is {n_pub_titles}")

In [None]:
# Distinct values in the `dataset_title` column.
n_dataset_titles = train['dataset_title'].nunique()
print(f"Number of Unique Dataset Titles is {n_dataset_titles}")

In [None]:
# Distinct values in the `dataset_label` column.
n_dataset_labels = train['dataset_label'].nunique()
print(f"Number of Unique Dataset Labels is {n_dataset_labels}")

In [None]:
# Distinct values in the `cleaned_label` column.
n_cleaned_labels = train['cleaned_label'].nunique()
print(f"Number of Unique Cleaned Dataset Labels is {n_cleaned_labels}")

In [None]:
# Whitespace-delimited token count for each text column, stored under `<name>_words`.
# The vectorised `.str` accessor replaces four copy-pasted lambdas and yields NaN for a
# missing value instead of raising AttributeError on a non-string cell.
WORD_COUNT_COLUMNS = {
    'pub_title': 'publication_title_words',
    'dataset_title': 'dataset_title_words',
    'dataset_label': 'dataset_label_words',
    'cleaned_label': 'cleaned_label_words',
}
for source_col, count_col in WORD_COUNT_COLUMNS.items():
    # `.str.split()` with no arguments splits on runs of whitespace, like `str.split()`.
    train[count_col] = train[source_col].str.split().str.len()

<span style="color: #0087e4; font-family: Segoe UI; font-size: 2.3em; font-weight: 300;">Let's Check the Distribution of words</span>

In [None]:
# Histogram of words per publication title, with a box plot in the margin.
fig = px.histogram(
    train,
    x="publication_title_words",
    marginal="box",  # alternatives: "violin" or "rug"
)
fig.update_layout(
    go.Layout(
        template="plotly_dark",
        title='Length of Publication Titles',
        xaxis={'title': 'Length'},
        yaxis={'title': 'Count'},
    )
)
fig.show()

In [None]:
# Histogram of words per dataset title, with a box plot in the margin.
fig = px.histogram(
    train,
    x="dataset_title_words",
    marginal="box",  # alternatives: "violin" or "rug"
)
fig.update_layout(
    go.Layout(
        template="plotly_dark",
        title='Length of Dataset Titles',
        xaxis={'title': 'Length'},
        yaxis={'title': 'Count'},
    )
)
fig.show()

In [None]:
# Histogram of words per dataset label, with a box plot in the margin.
fig = px.histogram(
    train,
    x="dataset_label_words",
    marginal="box",  # alternatives: "violin" or "rug"
)
fig.update_layout(
    go.Layout(
        template="plotly_dark",
        title='Length of Dataset Labels',
        xaxis={'title': 'Length'},
        yaxis={'title': 'Count'},
    )
)
fig.show()

<span style="color: #0087e4; font-family: Segoe UI; font-size: 2.3em; font-weight: 300;">Now let's see the most frequent words</span>

<span style="color: #000508; font-family: Segoe UI; font-size: 2.0em; font-weight: 300;">Publication Title</span>

In [None]:
# Flatten every publication title into one list of whitespace-separated tokens
# (duplicates kept, so the list can be counted afterwards).
publication_title_list = [
    token
    for title in train['pub_title'].tolist()
    for token in title.split()
]

In [None]:
# Token -> number of occurrences across all publication titles (case-sensitive, no filtering).
publication_title_word_freq = Counter(publication_title_list)

In [None]:
# (token, count) pairs ordered from most to least frequent; display the top five.
sorted_word_freq = publication_title_word_freq.most_common()
sorted_word_freq[:5]

<span style="color: #000508; font-family: Segoe UI; font-size: 2.0em; font-weight: 100;">This clearly indicates that stopwords dominate the most frequent words</span>

<span style="color: #000508; font-family: Segoe UI; font-size: 2.0em; font-weight: 300;">Dataset Title</span>

In [None]:
# Flatten every dataset title into one list of whitespace-separated tokens
# (duplicates kept, so the list can be counted afterwards).
dataset_title_list = [
    token
    for title in train['dataset_title'].tolist()
    for token in title.split()
]

In [None]:
# Token -> number of occurrences across all dataset titles (case-sensitive, no filtering).
dataset_title_word_freq = Counter(dataset_title_list)

In [None]:
# (token, count) pairs ordered from most to least frequent; display the top five.
sorted_word_freq = dataset_title_word_freq.most_common()
sorted_word_freq[:5]

<span style="color: #000508; font-family: Segoe UI; font-size: 2.0em; font-weight: 300;">Dataset Label</span>

In [None]:
# Flatten every dataset label into one list of whitespace-separated tokens
# (duplicates kept, so the list can be counted afterwards).
dataset_label_list = [
    token
    for label in train['dataset_label'].tolist()
    for token in label.split()
]

In [None]:
# Token -> number of occurrences across all dataset labels (case-sensitive, no filtering).
dataset_label_word_freq = Counter(dataset_label_list)

In [None]:
# (token, count) pairs ordered from most to least frequent; display the top five.
sorted_word_freq = dataset_label_word_freq.most_common()
sorted_word_freq[:5]

<span style="color: #0087e4; font-family: Segoe UI; font-size: 2.3em; font-weight: 300;">Let's Visualize using WordCloud</span>

<span style="color: #000508; font-family: Segoe UI; font-size: 2.0em; font-weight: 300;">Helper Function</span>

In [None]:
def plot_cloud(wordcloud):
    """
    Draw a word cloud on a new, large matplotlib figure with the axes hidden.

    wordcloud - image-like object accepted by ``plt.imshow`` (the notebook passes
                the result of ``WordCloud.generate_from_frequencies``)
    """
    canvas_size = (40, 30)  # width, height as given to ``figsize``
    plt.figure(figsize=canvas_size)
    plt.imshow(wordcloud)
    # Hide ticks and frame; only the rendered words should be visible.
    plt.axis("off")

In [None]:
# The decorative font lives in an unrelated Kaggle dataset. When that dataset is not
# attached, pass None so WordCloud uses its default font instead of failing when the
# cloud is generated.
CLOUD_FONT_PATH = '../input/all-elon-musks-tweets/acetone_font.otf'

# One shared configuration, reused for every cloud below. `random_state` is fixed so the
# layout is reproducible.
wordcloud = WordCloud(
    width=1000,
    height=500,
    random_state=1,
    colormap='twilight',
    font_path=CLOUD_FONT_PATH if os.path.exists(CLOUD_FONT_PATH) else None,
    collocations=False,
)

<span style="color: #000508; font-family: Segoe UI; font-size: 2.0em; font-weight: 300;">Publication Title</span>

In [None]:
# Render the publication-title token counts with the shared `wordcloud` configuration.
plot_cloud(wordcloud.generate_from_frequencies(publication_title_word_freq))

<span style="color: #000508; font-family: Segoe UI; font-size: 2.0em; font-weight: 300;">Dataset Title</span>

In [None]:
# Render the dataset-title token counts with the shared `wordcloud` configuration.
plot_cloud(wordcloud.generate_from_frequencies(dataset_title_word_freq))

<span style="color: #000508; font-family: Segoe UI; font-size: 2.0em; font-weight: 300;">Dataset Label</span>

In [None]:
# Render the dataset-label token counts with the shared `wordcloud` configuration.
plot_cloud(wordcloud.generate_from_frequencies(dataset_label_word_freq))

<span style="color: #0087e4; font-family: Segoe UI; font-size: 2.3em; font-weight: 300;">Most Common Bigrams/Trigrams</span>

<span style="color: #000508; font-family: Segoe UI; font-size: 2.0em; font-weight: 300;">Publication Title</span>

In [None]:
# Count bigrams and trigrams in publication titles, ignoring English stop words.
c_vec = CountVectorizer(stop_words=stoplist, ngram_range=(2, 3))
# Sparse document-by-ngram count matrix.
ngrams = c_vec.fit_transform(train['pub_title'])
# Sum each column while the matrix is still sparse; densifying first would allocate a
# full (documents x ngrams) array only to add up its columns.
count_values = np.asarray(ngrams.sum(axis=0)).ravel()
# Mapping of ngram -> column index in `ngrams`.
vocab = c_vec.vocabulary_
# Rank by (frequency, ngram) descending. Naming the columns at construction keeps the
# frame usable even when the vocabulary is empty.
df_ngram = pd.DataFrame(
    sorted(((count_values[i], k) for k, i in vocab.items()), reverse=True),
    columns=['frequency', 'bigram/trigram'],
)

top10 = df_ngram.head(10)
top10_ngrams_freq = top10['frequency'].tolist()
top10_ngrams = top10['bigram/trigram'].tolist()

fig = go.Figure(data=[go.Table(
    header=dict(values=['Bigram/Trigram', 'Count'], fill_color='yellow', line_color='darkslategray'),
    cells=dict(values=[top10_ngrams, top10_ngrams_freq], fill_color='lavender', line_color='darkslategray'),
)])
fig.show()

In [None]:
# Horizontal bar chart of the ten most frequent n-grams in publication titles.
# Both lists are reversed before plotting, i.e. passed smallest count first.
colors = ['rgb(160, 50, 168)'] * len(top10_ngrams_freq)

trace = go.Bar(
    x=list(reversed(top10_ngrams_freq)),
    y=list(reversed(top10_ngrams)),
    marker={
        'color': colors,
        'line': {'color': 'rgb(0,0,0)', 'width': 1.5},
    },
    text=list(reversed(top10_ngrams_freq)),
    textposition='outside',
    orientation='h',
)
layout = go.Layout(
    template="plotly_dark",
    title='TOP 10 BIGRAMS / TRIGRAMS IN PUBLICATION TITLE',
    xaxis={'title': 'Count', 'automargin': True},
    yaxis={'title': 'Bigram/Trigram'},
)
fig = go.Figure(data=[trace], layout=layout)
fig.show()

<span style="color: #000508; font-family: Segoe UI; font-size: 2.0em; font-weight: 300;">Dataset Title</span>

In [None]:
# Count bigrams and trigrams in dataset titles, ignoring English stop words.
c_vec = CountVectorizer(stop_words=stoplist, ngram_range=(2, 3))
# Sparse document-by-ngram count matrix.
ngrams = c_vec.fit_transform(train['dataset_title'])
# Sum each column while the matrix is still sparse; densifying first would allocate a
# full (documents x ngrams) array only to add up its columns.
count_values = np.asarray(ngrams.sum(axis=0)).ravel()
# Mapping of ngram -> column index in `ngrams`.
vocab = c_vec.vocabulary_
# Rank by (frequency, ngram) descending. Naming the columns at construction keeps the
# frame usable even when the vocabulary is empty.
df_ngram = pd.DataFrame(
    sorted(((count_values[i], k) for k, i in vocab.items()), reverse=True),
    columns=['frequency', 'bigram/trigram'],
)

top10 = df_ngram.head(10)
top10_ngrams_freq = top10['frequency'].tolist()
top10_ngrams = top10['bigram/trigram'].tolist()

fig = go.Figure(data=[go.Table(
    header=dict(values=['Bigram/Trigram', 'Count'], fill_color='yellow', line_color='darkslategray'),
    cells=dict(values=[top10_ngrams, top10_ngrams_freq], fill_color='lavender', line_color='darkslategray'),
)])
fig.show()

In [None]:
# Horizontal bar chart of the ten most frequent n-grams in dataset titles.
# Both lists are reversed before plotting, i.e. passed smallest count first.
colors = ['rgb(160, 50, 168)'] * len(top10_ngrams_freq)

trace = go.Bar(
    x=list(reversed(top10_ngrams_freq)),
    y=list(reversed(top10_ngrams)),
    marker={
        'color': colors,
        'line': {'color': 'rgb(0,0,0)', 'width': 1.5},
    },
    text=list(reversed(top10_ngrams_freq)),
    textposition='outside',
    orientation='h',
)
layout = go.Layout(
    template="plotly_dark",
    title='TOP 10 BIGRAMS / TRIGRAMS IN DATASET TITLE',
    xaxis={'title': 'Count', 'automargin': True},
    yaxis={'title': 'Bigram/Trigram'},
)
fig = go.Figure(data=[trace], layout=layout)
fig.show()

<span style="color: #000508; font-family: Segoe UI; font-size: 2.0em; font-weight: 300;">Dataset Label</span>

In [None]:
# Count bigrams and trigrams in dataset labels, ignoring English stop words.
c_vec = CountVectorizer(stop_words=stoplist, ngram_range=(2, 3))
# Sparse document-by-ngram count matrix.
ngrams = c_vec.fit_transform(train['dataset_label'])
# Sum each column while the matrix is still sparse; densifying first would allocate a
# full (documents x ngrams) array only to add up its columns.
count_values = np.asarray(ngrams.sum(axis=0)).ravel()
# Mapping of ngram -> column index in `ngrams`.
vocab = c_vec.vocabulary_
# Rank by (frequency, ngram) descending. Naming the columns at construction keeps the
# frame usable even when the vocabulary is empty.
df_ngram = pd.DataFrame(
    sorted(((count_values[i], k) for k, i in vocab.items()), reverse=True),
    columns=['frequency', 'bigram/trigram'],
)

top10 = df_ngram.head(10)
top10_ngrams_freq = top10['frequency'].tolist()
top10_ngrams = top10['bigram/trigram'].tolist()

fig = go.Figure(data=[go.Table(
    header=dict(values=['Bigram/Trigram', 'Count'], fill_color='yellow', line_color='darkslategray'),
    cells=dict(values=[top10_ngrams, top10_ngrams_freq], fill_color='lavender', line_color='darkslategray'),
)])
fig.show()

In [None]:
# Horizontal bar chart of the ten most frequent n-grams in dataset labels.
# Both lists are reversed before plotting, i.e. passed smallest count first.
colors = ['rgb(160, 50, 168)'] * len(top10_ngrams_freq)

trace = go.Bar(
    x=list(reversed(top10_ngrams_freq)),
    y=list(reversed(top10_ngrams)),
    marker={
        'color': colors,
        'line': {'color': 'rgb(0,0,0)', 'width': 1.5},
    },
    text=list(reversed(top10_ngrams_freq)),
    textposition='outside',
    orientation='h',
)
layout = go.Layout(
    template="plotly_dark",
    title='TOP 10 BIGRAMS / TRIGRAMS IN DATASET LABEL',
    xaxis={'title': 'Count', 'automargin': True},
    yaxis={'title': 'Bigram/Trigram'},
)
fig = go.Figure(data=[trace], layout=layout)
fig.show()

<span style="color: #0087e4; font-family: Segoe UI; font-size: 2.3em; font-weight: 300;">Baseline Submission</span>

Code taken from [https://www.kaggle.com/prashansdixit/coleridge-initiative-eda-baseline-model](https://www.kaggle.com/prashansdixit/coleridge-initiative-eda-baseline-model)

<span style="color: #000508; font-family: Segoe UI; font-size: 2.0em; font-weight: 300;">Helper Functions</span>

In [None]:
def clean_text(txt):
    '''
    Lower-cases the input and collapses every run of non-alphanumeric characters into one space
    txt - Value to normalise; it is passed through str() first
    Leading and trailing spaces are NOT stripped
    '''
    lowered = str(txt).lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered)

In [None]:
def text_cleaning(text):
    '''
    Converts all text to lower case, removes special characters, emojis and multiple spaces
    text - Sentence that needs to be cleaned; non-string values are passed through str() first

    Returns the cleaned string with leading and trailing spaces stripped.
    ASCII punctuation is deleted outright (so "well-known" becomes "wellknown"), whereas any
    other non-alphanumeric run is replaced by a single space.
    '''
    # Coerce before touching the characters, so numbers or None do not raise TypeError.
    text = str(text)
    # Delete ASCII punctuation without inserting a space.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Collapse whatever non-alphanumeric runs remain (whitespace, accents, emojis) into one space.
    text = re.sub('[^A-Za-z0-9]+', ' ', text.lower()).strip()

    return text

In [None]:
def read_append_return(filename, train_files_path=TRAIN_DIR, output='text'):
    """
    Read one publication's JSON file and return its text as a single string.

    filename         - file name without the '.json' extension
    train_files_path - directory that contains the file (defaults to TRAIN_DIR)
    output           - 'text': section bodies joined with a space
                       'head': section titles joined with a space
                       anything else: titles and bodies interleaved, joined with '. '

    The file is expected to hold a list of objects with 'section_title' and 'text'
    keys. A missing or null value is treated as an empty string so that the join
    cannot fail.

    Raises OSError if the file cannot be opened and json.JSONDecodeError if it is
    not valid JSON.
    """
    json_path = os.path.join(train_files_path, filename + '.json')
    # Be explicit about the encoding rather than relying on the platform default.
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    # `or ''` covers both an absent key (.get returns None) and an explicit null.
    headings = [section.get('section_title') or '' for section in sections]
    contents = [section.get('text') or '' for section in sections]

    # Build only the string that was asked for.
    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)
    # Title, body, title, body, ... in document order.
    combined = [part for pair in zip(headings, contents) for part in pair]
    return '. '.join(combined)

In [None]:
# Attach each publication's body text (the default output='text') to both frames.
# Training rows are read from TRAIN_DIR, the function's default directory; the
# sample-submission rows are read from TEST_DIR.
train['text'] = train['Id'].apply(read_append_return)

read_test_document = partial(read_append_return, train_files_path=TEST_DIR)
sample_sub['text'] = sample_sub['Id'].apply(read_test_document)

In [None]:
# Every spelling under which a dataset is named in the training set, lower-cased.
existing_labels = {
    name.lower()
    for column in ('dataset_label', 'dataset_title', 'cleaned_label')
    for name in train[column].unique()
}

# Normalise the training texts once, so they are compared like-for-like with the
# normalised test text below. Comparing raw training text against cleaned test text
# would practically never match. This pass over every training document is the
# expensive step of the cell.
train_text_cleaned = train['text'].map(text_cleaning)

id_list = []
lables_list = []  # (sic) the misspelled name is kept because the submission cell reads it
for row_id, sample_text in zip(sample_sub['Id'], sample_sub['text']):
    # Labels of training rows whose document is identical after normalisation.
    same_document = train_text_cleaned == text_cleaning(sample_text)
    cleaned_labels = {clean_text(label) for label in train.loc[same_document, 'cleaned_label']}

    # Literal, case-insensitive search for every known label; lower-case the text once
    # rather than once per label.
    sample_text_lower = sample_text.lower()
    cleaned_labels.update(
        clean_text(known_label)
        for known_label in existing_labels
        if known_label in sample_text_lower
    )

    # Sort so the prediction string is reproducible across runs (set order is not).
    lables_list.append('|'.join(sorted(cleaned_labels)))
    id_list.append(row_id)

In [None]:
# Assemble the two-column submission frame, write it without the index, and preview it.
submission = pd.DataFrame({
    'Id': id_list,
    'PredictionString': lables_list,
})
submission.to_csv('submission.csv', index=False)
submission.head(5)

![Upvote!](https://img.shields.io/badge/Upvote-If%20you%20like%20my%20work-07b3c8?style=for-the-badge&logo=kaggle)