In [None]:
import os
import re
import json
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# List the competition's input directory. The relative '../input' prefix matches
# the one used by the data-loading cells of this notebook, and the result is
# sorted because os.listdir returns entries in arbitrary order.
sorted(os.listdir('../input/coleridgeinitiative-show-us-the-data'))

In [None]:
# Read train.csv into a DataFrame and show it as the cell output.
train = pd.read_csv(
    filepath_or_buffer='../input/coleridgeinitiative-show-us-the-data/train.csv',
)
train

In [None]:
# Print pandas' summary of `train` (columns, dtypes, non-null counts).
train.info()

In [None]:
# Report the number of distinct values per column. dropna=False keeps a missing
# value counted as a value of its own, exactly like len(Series.unique()).
for col, n_distinct in train.nunique(dropna=False).items():
    print(f"{col}: {n_distinct}")

In [None]:
# Count how many rows of `train` carry each dataset title.
dataset_title_counts = train['dataset_title'].value_counts()
dataset_title_counts

# Wordcloud of the Article Titles

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Flatten every publication title into one list of whitespace-separated words.
title_tokens = train.pub_title.str.split(expand=True).stack()
words_in_titles = list(title_tokens)

# Build a single cloud over all titles, passing STOPWORDS as the stopword set.
cloud_options = dict(
    stopwords=STOPWORDS,
    background_color="white",
    width=3000,
    height=2000,
)
wordcloud = WordCloud(**cloud_options).generate(' '.join(words_in_titles))

# Show the rendered cloud without axis ticks or frame.
plt.figure(1, figsize=(18, 12))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

## Wordclouds of Article Titles by Dataset

In [None]:
from collections import defaultdict

words_in_titles_by_dataset = defaultdict(list)

# Collect the words of every publication title under the dataset title found on
# the same row of `train`, so that each dataset ends up with its own word list.
# Zipping the two columns avoids building a Series per row (as iterrows() does)
# and does not rebind `_`, which IPython uses for the last cell output.
for dataset_title, pub_title in zip(train['dataset_title'], train['pub_title']):
    words_in_titles_by_dataset[dataset_title].extend(pub_title.split())

# Defining our word cloud drawing function
def wordcloud_draw(data, color = 'white'):
    """Render and show a word cloud built from an iterable of words.

    Parameters
    ----------
    data : iterable of str
        Words to draw; they are joined with single spaces before being passed
        to ``WordCloud.generate``.
    color : str, optional
        Background colour of the cloud (default ``'white'``).

    Returns
    -------
    None
        The figure is displayed with ``plt.show()`` and then closed.
    """
    wordcloud = WordCloud(stopwords = STOPWORDS,
                          background_color = color,
                          width = 3000,
                          height = 2000
                         ).generate(' '.join(data))
    # Create a fresh figure on every call: plt.figure(1, ...) would return an
    # already-open figure 1, ignore the requested figsize and draw over it.
    fig, ax = plt.subplots(figsize = (12, 8))
    ax.imshow(wordcloud)
    ax.axis('off')
    plt.show()
    # Close explicitly so repeated calls do not accumulate open figures.
    plt.close(fig)

# Draw one word cloud per distinct dataset title, in order of first appearance.
unique_dataset_titles = train['dataset_title'].unique()
for dataset_title in unique_dataset_titles:
    title_words = words_in_titles_by_dataset[dataset_title]
    print("Wordcloud for", dataset_title, ":")
    wordcloud_draw(title_words)

## Loading JSON Contents into a Pandas DataFrame

In [None]:
# Gather the paths of the publication JSON files. The order glob returns depends
# on the file system, so sort the paths to keep the frames and CSVs built from
# them reproducible between runs.
train_files = sorted(glob.glob("../input/coleridgeinitiative-show-us-the-data/train/*.json"))
test_files = sorted(glob.glob("../input/coleridgeinitiative-show-us-the-data/test/*.json"))

In [None]:
# Generate the training publications dataframe.
# Each file is read into its own frame and everything is concatenated once at
# the end: calling pd.concat inside the loop re-copies all accumulated rows on
# every iteration.
train_frames = []
for train_file in train_files:
    file_data = pd.read_json(train_file)
    # pub_id is the file name up to its first dot; basename handles the
    # platform's path separator instead of assuming '/'.
    file_data.insert(0, 'pub_id', os.path.basename(train_file).split('.')[0])
    train_frames.append(file_data)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# files were found.
df_train_publications = pd.concat(train_frames) if train_frames else pd.DataFrame()

# Save the combined frame next to the notebook (row index not written).
df_train_publications.to_csv("df_train_publications.csv",index=False)

df_train_publications

In [None]:
# Generate the testing publications dataframe.
# Each file is read into its own frame and everything is concatenated once at
# the end: calling pd.concat inside the loop re-copies all accumulated rows on
# every iteration.
test_frames = []
for test_file in test_files:
    file_data = pd.read_json(test_file)
    # pub_id is the file name up to its first dot; basename handles the
    # platform's path separator instead of assuming '/'.
    file_data.insert(0, 'pub_id', os.path.basename(test_file).split('.')[0])
    test_frames.append(file_data)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# files were found.
df_test_publications = pd.concat(test_frames) if test_frames else pd.DataFrame()

# Save the combined frame next to the notebook (row index not written).
df_test_publications.to_csv("df_test_publications.csv",index=False)

df_test_publications

## Naïve Dataset Title Matching Submission

In [None]:
def clean_text(txt):
    """Return ``txt`` as lowercase ASCII letters and digits separated by single spaces.

    The argument is converted with ``str()`` and lower-cased; every run of
    characters other than ``A-Z``, ``a-z`` or ``0-9`` is replaced by one space,
    and leading/trailing whitespace is stripped from the result.
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()

In [None]:
# Load the sample submission, using its first column as the index.
submission_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv', index_col=0)

In [None]:
# Show the loaded sample submission as the cell output.
submission_df

In [None]:
# Start again from the sample submission so this cell can be re-run on its own.
submission_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv', index_col=0)
datasets_titles = [x.lower() for x in train['dataset_title'].unique()]

# Build each publication's full lowercase text once (its `text` rows joined with
# newlines), instead of scanning the whole test frame with a boolean mask for
# every submission id.
text_by_pub_id = (
    df_test_publications
    .groupby('pub_id', sort=False)['text']
    .agg(lambda sections: sections.str.cat(sep='\n').lower())
    .to_dict()
)

labels = []
for index in submission_df.index:
    # An id with no rows in df_test_publications gets an empty text, hence no labels.
    publication_text = text_by_pub_id.get(index, '')
    # A title matches when its lowercase form is a substring of the lowercase
    # text; the label that is emitted is the cleaned form of that title.
    label = [
        clean_text(dataset_title)
        for dataset_title in datasets_titles
        if dataset_title in publication_text
    ]
    # Different titles can clean to the same string; dict.fromkeys drops the
    # repeats while keeping first-seen order.
    labels.append('|'.join(dict.fromkeys(label)))

submission_df['PredictionString'] = labels

submission_df.to_csv('submission.csv')

submission_df