# Coleridge Initiative EDA
### Analysis of publication titles, including: 
1. Most frequent words in the publication titles, overall and grouped by word length (3 to 9 letters, and 10 or more letters)
2. Bi-, tri- and quad-grams


In [None]:
import numpy as np
import pandas as pd 
import os
import math
from matplotlib import pyplot as plt
import seaborn as sns
import json
import glob
import wordcloud
import string

## Load the data

In [None]:
# Read the competition's training table and preview its first rows.
train_csv_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
train = pd.read_csv(train_csv_path)
train.head()

In [None]:
# Normalise titles: lower-case them, then delete every ASCII punctuation
# character as well as the typographic apostrophe (’).
punctuation_table = str.maketrans("", "", string.punctuation + '’')
train['pub_title'] = train['pub_title'].str.lower().str.translate(punctuation_table)

## Find common words in the titles

In [None]:
# Split every title on whitespace, flatten the tokens into one long Series,
# and tally how often each distinct word occurs (most frequent first).
title_tokens = train['pub_title'].str.lower().str.split(expand=True).stack()
word_counts = title_tokens.value_counts().sort_values(ascending=False)

# Turn the counts into a two-column frame: the word and its frequency.
title_words = pd.DataFrame(word_counts).reset_index()
title_words.columns = ['word', 'count']
title_words

## Create the word length datasets

In [None]:
# Number of characters in each distinct title word.
title_words['word_length'] = title_words['word'].str.len()
word_len = title_words['word_length']

# First ten rows of the frequency table, regardless of word length.
all_top_words = title_words.head(10)

# First ten rows for each exact word length from 3 to 9 characters.
three_letters_top_words = title_words.loc[word_len == 3].head(10)
four_letters_top_words = title_words.loc[word_len == 4].head(10)
five_letters_top_words = title_words.loc[word_len == 5].head(10)
six_letters_top_words = title_words.loc[word_len == 6].head(10)
seven_letters_top_words = title_words.loc[word_len == 7].head(10)
eight_letters_top_words = title_words.loc[word_len == 8].head(10)
nine_letters_top_words = title_words.loc[word_len == 9].head(10)

# NOTE: despite its name, this bucket holds words of 10 *or more* characters.
ten_letters_top_words = title_words.loc[word_len >= 10].head(10)

# Visuals - top words of length n

In [None]:
# Ten-colour "rocket" palette shared by the word-frequency bar charts.
palette = sns.color_palette(palette="rocket", n_colors=10)

In [None]:
# Horizontal bar chart of the overall top-ten title words.
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=all_top_words, x='count', y='word', palette=palette, ax=ax)
ax.set_title('Top 10 Words in Publication Titles', fontsize=20)

In [None]:
# One frame per word length, ordered from 3 letters up to the "10 or more" bucket.
top_word_dfs = [
    three_letters_top_words,
    four_letters_top_words,
    five_letters_top_words,
    six_letters_top_words,
    seven_letters_top_words,
    eight_letters_top_words,
    nine_letters_top_words,
    ten_letters_top_words,
]

# The list starts at word length 3, so number the frames from 3 upwards.
for n, words_df in enumerate(top_word_dfs, start=3):
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.barplot(data=words_df, x='count', y='word', palette=palette, ax=ax)
    title = f'Top 10 Words in Publication Titles, of length {n}'
    if n == 10:
        # The last frame also contains words longer than ten characters.
        title += ' or more'
    ax.set_title(title, fontsize=20)
    plt.show()

## Process data for n-grams

In [None]:
# Create bi-grams
# Tokenise with str.split() (any run of whitespace) instead of split(" "):
# the punctuation removal applied to the titles can leave consecutive spaces
# (e.g. "a - b" -> "a  b"), and split(" ") would turn those into empty tokens
# and bogus n-grams such as "a-" and "-b". This also matches the whitespace
# splitting used for the single-word counts.
bigram_tokens = []
for title in train.pub_title.dropna():  # a missing title (NaN) has no .split
    words = title.split()  # split once per title and reuse the token list
    # Pair every word with its successor; zip stops at the shorter sequence,
    # so titles with fewer than two words contribute nothing.
    bigram_tokens.extend("-".join(gram) for gram in zip(words, words[1:]))

# value_counts() already returns the counts in descending order, so no
# additional sorting is required before taking the top rows.
bigrams = pd.Series(bigram_tokens, dtype=object).value_counts().reset_index()
bigrams.columns = ['bigram', 'count']

In [None]:
# Create tri-grams
# Tokenise with str.split() (any run of whitespace) instead of split(" "):
# the punctuation removal applied to the titles can leave consecutive spaces
# (e.g. "a - b" -> "a  b"), and split(" ") would turn those into empty tokens
# and bogus n-grams containing "--" or a leading/trailing "-".
trigram_tokens = []
for title in train.pub_title.dropna():  # a missing title (NaN) has no .split
    words = title.split()  # split once per title and reuse the token list
    # Slide a window of three consecutive words; zip stops at the shortest
    # slice, so titles with fewer than three words contribute nothing.
    trigram_tokens.extend(
        "-".join(gram) for gram in zip(words, words[1:], words[2:])
    )

# value_counts() already returns the counts in descending order, so no
# additional sorting is required before taking the top rows.
trigrams = pd.Series(trigram_tokens, dtype=object).value_counts().reset_index()
trigrams.columns = ['trigram', 'count']

In [None]:
# Create quad-grams
# Tokenise with str.split() (any run of whitespace) instead of split(" "):
# the punctuation removal applied to the titles can leave consecutive spaces
# (e.g. "a - b" -> "a  b"), and split(" ") would turn those into empty tokens
# and bogus n-grams containing "--" or a leading/trailing "-".
quadgram_tokens = []
for title in train.pub_title.dropna():  # a missing title (NaN) has no .split
    words = title.split()  # split once per title and reuse the token list
    # Slide a window of four consecutive words; zip stops at the shortest
    # slice, so titles with fewer than four words contribute nothing.
    quadgram_tokens.extend(
        "-".join(gram) for gram in zip(words, words[1:], words[2:], words[3:])
    )

# value_counts() already returns the counts in descending order, so no
# additional sorting is required before taking the top rows.
quadgrams = pd.Series(quadgram_tokens, dtype=object).value_counts().reset_index()
quadgrams.columns = ['quadgram', 'count']

# Visuals - top n-grams

In [None]:
# Ten-colour "mako" palette shared by the n-gram bar charts.
palette = sns.color_palette(palette="mako", n_colors=10)

In [None]:
# Horizontal bar chart of the first ten rows of the bi-gram frequency table.
fig, ax = plt.subplots(figsize=(10, 8))
top_bigrams = bigrams.iloc[:10]
sns.barplot(data=top_bigrams, x='count', y='bigram', palette=palette, ax=ax)
ax.set_title('Top 10 Bi-Grams in Publication Titles', fontsize=20)

In [None]:
# Horizontal bar chart of the first ten rows of the tri-gram frequency table.
fig, ax = plt.subplots(figsize=(10, 8))
top_trigrams = trigrams.iloc[:10]
sns.barplot(data=top_trigrams, x='count', y='trigram', palette=palette, ax=ax)
ax.set_title('Top 10 Tri-Grams in Publication Titles', fontsize=20)

In [None]:
# Horizontal bar chart of the first ten rows of the quad-gram frequency table.
fig, ax = plt.subplots(figsize=(10, 8))
top_quadgrams = quadgrams.iloc[:10]
sns.barplot(data=top_quadgrams, x='count', y='quadgram', palette=palette, ax=ax)
ax.set_title('Top 10 Quad-Grams in Publication Titles', fontsize=20)

## WIP


### Please upvote if you enjoyed or want more...