# **Task to be Accomplished 🎯**

* Build algorithm(s) to rate the complexity of reading passages for grade 3-12 classroom use

* Given the training data, train Machine Learning model(s) that predict the "target" value from the "excerpt" text

* It can be formulated as a Regression problem

# **Import Libraries 📚**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import itertools
from string import punctuation
import string

from wordcloud import WordCloud
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import ngrams

# Global seaborn appearance for the plots in this notebook: the first call
# selects the 'darkgrid' style, the second selects the 'Set2' colour palette.
# NOTE(review): the second call does not pass `style`, so it relies on
# seaborn's default for that argument - confirm before merging the two calls.
sns.set(style='darkgrid')
sns.set(palette="Set2")

# **Load the Data 📝**

## **Train Data**

In [None]:
# Read the training split, report its (rows, columns) shape and preview
# the first rows as the cell output.
df_train = pd.read_csv(
    "/kaggle/input/commonlitreadabilityprize/train.csv"
)
print(df_train.shape)
df_train.head()

## **Test Data**

In [None]:
# Read the test split, report its (rows, columns) shape and preview the
# first rows as the cell output.
df_test = pd.read_csv(
    "/kaggle/input/commonlitreadabilityprize/test.csv"
)
print(df_test.shape)
df_test.head()

# **Pre-processing excerpt ✂️**

In [None]:
# Tokens to drop during cleaning: NLTK's English stopword list plus every
# single punctuation character. (A set, despite the name, for O(1) lookups.)
list_stopwords = set(stopwords.words('english') + list(punctuation))

# Translation table that deletes all punctuation characters. Built once
# here rather than once per token inside the cleaning loop.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _clean_tokens(text):
    """Return the cleaned token list for a single excerpt string.

    Steps, in order: lower-case, tokenize with ``word_tokenize``, drop
    tokens found in ``list_stopwords``, strip punctuation characters from
    the surviving tokens, and finally drop tokens of length 0 or 1.
    """
    cleaned = []
    for token in word_tokenize(text.lower()):
        # Stopword filtering happens on the raw token, before punctuation
        # is stripped, so the order of these two steps matters.
        if token in list_stopwords:
            continue
        token = token.translate(_PUNCTUATION_TABLE)
        if len(token) > 1:
            cleaned.append(token)
    return cleaned


def pre_prcoessing(df):
    """Add a ``processed_excerpt`` column holding cleaned token lists.

    The misspelled name is kept on purpose: other cells call the function
    under this name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``excerpt``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place, with the new column
        ``processed_excerpt`` (one list of string tokens per row).
    """
    df['processed_excerpt'] = df['excerpt'].apply(_clean_tokens)
    return df

In [None]:
# Run the same cleaning on both splits (train first, then test) so each
# gains a `processed_excerpt` column.
df_train, df_test = [pre_prcoessing(frame) for frame in (df_train, df_test)]

# **EDA 📊**

## **Column Distribution - Target**

In [None]:
# Density histogram with a KDE curve for the `target` column of the
# training data.
plt.figure(figsize=(12, 8))
sns.histplot(x='target', data=df_train, stat='density', kde=True)
plt.gca().set(xlabel='Target', ylabel='Density')
plt.show()

## **Column Distribution - Standard Error**

In [None]:
# Density histogram with a KDE curve for the `standard_error` column of
# the training data.
plt.figure(figsize=(12, 8))
sns.histplot(x='standard_error', data=df_train, stat='density', kde=True)
plt.gca().set(xlabel='Standard Error', ylabel='Density')
plt.show()

## **Frequency Distribution - Length / Number of Characters in each excerpt**

In [None]:
# Character length of the raw excerpt, and of the cleaned tokens re-joined
# with single spaces.
df_train['character_count'] = df_train['excerpt'].str.len()
df_train['processed_character_count'] = (
    df_train['processed_excerpt'].map(' '.join).str.len()
)

# Overlay both distributions on one figure: raw in red, processed in blue.
plt.figure(figsize=(12, 8))
sns.histplot(
    x='character_count', data=df_train,
    stat='density', kde=True, color='red', label='Excerpt',
)
sns.histplot(
    x='processed_character_count', data=df_train,
    stat='density', kde=True, color='blue', label='Processed Excerpt',
)
plt.gca().set(xlabel='Number of Characters', ylabel='Density')
plt.legend()
plt.show()

## **Frequency Distribution - Number of Words in each excerpt**

In [None]:
# Number of whitespace-separated words in the raw excerpt. `split()` with no
# argument splits on any run of whitespace, so words separated by newlines
# or repeated spaces are counted individually and no empty strings are
# counted (unlike `split(' ')`).
df_train['word_count'] = df_train['excerpt'].apply(lambda text: len(text.split()))
# `processed_excerpt` holds one token list per row, so the list length is
# the word count directly; an empty list correctly yields 0 instead of the
# 1 produced by joining and re-splitting on ' '.
df_train['processed_word_count'] = df_train['processed_excerpt'].apply(len)

# Overlay both distributions on one figure: raw in red, processed in blue.
plt.figure(figsize=(12, 8))
sns.histplot(data=df_train, x='word_count', kde=True, stat='density', color='red', label='Excerpt')
sns.histplot(data=df_train, x='processed_word_count', kde=True, stat='density', color='blue', label='Processed Excerpt')
plt.xlabel('Number of Words')
plt.ylabel('Density')
plt.legend()
plt.show()

## **Frequency Distribution - Number of Unique Words in each excerpt**

In [None]:
# Number of distinct whitespace-separated words in the raw excerpt.
# `split()` with no argument splits on any run of whitespace, so newline-
# separated words are not merged and the empty string is never counted as
# a "word" (unlike `split(' ')`).
df_train['unique_word_count'] = df_train['excerpt'].apply(lambda text: len(set(text.split())))
# `processed_excerpt` holds one token list per row, so the distinct tokens
# can be counted directly; an empty list correctly yields 0 instead of the
# 1 produced by joining and re-splitting on ' '.
df_train['processed_unique_word_count'] = df_train['processed_excerpt'].apply(lambda tokens: len(set(tokens)))

# Overlay both distributions on one figure: raw in red, processed in blue.
plt.figure(figsize=(12, 8))
sns.histplot(data=df_train, x='unique_word_count', kde=True, stat='density', color='red', label='Excerpt')
sns.histplot(data=df_train, x='processed_unique_word_count', kde=True, stat='density', color='blue', label='Processed Excerpt')
plt.xlabel('Number of Unique Words')
plt.ylabel('Density')
plt.legend()
plt.show()

## **WordCloud**

In [None]:
# Flatten the per-excerpt token lists of the training data into one long
# list of tokens.
list_text = list(
    itertools.chain.from_iterable(df_train['processed_excerpt'].tolist())
)

# Word cloud over the space-joined tokens; `collocations=False` is passed
# so the cloud is built with that option disabled.
plt.figure(figsize=(16, 12))
wordcloud = WordCloud(
    background_color="black",
    width=800,
    height=500,
    max_font_size=80,
    max_words=100,
    collocations=False,
    colormap='Set2',
).generate(' '.join(list_text))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

## **Top 20 NGrams**

In [None]:
def ngram(list_text, n, top_k=20):
    """Return the most frequent n-grams of a token sequence.

    Parameters
    ----------
    list_text : sequence of str
        Tokens, treated as one continuous sequence. If it is a
        concatenation of several documents, n-grams that span a document
        boundary are counted too.
    n : int
        Size of the n-gram (1 = unigram, 2 = bigram, ...).
    top_k : int or None, optional
        How many of the most frequent n-grams to return (default 20, the
        previously hard-coded value). ``None`` returns all of them.

    Returns
    -------
    tuple of (list of str, list of int)
        The n-grams as space-joined strings and their counts, both ordered
        from most to least frequent. Both lists are empty when the input
        yields no n-grams.
    """
    # most_common(k) selects the top k directly instead of sorting every
    # distinct n-gram and slicing afterwards.
    top_grams = Counter(ngrams(list_text, n)).most_common(top_k)
    list_key = [' '.join(gram) for gram, _ in top_grams]
    list_value = [count for _, count in top_grams]
    return list_key, list_value

## **UniGram**

In [None]:
# Horizontal bar chart of the most frequent single tokens in `list_text`.
list_key, list_value = ngram(list_text, 1)
plt.figure(figsize=(12, 8))
sns.barplot(x=list_value, y=list_key)
plt.gca().set(xlabel='Frequency', ylabel='UniGram')
plt.show()

## **BiGram**

In [None]:
# Horizontal bar chart of the most frequent two-token sequences in `list_text`.
list_key, list_value = ngram(list_text, 2)
plt.figure(figsize=(12, 8))
sns.barplot(x=list_value, y=list_key)
plt.gca().set(xlabel='Frequency', ylabel='BiGram')
plt.show()

## **TriGram**

In [None]:
# Horizontal bar chart of the most frequent three-token sequences in `list_text`.
list_key, list_value = ngram(list_text, 3)
plt.figure(figsize=(12, 8))
sns.barplot(x=list_value, y=list_key)
plt.gca().set(xlabel='Frequency', ylabel='TriGram')
plt.show()

# **Feel free to <span style="color:red">UPVOTE </span> and provide <span style="color:blue">FEEDBACK </span> 🎉**