# Overview
The dataset of this competition contains argumentative essays written by U.S. students in grades 6-12. The essays were annotated by expert raters for elements commonly found in argumentative writing. In this competition, we need to identify these elements in student writing: more specifically, we need to automatically segment texts and classify argumentative and rhetorical elements in essays written by 6th-12th grade students. We have access to the largest dataset of student writing ever released to test our skills in natural language processing.
We have about 15.6k essay text files to train a model on, plus a train CSV file containing relevant information about those text files.
We have to find word sequences that can be classified as one of 7 "discourse types". These are:

* **Lead** - an introduction that begins with a statistic, a quotation, a description, or some other device to grab the reader’s attention and point toward the thesis
* **Position** - an opinion or conclusion on the main question
* **Claim** - a claim that supports the position
* **Counterclaim** - a claim that refutes another claim or gives an opposing reason to the position
* **Rebuttal** - a claim that refutes a counterclaim
* **Evidence** - ideas or examples that support claims, counterclaims, or rebuttals.
* **Concluding Statement** - a concluding statement that restates the claims

In [None]:
# required libraries
import numpy as np
import pandas as pd
import wordcloud
import glob
import spacy
from spacy import displacy
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
# Load the training table
train = pd.read_csv('../input/feedback-prize-2021/train.csv')

# Convert the id and offset columns from float to int
train = train.astype({
    'discourse_id': int,
    'discourse_start': int,
    'discourse_end': int,
})
train.head()

In [None]:
# Print a concise summary of the `train` DataFrame.
train.info()

In [None]:
# Check for null values: count the missing entries in each column
train.isna().sum()

In [None]:
# Collect the paths of the essay text files for both splits
train_files = glob.glob('../input/feedback-prize-2021/train/*.txt')
test_files = glob.glob('../input/feedback-prize-2021/test/*.txt')

for label, files in (
    ("Total number of train files: ", train_files),
    ("Total number of test files: ", test_files),
):
    print(label, len(files))

In [None]:
# Show one sample training essay (IPython shell escape that prints the file)
!cat ../input/feedback-prize-2021/train/423A1CA112E2.txt

In [None]:
# Number of rows per discourse type
type_count_ax = sns.countplot(data=train, y='discourse_type', palette='YlGnBu_r')
type_count_ax.set_title("discourse type count");

In [None]:
# Number of rows per numbered discourse type, drawn on a taller figure
plt.figure(figsize=(8, 10))
type_num_ax = sns.countplot(data=train, y='discourse_type_num', palette='coolwarm')
type_num_ax.set_title("discourse type num count");

# Discourse Length

In [None]:
# Add a column holding each discourse element's length (end offset minus start offset)
train["discourse_len"] = train["discourse_end"].sub(train["discourse_start"])

In [None]:
# Histogram (with KDE curve) of the discourse lengths
len_ax = sns.histplot(data=train, x='discourse_len', color='green', kde=True)
len_ax.set_title("Discourse length distribution");

In [None]:
# Histogram (with KDE curve) of the discourse start offsets
start_ax = sns.histplot(data=train, x='discourse_start', color='blue', kde=True)
start_ax.set_title("Discourse start distribution");

In [None]:
# Histogram (with KDE curve) of the discourse end offsets
end_ax = sns.histplot(data=train, x='discourse_end', color='coral', kde=True)
end_ax.set_title('Discourse end distribution');

# Detail for specific id

In [None]:
# Rows of the training table that belong to the sample essay
train[train['id'] == "423A1CA112E2"]

In [None]:
# Bar chart of how often each discourse type occurs in the sample essay
sample_rows = train[train['id'] == "423A1CA112E2"]
sample_type_counts = sample_rows["discourse_type"].value_counts()
sample_type_counts.plot.bar();

# Full text analysis

In [None]:
# Build a per-essay text by joining the discourse segments that share an id.
# The joined string is repeated on every row of that essay, so the duplicates
# are removed in later steps.
train['full_text'] = train.groupby('id')['discourse_text'].transform(
    lambda segments: ' '.join(segments)
)

train['full_text'].iloc[0]

In [None]:
# Length of each distinct joined essay text (a pandas Series)
full_text_length = train['full_text'].drop_duplicates().map(len)
essay_len_ax = full_text_length.plot.hist(bins=100, color='lightgreen')
essay_len_ax.set_title('Essay Length Distribution')
essay_len_ax.set_xlabel("Essay Length")
essay_len_ax.set_ylabel("Frequency");

In [None]:
# Whitespace-separated word count of each distinct joined essay text
distinct_texts = train['full_text'].drop_duplicates()
word_count = distinct_texts.map(lambda text: len(str(text).split()))
word_count_ax = word_count.plot.hist(bins=100, color='skyblue')
word_count_ax.set_title('Word Count Distribution')
word_count_ax.set_xlabel("Word Count")
word_count_ax.set_ylabel("Frequency");

In [None]:
# Scatter plot of essay text length against word count.
# Both Series come from the same de-duplicated `full_text` column.
plt.scatter(full_text_length, word_count, marker='^', color='teal')
plt.title("text length vs word count")
plt.xlabel('Full text length')  # axis label previously misspelled as "lenght"
plt.ylabel('Word Count');

# Color Text

In [None]:
# Highlight colour for each discourse type, passed to displaCy as the
# "colors" option. Every value is a '#RRGGBB' hex string; the entry for
# 'Concluding Statement' was missing its leading '#'.
colors = {
            'Lead': '#8000ff',
            'Position': '#2b7ff6',
            'Evidence': '#2adddd',
            'Claim': '#80ffb4',
            'Concluding Statement': '#d4dd80',
            'Counterclaim': '#ff8042',
            'Rebuttal': '#ff0000'
         }

def read_essay(id):
    """Return the full text of the training essay with the given id.

    The file is read from the competition's ``train`` folder as
    ``<id>.txt``. It is decoded explicitly as UTF-8 so the result does not
    depend on the platform's default encoding; the text is later indexed by
    character offsets, which a different decoding could shift.
    """
    with open(f"../input/feedback-prize-2021/train/{id}.txt", encoding="utf-8") as f:
        essay = f.read()
    return essay


def visualize(example):
    """Render one essay with its discourse elements highlighted by displaCy.

    ``example`` is an essay id. Its rows in ``train`` provide the start/end
    offsets and labels of the spans, and ``read_essay`` provides the text.
    """
    essay_rows = train[train['id'] == example]
    spans = [
        {
            'start': int(span_start),
            'end': int(span_end),
            'label': span_label,
        }
        for span_start, span_end, span_label in zip(
            essay_rows['discourse_start'],
            essay_rows['discourse_end'],
            essay_rows['discourse_type'],
        )
    ]

    # Manual-mode input for displaCy: raw text plus pre-computed spans
    rendered_doc = {
        "text": read_essay(example),
        "ents": spans,
        "title": example,
    }
    render_options = {
        "ents": train.discourse_type.unique().tolist(),
        "colors": colors,
    }
    displacy.render(
        rendered_doc,
        style="ent",
        options=render_options,
        manual=True,
        jupyter=True,
    )

# Render the sample essay with its discourse elements highlighted
visualize("423A1CA112E2")

# Ngram

In [None]:
def get_top_n_words(corpus, n=None, remove_stop_words=False, n_words=1):
    """Return the ``n`` most frequent n-grams of ``corpus`` with their counts.

    corpus: iterable of text documents handed to ``CountVectorizer``.
    n: number of entries to return; ``None`` returns all of them.
    remove_stop_words: when true, use the vectorizer's 'english' stop words.
    n_words: size of the n-grams (1 = unigrams, 2 = bigrams, ...).

    Returns a list of ``(ngram, count)`` tuples sorted by descending count.
    """
    stop_words = 'english' if remove_stop_words else None
    vectorizer = CountVectorizer(
        stop_words=stop_words,
        ngram_range=(n_words, n_words),
    )
    vectorizer.fit(corpus)

    # Column-wise totals: one count per vocabulary entry
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = sorted(
        [(term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()],
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

### Unigrams

In [None]:
# Top 20 single words across the distinct essay texts, stop words removed
common_words = get_top_n_words(
    train['full_text'].drop_duplicates(),
    n=20,
    remove_stop_words=True,
    n_words=1,
)
for entry in common_words:
    print(*entry)

### Bigrams

In [None]:
# Top 20 two-word sequences across the distinct essay texts, stop words removed
common_words = get_top_n_words(
    train['full_text'].drop_duplicates(),
    n=20,
    remove_stop_words=True,
    n_words=2,
)
for entry in common_words:
    print(*entry)

### Trigrams

In [None]:
# Top 20 three-word sequences across the distinct essay texts, stop words removed
common_words = get_top_n_words(
    train['full_text'].drop_duplicates(),
    n=20,
    remove_stop_words=True,
    n_words=3,
)
for entry in common_words:
    print(*entry)

# Wordcloud

In [None]:
# Word cloud for each discourse type, one subplot per type
type_groups = train.groupby("discourse_type")

# Size the grid from the data instead of hard-coding 7 rows. squeeze=False
# keeps `axs` a 2-D array even when there is only one discourse type, so it
# can be flattened once, outside the loop.
fig, axs = plt.subplots(type_groups.ngroups, 1, figsize=(20, 25), squeeze=False)
axs = axs.flatten()

for plt_idx, (discourse_type, d) in enumerate(type_groups):
    discourse_text = " ".join(d["discourse_text"].values.tolist())
    # Named `cloud` so the imported `wordcloud` module is not overwritten
    cloud = WordCloud(
        max_font_size=200,
        max_words=200,
        width=1200,
        height=800,
        background_color="white",
    ).generate(discourse_text)
    axs[plt_idx].imshow(cloud, interpolation="bilinear")
    axs[plt_idx].set_title(discourse_type, fontsize=18)
    axs[plt_idx].axis("off")
plt.tight_layout()
plt.show()

## If you find it helpful, please consider upvoting. Thank you!