# Data description

The dataset of this competition contains argumentative essays that were annotated by expert raters for the following discourse elements:

* Lead - an introduction that begins with a statistic, a quotation, a description, or some other device to grab the reader’s attention and point toward the thesis
* Position - an opinion or conclusion on the main question
* Claim - a claim that supports the position
* Counterclaim - a claim that refutes another claim or gives an opposing reason to the position
* Rebuttal - a claim that refutes a counterclaim
* Evidence - ideas or examples that support claims, counterclaims, or rebuttals.
* Concluding Statement - a concluding statement that restates the claims

The task is to predict the quality rating of each discourse element. 
Human readers rated each rhetorical or argumentative element, in order of increasing quality, as one of:

* Ineffective
* Adequate
* Effective

## Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import os
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

import logging
import sys
from time import time

from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer



# Data Examination

In [None]:
# Read the competition's train and test splits from the Kaggle input folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/feedback-prize-effectiveness/train.csv',
        '../input/feedback-prize-effectiveness/test.csv',
    )
)

# Report how many rows each split contains.
print(f"Train dataset length: {len(train_df)}")
print(f"Test dataset length: {len(test_df)}")

In [None]:
# Preview the first rows of the training frame (shown as the cell's output).
train_df.head()

In [None]:
# Summary statistics of the training frame's columns.
train_df.describe()

In [None]:
# Preview the first rows of the test frame (shown as the cell's output).
test_df.head()

In [None]:
# Summary statistics of the test frame's columns.
test_df.describe()

# Data examples

In [None]:
# Print one training sample as "<text> = <discourse type>".
example_number = 4
# Fetch both fields of the chosen row in a single label-based lookup.
discourse_text_example, discourse_type_example = train_df.loc[
    example_number, ["discourse_text", "discourse_type"]
]
print(f"{discourse_text_example} = {discourse_type_example}")


In [None]:
# Print another training sample as "<text> = <discourse type>".
example_number = 20
# Scalar, label-based access to the two fields of the selected row.
discourse_text_example = train_df.at[example_number, "discourse_text"]
discourse_type_example = train_df.at[example_number, "discourse_type"]
print(f"{discourse_text_example} = {discourse_type_example}")

# Discourse text lengths and distribution

#### Discourse texts are individually labeled parts of an essay

In [None]:
# Relate the number of distinct essays to the number of annotated elements.
essay_count = train_df["essay_id"].nunique()
element_count = len(train_df)

print(f"We have {essay_count} different essays which are split into {element_count} discourse elements")
print(f"Mean number of discourse elements per essay: {element_count/essay_count}")

In [None]:
# Histogram of discourse text lengths measured in characters.
# Duplicate texts are dropped first so a repeated passage is counted once.
text_lengths = train_df['discourse_text'].drop_duplicates().apply(len)

plt.hist(text_lengths, bins=20)
# `len` of a string counts characters, not words, so the labels say so
# (the word-count histogram is produced separately).
plt.title("Character Count Distribution in Discourse Text")
plt.xlabel("Text Length (characters)")
plt.ylabel("Frequency")

In [None]:
def _n_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(str(text).split())


# Histogram of the number of words in each distinct discourse text.
word_count = train_df['discourse_text'].drop_duplicates().map(_n_words)

plt.plot()
plt.hist(word_count, bins=20)
plt.xlabel("No. of words")
plt.title("Word Count Distribution in Discourse Text")
plt.ylabel("Frequency")

### Discourse element length per type

In [None]:
# Two stacked horizontal bar charts: mean element length per discourse type
# (top) and number of elements per discourse type (bottom).

# `len` of a string gives its length in characters.
train_df["element_len"] = train_df["discourse_text"].apply(len)
fig = plt.figure(figsize=(14,12))

# Top panel. The target axes is passed explicitly rather than relying on
# pandas picking up matplotlib's "current" axes.
ax1 = fig.add_subplot(211)
train_df.groupby('discourse_type')['element_len'].mean().sort_values().plot(kind="barh", ax=ax1)
# element_len is a character count, so the labels must not claim "words".
ax1.set_title("Average number of characters per Discourse Type", fontsize=14, fontweight = 'bold')
ax1.set_xlabel("Average number of characters", fontsize = 10)
ax1.set_ylabel("")

# Bottom panel: how often each discourse type occurs.
ax2 = fig.add_subplot(212)
train_df.groupby('discourse_type')['discourse_type'].count().sort_values().plot(kind="barh", ax=ax2)
ax2.set_title("Frequency of Discourse Type in all essays", fontsize=14, fontweight = 'bold')
ax2.set_xlabel("Frequency", fontsize = 10)
ax2.set_ylabel("")



In [None]:
import plotly.express as px

# Interactive bar chart of the number of records per discourse type.
# A single np.unique call yields the sorted type names together with their
# counts, instead of rescanning the whole column once per type.
discourse_types, type_counts = np.unique(train_df["discourse_type"], return_counts=True)

# NOTE(review): `color` is categorical here, so `color_continuous_scale`
# presumably has no visible effect; kept to leave the call unchanged - confirm.
fig = px.bar(x = discourse_types,
             y = type_counts,
             color = discourse_types,
             color_continuous_scale="Spectral")
fig.update_xaxes(title="Discourse Types")
fig.update_yaxes(title = "Number of Records")
fig.update_layout(showlegend = True,
    title = {
        'text': 'Discourse Type Distribution ',
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
        template="plotly_white")
fig.show()

# Frequency of Discourse Effectiveness

In [None]:
# Horizontal bar chart of how many discourse elements carry each
# effectiveness rating. Uses the `element_len` column, so that column must
# already exist on train_df when this cell runs.
fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(211)

rating_counts = train_df.groupby('discourse_effectiveness')['element_len'].count()
ax = rating_counts.sort_values().plot(kind="barh", ax=ax)

ax.set_title("Frequency of Discourse Effectivness", fontsize=14, fontweight = 'bold')
ax.set_xlabel("Frequency", fontsize = 10)
ax.set_ylabel("")

plt.tight_layout(pad=2)
plt.show()

In [None]:
# Tabulate how often each effectiveness rating occurs and print the table.
effectiveness_column = train_df["discourse_effectiveness"]
values = effectiveness_column.value_counts()
print(f"Discourse effectivness value counts:\n{values}")

# Wordclouds per Discourse type

In [None]:
# Number of records per discourse type (displayed as the cell's output).
train_df.discourse_type.value_counts()

In [None]:
# Draw one word cloud per discourse type. The types are listed explicitly so
# the figures appear in a fixed order: Evidence, Claim, Position,
# Concluding Statement, Lead, Counterclaim, Rebuttal.
for cloud_type in (
    "Evidence",
    "Claim",
    "Position",
    "Concluding Statement",
    "Lead",
    "Counterclaim",
    "Rebuttal",
):
    # All discourse texts annotated with the current type.
    words = train_df[train_df["discourse_type"] == cloud_type]["discourse_text"]

    # WordCloud.generate is given one string, so merge the texts with spaces.
    unique_string = " ".join(words)
    wordcloud = WordCloud(width = 1000, height = 500, collocations=False).generate(unique_string)

    plt.figure(figsize=(15,8))
    plt.title(f"Wordcloud for {cloud_type}")
    plt.imshow(wordcloud)
    # Pixel-coordinate ticks carry no meaning for a word cloud image.
    plt.axis("off")
    plt.show()

### Thank you for reading! I hope this was helpful!