In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file Kaggle has mounted under the input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Data Description

The dataset presented here contains argumentative essays written by U.S. students in grades 6-12. These essays were annotated by expert raters for discourse elements commonly found in argumentative writing:

- Lead - an introduction that begins with a statistic, a quotation, a description, or some other device to grab the reader’s attention and point toward the thesis
- Position - an opinion or conclusion on the main question
- Claim - a claim that supports the position
- Counterclaim - a claim that refutes another claim or gives an opposing reason to the position
- Rebuttal - a claim that refutes a counterclaim
- Evidence - ideas or examples that support claims, counterclaims, or rebuttals.
- Concluding Statement - a concluding statement that restates the claims

Your task is to predict the quality rating of each discourse element. Human readers rated each rhetorical or argumentative element, in order of increasing quality, as one of:

- Ineffective
- Adequate
- Effective

**Training Data**

The training set consists of a .csv file containing the annotated discourse elements for each essay, including the quality ratings, together with .txt files containing the full text of each essay. It is important to note that some parts of the essays will be unannotated (i.e., they do not fit into one of the classifications above) and they will lack a quality rating. We do not include the unannotated parts in train.csv.

- train.csv - Contains the annotated discourse elements for all essays in the training set.

- discourse_id - ID code for discourse element
- essay_id - ID code for essay response. This ID code corresponds to the name of the full-text file in the train/ folder.
- discourse_text - Text of discourse element.
- discourse_type - Class label of discourse element.
- discourse_type_num - Enumerated class label of discourse element.
- discourse_effectiveness - Quality rating of discourse element, the target.

Example Test Data
To help you author submission code, we include a few example instances selected from the test set. When you submit your notebook for scoring, this example data will be replaced by the actual test data, including the sample_submission.csv file.

test/ - A folder containing an example essay from the test set. The actual test set comprises about 3,000 essays in a format similar to the training set essays. The test set essays are distinct from the training set essays.
test.csv - Annotations for the test set essays, containing all of the fields of train.csv except the target, discourse_effectiveness.
sample_submission.csv - A sample submission file in the correct format. See the Evaluation page for more details.

In [None]:
# Read the annotated training table and the (target-less) test table in one step.
train_df, test_df = (
    pd.read_csv("/kaggle/input/feedback-prize-effectiveness/train.csv"),
    pd.read_csv("/kaggle/input/feedback-prize-effectiveness/test.csv"),
)

In [None]:
# Preview the first rows of the training table.
train_df.head()

In [None]:
# Preview the first rows of the test table.
test_df.head()

## EDA

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Suppress every warning for the rest of the session. NOTE: this also hides
# deprecation notices raised by the plotting libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Missing-value check: per-column null counts, then a heatmap of the null mask.
null_mask = train_df.isnull()
print(null_mask.sum())
print("-" * 50)
sns.heatmap(null_mask);

In [None]:
# EDA for discourse_type: class frequencies as a table, then as a bar chart.
print(train_df['discourse_type'].value_counts())
print("-"*50)
# Pass the column by keyword. Recent seaborn releases accept only `data` as a
# positional argument, so a bare Series is no longer interpreted as `x`.
sns.countplot(x=train_df['discourse_type']);
# Rotate the tick labels so the discourse type names do not overlap.
plt.xticks(rotation=90);

In [None]:
# EDA for discourse_effectiveness (the target): frequencies as a table, then as a bar chart.
print(train_df['discourse_effectiveness'].value_counts())
print("-"*50)
# Pass the column by keyword. Recent seaborn releases accept only `data` as a
# positional argument, so a bare Series is no longer interpreted as `x`.
sns.countplot(x=train_df['discourse_effectiveness']);

# we can see that we have unbalanced data

### Generate Statistical Count Features

In [None]:
!pip install chart_studio

In [None]:
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly
import chart_studio.plotly as py
from plotly import tools
# Initialise plotly's offline notebook mode before any iplot() call.
# NOTE(review): connected=True presumably loads plotly.js from the web rather
# than embedding it in the notebook — confirm against the plotly docs.
init_notebook_mode(connected=True)

import string
# Punctuation characters used for the punc_count feature computed below.
punc = string.punctuation

In [None]:
# Work on a copy so the feature columns added below do not alter train_df.
df = train_df.copy()
df.head()

In [None]:
# Derive simple length statistics from the text of each discourse element.
texts = df['discourse_text']

# Number of whitespace-separated tokens.
df['word_count'] = texts.map(lambda t: len(t.split()))
# Number of characters once plain spaces are removed.
df['char_count'] = texts.map(lambda t: len(t.replace(" ", "")))
# Words per character; the +1 keeps the denominator non-zero.
df['word_density'] = df['word_count'] / (df['char_count'] + 1)
# Number of characters that belong to string.punctuation.
df['punc_count'] = texts.map(lambda t: sum(1 for ch in t if ch in punc))

df[['word_count', 'char_count', 'word_density', 'punc_count']].head(10)

**Word count Distribution of discourse_text**

In [None]:
# Histogram of the number of words per discourse element.
words = df['word_count']
trace1 = go.Histogram(
    x=words,
    opacity=0.65,
    name="Word Count",
    marker={'color': 'rgba(17, 50, 96, 0.6)'},
)
data = [trace1]
layout = go.Layout(
    barmode='overlay',
    title='Word Count of discourse_text',
    xaxis={'title': 'Word Count'},
    yaxis={'title': 'Number of discourse_text'},
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

**Character count Distribution**

In [None]:
# Histogram of the number of non-space characters per discourse element.
chars = df.char_count
# The trace name now matches the plotted feature (it was left as "Word Count").
trace1 = go.Histogram(x=chars, opacity=0.65, name="Char Count", marker=dict(color='rgba(120, 40, 106, 0.6)'))
data = [trace1]
layout = go.Layout(barmode='overlay',
                   title='Char Count of discourse_text',
                   xaxis=dict(title='Char Count'),
                   yaxis=dict( title='Number of discourse_text'))
fig = go.Figure(data=data, layout=layout)
iplot(fig)

**Word Density**

In [None]:
# Histogram of word density (word_count / (char_count + 1)) per discourse element.
wd = df.word_density
# The trace name now matches the plotted feature (it was left as "Word Count").
trace1 = go.Histogram(x=wd, opacity=0.65, name="Word Density", marker=dict(color='rgba(0, 120, 0, 0.6)'))
data = [trace1]
layout = go.Layout(barmode='overlay',
                   title='Word Density of discourse_text',
                   xaxis=dict(title='Word Density'),
                   yaxis=dict( title='Number of discourse_text'))
fig = go.Figure(data=data, layout=layout)
iplot(fig)

**Punctuation Count**

In [None]:
# Histogram of the number of punctuation characters per discourse element.
punc_count = df.punc_count
# The trace name now matches the plotted feature (it was left as "Word Count").
trace1 = go.Histogram(x=punc_count, opacity=0.75, name="Punctuation Count", marker=dict(color='rgba(10, 22, 200, 0.6)'))
data = [trace1]
layout = go.Layout(barmode='overlay',
                   title='Punctuation Count of discourse_text',
                   xaxis=dict(title='Punctuation Count'),
                   yaxis=dict( title='Number of discourse_text'))
fig = go.Figure(data=data, layout=layout)
iplot(fig)

## Generating WordCloud

In [None]:
# List the columns of df, including the count features added above.
df.columns

In [None]:
# Frequency of each discourse_effectiveness label in df.
df['discourse_effectiveness'].value_counts()

In [None]:
def generate_text(text, data=None):
    """Concatenate the discourse texts that carry a given effectiveness label.

    Parameters
    ----------
    text : str
        Value of ``discourse_effectiveness`` to select (e.g. ``"Adequate"``).
    data : pandas.DataFrame, optional
        Frame with ``discourse_effectiveness`` and ``discourse_text`` columns.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    str
        All matching ``discourse_text`` values joined by single spaces, or an
        empty string when no row matches.
    """
    if data is None:
        data = df
    type_text = data[data['discourse_effectiveness'] == text]
    # str(Series) returns pandas' truncated display repr (index labels, "...",
    # "Name: ..., dtype: object") rather than the texts, so join the values.
    value = " ".join(type_text['discourse_text'].dropna().astype(str))
    return value

In [None]:
from wordcloud import WordCloud

# Text for the "All 3" cloud. The values are joined explicitly because
# str(Series) returns pandas' truncated display repr (index labels, "...",
# "Name: ..., dtype: object") instead of the discourse texts themselves.
all_text = " ".join(df['discourse_text'].dropna().astype(str))

# One cloud per effectiveness label, plus one over the whole corpus.
wordcloud1 = WordCloud(background_color='white').generate(generate_text("Adequate"))
wordcloud2 = WordCloud(background_color='white').generate(generate_text("Effective"))
wordcloud3 = WordCloud(background_color='white').generate(generate_text("Ineffective"))
wordcloud4 = WordCloud(background_color='white').generate(all_text)

fig, axes = plt.subplots(2, 2, figsize=(18, 10))

# axes.flat walks the 2x2 grid row by row, so the panels appear in this order.
panels = [
    ("Adequate", wordcloud1),
    ("Effective", wordcloud2),
    ("Ineffective", wordcloud3),
    ("All 3", wordcloud4),
]
for ax, (title, cloud) in zip(axes.flat, panels):
    ax.imshow(cloud)
    ax.axis('off')
    ax.set_title(title, fontsize=30)

-- We can see that the word "people" takes up the most space in the "All 3" plot and is also found in the "Adequate" plot; that is because we have more data for "Adequate".

In [None]:
"""
    That's it for now, next i'm going to work on model creation, Stay tuned for that..
"""