# 1. Import & Set & Load

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Directory that holds the competition CSV files.
COMP_DIR = "../input/feedback-prize-effectiveness/"

# pandas display settings: limit text columns to 80 characters and
# show floats with 3 digits of precision.
pd.options.display.max_colwidth = 80
pd.options.display.precision = 3

# Default size for every matplotlib figure.
plt.rcParams["figure.figsize"] = [12, 6]

In [None]:
# Read the three competition tables in one pass (same order as before:
# sample submission, test, train).
submission, test, train = (
    pd.read_csv(COMP_DIR + file_name)
    for file_name in ("sample_submission.csv", "test.csv", "train.csv")
)

### Description:
* **sample_submission.csv** - A sample submission file in the correct format.
* **test.csv** - Annotations for the test set essays, containing all of the fields of train.csv except the target, discourse_effectiveness.
* **train.csv** - Contains the annotated discourse elements for all essays in the training set, including the target, discourse_effectiveness.

# 2. Simple check

In [None]:
# (label, DataFrame) pairs iterated by the overview cells below.
check_data = list(zip(
    ('submission', 'test data', 'train data'),
    (submission, test, train),
))

In [None]:
# *** Columns ***
# One tab-separated line per table: padded label, then every column name.
for name, df in check_data:
    print('\t'.join(map(str, (f'{name:<11}:', *df.columns))))

### Description:
* **discourse_id** - ID code for discourse element
* **essay_id** - ID code for essay response. This ID code corresponds to the name of the full-text file in the train/ folder.
* **discourse_text** - Text of discourse element.
* **discourse_type** - Class label of discourse element.
* **discourse_type_num** - Enumerated class label of discourse element.
* **discourse_effectiveness** - Quality rating of discourse element, the target.

In [None]:
# *** Shape ***
# (rows, columns) of each table, label padded to 12 characters.
for name, df in check_data:
    print('{:<12}: {}'.format(name, df.shape))

In [None]:
# *** Head ***
# Show the first n_row rows of each table under a banner line.
n_row = 3
for name, df in check_data:
    print('===', name.upper(), '===')
    display(df.iloc[:n_row])

In [None]:
# *** Describe ***
# Summary statistics of each table under a banner line.
for name, df in check_data:
    print('=== {} ==='.format(name.upper()))
    display(df.describe())

# 3. Check type-effectiveness

### discourse_type

These essays were annotated by expert raters for discourse elements commonly found in argumentative writing:

* **Lead** - an introduction that begins with a statistic, a quotation, a description, or some other device to grab the reader’s attention and point toward the thesis
* **Position** - an opinion or conclusion on the main question
* **Claim** - a claim that supports the position
* **Counterclaim** - a claim that refutes another claim or gives an opposing reason to the position
* **Rebuttal** - a claim that refutes a counterclaim
* **Evidence** - ideas or examples that support claims, counterclaims, or rebuttals.
* **Concluding Statement** - a concluding statement that restates the claims

### discourse_effectiveness

Human readers rated each rhetorical or argumentative element, in order of increasing quality, as one of:

1. **Ineffective**
2. **Adequate**
3. **Effective**

In [None]:
# Shared crosstab arguments: discourse types as rows, effectiveness as columns.
params_type = dict(
    index=train['discourse_type'],
    columns=train['discourse_effectiveness'],
)

In [None]:
# Raw counts per (type, effectiveness) pair, with 'All' row/column totals.
pd.crosstab(**params_type, margins=True, margins_name='All')

In [None]:
# Percentages normalised per column (each effectiveness column sums to 100).
(pd.crosstab(**params_type, normalize='columns') * 100).round(2)

In [None]:
# Percentages normalised per row (each discourse-type row sums to 100).
(pd.crosstab(**params_type, normalize='index') * 100).round(2)

In [None]:
# Stacked horizontal bars: one bar per discourse type, split by effectiveness.
pd.crosstab(**params_type).plot(kind='barh', stacked=True);

In [None]:
# Shared crosstab arguments: effectiveness as rows, discourse types as columns.
params_effective = dict(
    index=train['discourse_effectiveness'],
    columns=train['discourse_type'],
)

In [None]:
# Raw counts per (effectiveness, type) pair, with 'All' row/column totals.
pd.crosstab(**params_effective, margins=True, margins_name='All')

In [None]:
# Percentages normalised per column (each discourse-type column sums to 100).
(pd.crosstab(**params_effective, normalize='columns') * 100).round(2)

In [None]:
# Percentages normalised per row (each effectiveness row sums to 100).
(pd.crosstab(**params_effective, normalize='index') * 100).round(2)

In [None]:
# Stacked horizontal bars: one bar per effectiveness rating, split by type.
pd.crosstab(**params_effective).plot(kind='barh', stacked=True);

# 4. Check columns (train data)

In [None]:
# From here on `df` is an alias for the training table.
df = train
df.head(5)

In [None]:
# Per-column report for `df`: length, number of distinct values, a sample
# or the list of values, and a frequency plot/table where it is informative.
max_str_len = 55  # sample strings longer than this are truncated when printed
max_n_value = 10  # columns with fewer distinct values are listed in full

for name in df.columns:
    col = df[name]
    col_len = len(col)
    col_unique = col.nunique()
    # Count the values once and reuse the result for every statistic,
    # plot and table below instead of recomputing it each time.
    counts = col.value_counts()
    col_freq = counts.max()

    print(f'\n=== "{name}" column ===\n')
    print(f'len:      {col_len}')
    print(f'unique:   {col_unique}')

    if max_n_value > col_unique > 1:
        # Few distinct values: print them all and plot their frequencies.
        print('values:  ', *col.unique())
        print(f'max freq: {col_freq}\n')
        counts.plot.barh()
        plt.title(f'Frequency {name}')
        plt.show()

        display(counts.nlargest(col_unique).to_frame('top freq'))
    else:
        # Many distinct values (or at most one): print a single sample.
        # Positional access keeps this working when the index has no
        # label 0 (e.g. after filtering); an empty column yields None.
        first_value = col.iloc[0] if col_len else None

        if isinstance(first_value, str) and len(first_value) > max_str_len:
            first_value = first_value[:max_str_len] + ' ...'

        print('sample:  ', first_value)

        print(f'max freq: {col_freq}')
        if col_freq > 1:
            # Some values repeat: show how often, plus the top repeaters.
            print()
            counts.hist(bins=col_freq)
            plt.title(f'Frequency {name}')
            plt.show()

            display(counts.nlargest(max_n_value).to_frame('top freq'))

# 5. Check discourse_text (train data)

## 5.1 String length

In [None]:
disc_col = "discourse_text"

# Character lengths of each discourse text: as stored ('origin'), after
# stripping surrounding whitespace ('trim'), and their difference ('diff').
discourse_len = pd.DataFrame({'text': df[disc_col]}).assign(
    origin=lambda frame: frame['text'].str.len(),
    trim=lambda frame: frame['text'].str.strip().str.len(),
    diff=lambda frame: frame['origin'] - frame['trim'],
)

discourse_len.head()

In [None]:
# Longest texts first.
discourse_len.sort_values(by='origin', ascending=False)

In [None]:
# Distribution of the original text lengths.
discourse_len['origin'].plot(kind='hist', bins=40);

In [None]:
# Length threshold (in characters) used to split the histograms below.
check_max_len = 1_000

In [None]:
# *** Texts longer than check_max_len ***
mask = discourse_len['origin'].gt(check_max_len)
discourse_len['origin'][mask].plot(kind='hist', bins=50);

In [None]:
# *** Texts of at most check_max_len characters ***
mask = discourse_len['origin'].le(check_max_len)
discourse_len['origin'][mask].plot(kind='hist', bins=50);

In [None]:
# Texts with the most surrounding whitespace first.
discourse_len.sort_values(by='diff', ascending=False)

In [None]:
# Histogram of the whitespace surplus, one bin per possible value.
max_diff = discourse_len['diff'].max()
# The bin count must be a positive integer; clamp it so the plot still
# works when no text has surrounding whitespace (max_diff == 0).
discourse_len['diff'].plot.hist(bins=max(int(max_diff), 1));

In [None]:
# The ten rows with the largest whitespace surplus.
top_diff = discourse_len.nlargest(n=10, columns='diff')
top_diff

In [None]:
# Print the raw and the stripped text (as repr, so whitespace is visible)
# for the three rows with the largest whitespace surplus.
check_discourse = top_diff.index[:3]
for rank, row_label in enumerate(check_discourse, start=1):
    raw_text = df.loc[row_label, disc_col]
    print(f'\nN{rank} === index: {row_label} ===')
    for title, text in (('origin', raw_text), ('trimmed', raw_text.strip())):
        print(f'\n>>> {title} text (len {len(text)}):')
        print(repr(text))

## 5.2 Number of words

In [None]:
def get_words(string):
    """Split *string* into words with edge punctuation removed.

    The text is split on any whitespace, then the characters ``; - , . " '``
    are stripped from both ends of every token (punctuation inside a token,
    such as the apostrophe in "i'm", is kept). Tokens that consist only of
    those characters (e.g. a lone "-" or "...") are dropped so that they are
    not counted as words.

    Returns a list of the remaining words in their original order.
    """
    pattern = ';-,."\''
    stripped = (token.strip(pattern) for token in string.split())
    return [word for word in stripped if word]


def get_nwords(string):
    """Return the number of words in *string*, as produced by get_words."""
    return len(get_words(string))

In [None]:
# Try the tokenizer on a sample with commas, apostrophes, a semicolon and blank lines.
get_words("Hi, i'm Isaac, i'm going' to; be writing\n\n about")

In [None]:
# Word count for the same sample text.
get_nwords("Hi, i'm Isaac, i'm going' to; be writing\n\n about")

In [None]:
# Word count of every discourse text, next to its type and rating.
discourse_nwords = pd.DataFrame(dict(
    text=df[disc_col],
    number=df[disc_col].map(get_nwords),
    type=df['discourse_type'],
    effective=df['discourse_effectiveness'],
))

discourse_nwords.head()

In [None]:
# Texts with the most words first.
discourse_nwords.sort_values(by='number', ascending=False)

In [None]:
# Distribution of the word counts.
discourse_nwords['number'].plot(kind='hist', bins=40);

In [None]:
# Word-count threshold used to split the views below.
check_min_number = 3

In [None]:
# *** Texts with at most check_min_number words ***
mask = discourse_nwords['number'].le(check_min_number)
discourse_nwords['number'][mask].plot(kind='hist', bins=check_min_number);

In [None]:
# Rows selected by the current mask, most words first.
discourse_nwords[mask].sort_values(by='number', ascending=False)

In [None]:
# *** Texts with more than check_min_number words ***
mask = discourse_nwords['number'].gt(check_min_number)
discourse_nwords['number'][mask].plot(kind='hist', bins=50);

In [None]:
# Rows selected by the current mask, most words first.
discourse_nwords[mask].sort_values(by='number', ascending=False)

# 6. Check discourse_type (train data)

In [None]:
disc_col, disc_id, essay_id = "discourse_type", "discourse_id", "essay_id"

# Number of discourse elements of each type per essay; the 'All types'
# margin adds the per-essay and per-type totals.
type_info = df.pivot_table(
    values=disc_id,
    index=essay_id,
    columns=[disc_col],
    aggfunc='count',
    fill_value=0,
    margins=True,
    margins_name='All types',
)

type_info

In [None]:
# Inspect every training row of the first essay listed in type_info.
check_essay_id = type_info.index[0]
print(essay_id, check_essay_id, sep=': ')

df[df[essay_id].eq(check_essay_id)]

In [None]:
# Total number of discourse elements per essay (same pivot as type_info,
# without margins, summed across the type columns) and its extremes.
freq_type_info = df.pivot_table(
    values=disc_id,
    index=essay_id,
    columns=[disc_col],
    aggfunc='count',
    fill_value=0,
).sum(axis='columns')
min_freq, max_freq = freq_type_info.min(), freq_type_info.max()

In [None]:
# Essays with the largest number of discourse elements.
type_info[type_info['All types'].eq(max_freq)]

In [None]:
# Essays with the smallest number of discourse elements.
type_info[type_info['All types'].eq(min_freq)]

In [None]:
# How many essays have each number of discourse elements.
freq_type_info.value_counts().plot(kind='barh');

# 7. Check discourse_effectiveness (train data)

In [None]:
disc_col, disc_id, essay_id = "discourse_effectiveness", "discourse_id", "essay_id"

# Number of discourse elements with each effectiveness rating per essay;
# the margin (labelled 'All types') adds the per-essay and per-rating totals.
type_effective = df.pivot_table(
    values=disc_id,
    index=essay_id,
    columns=[disc_col],
    aggfunc='count',
    fill_value=0,
    margins=True,
    margins_name='All types',
)

type_effective

In [None]:
# Inspect every training row of the first essay listed in type_effective.
check_essay_id = type_effective.index[0]
print(essay_id, check_essay_id, sep=': ')

df[df[essay_id].eq(check_essay_id)]

In [None]:
# Total number of rated discourse elements per essay (same pivot as
# type_effective, without margins, summed across the rating columns)
# and its extremes.
freq_type_effective = df.pivot_table(
    values=disc_id,
    index=essay_id,
    columns=[disc_col],
    aggfunc='count',
    fill_value=0,
).sum(axis='columns')
min_freq, max_freq = freq_type_effective.min(), freq_type_effective.max()

In [None]:
# Essays with the largest number of rated discourse elements.
type_effective[type_effective['All types'].eq(max_freq)]

In [None]:
# Essays with the smallest number of rated discourse elements.
type_effective[type_effective['All types'].eq(min_freq)]

In [None]:
# How many essays have each number of rated discourse elements.
freq_type_effective.value_counts().plot(kind='barh');