# Evaluating Student Writing EDA Notebook 📝

# Import dependencies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from termcolor import colored
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm

# Load the training data from the competition dataset 🔍

In [None]:
# Load the competition's training annotations (train.csv) into a DataFrame.
df = pd.read_csv('../input/feedback-prize-2021/train.csv')

In [None]:
df.head(5)

# Each text has an id, discourse_id and the starting/ending point of each discourse, and the type. Let's find how many of each type there are.

In [None]:
# Count the annotated rows per discourse type and plot them, smallest first.
type_counts = df.groupby("discourse_type")['id'].count().sort_values()
plt.figure(figsize=(10, 5))
type_counts.plot.barh()
plt.show()

## Claim is the most frequent discourse type, rebuttal is the least.

## How many characters on average per type?

In [None]:
# text_len is the length of each discourse text in characters (len of the string).
df['text_len'] = df['discourse_text'].map(len)

In [None]:
# Mean discourse length (text_len) per discourse type, shortest first.
mean_len_by_type = df.groupby('discourse_type')['text_len'].mean().sort_values()
plt.figure(figsize=(10, 5))
mean_len_by_type.plot.barh()
plt.show()

* ##   Claim is the most frequent type, but it is also the shortest type of discourse. 🤔
* ## Evidence is the second most frequent and the longest type.

In [None]:
df.groupby('discourse_type')['text_len'].describe()

## Let's check the outliers.

In [None]:
# Claims whose text_len is below 10 (text_len is len() of discourse_text).
df[(df['discourse_type'] == 'Claim') & (df['text_len'] < 10)]

## Finding a text

In [None]:
text = df[(df['discourse_type'] == 'Claim') & (df['text_len'] < 10)].iloc[0]['id']

In [None]:
text = open(f'../input/feedback-prize-2021/train/{text}.txt')

In [None]:
print(text.read())

# The DataFrame has information about the start and end of each discourse, so let's create a function that prints the text with each type in a different colour.

In [None]:
def print_type_color(idx, df):
    """Print every annotated discourse of one essay, colour-coded by type.

    For each row of ``df`` whose ``id`` equals ``idx``, the discourse type is
    printed in bold, followed by the slice of the essay text between
    ``discourse_start`` and ``discourse_end`` in that type's colour.

    Args:
        idx: Essay id. It is also the stem of the essay's ``.txt`` file under
            ``../input/feedback-prize-2021/train/``.
        df: DataFrame with ``id``, ``discourse_start``, ``discourse_end`` and
            ``discourse_type`` columns.

    Raises:
        FileNotFoundError: If the essay file does not exist.
        KeyError: If a discourse type has no entry in the colour table.
    """
    essay_id = idx
    spans = df.loc[
        df['id'] == essay_id,
        ['discourse_start', 'discourse_end', 'discourse_type'],
    ]

    # One distinct colour per discourse type, so adjacent Claims and
    # Rebuttals can be told apart.
    colors = {
        'Lead': 'red',
        'Position': 'blue',
        'Evidence': 'green',
        'Claim': 'magenta',
        'Counterclaim': 'grey',
        'Rebuttal': 'yellow',
        'Concluding Statement': 'cyan',
    }

    # Context manager guarantees the file handle is closed after reading.
    with open(f'../input/feedback-prize-2021/train/{essay_id}.txt') as essay_file:
        text = essay_file.read()

    for start, end, discourse_type in spans.itertuples(index=False, name=None):
        # Offsets may be stored as floats; slicing needs integers.
        start, end = int(start), int(end)
        print(colored(discourse_type, attrs=['bold']))
        print(colored(text[start:end], colors[discourse_type]))

# Testing the function, it takes an id as an input.

In [None]:
print_type_color('423A1CA112E2',df)

# Let's explore some of the outliers on the data (i.e. discourse type with a small length)

In [None]:
# Same filter as before: Claims whose text_len is below 10.
df[(df['discourse_type'] == 'Claim') & (df['text_len'] < 10)]

In [None]:
print_type_color('4B6C254FEE39',df)

# This particular essay has 4 Claims, and even though they are next to each other they are stored as separate rows. Let's fix this.

## We will create a function that, given an essay "id", selects that essay's rows and merges consecutive discourses of the same type into a single row.

In [None]:
def join_types(idx, df):
    """Merge consecutive same-type discourses of one essay into single rows.

    The rows of ``df`` whose ``id`` equals ``idx`` are scanned in order and
    split into runs of adjacent rows sharing the same ``discourse_type``.
    Each run becomes one output row, including the final run of the essay.

    Each output row is a copy of the run's last row, except that:

    * ``discourse_start`` comes from the run's first row, and
    * ``predictionstring`` is the run's prediction strings joined by a space.

    NOTE: every other column (e.g. ``discourse_id``, ``discourse_text``) keeps
    the value of the run's last row and is not recomputed for the merged span.

    Args:
        idx: Essay id to select in the ``id`` column.
        df: DataFrame with at least ``id``, ``discourse_start``,
            ``discourse_type`` and ``predictionstring`` columns.

    Returns:
        A DataFrame with the same columns as ``df`` and one row per run,
        indexed by the original label of each run's last row. It is empty if
        no row matches ``idx``.
    """
    text_df = df[df['id'] == idx]
    columns = list(text_df.columns)
    if text_df.empty:
        return pd.DataFrame(columns=columns)

    # A new run starts wherever the type differs from the previous row;
    # the cumulative sum of those change points labels each run.
    types = text_df['discourse_type']
    run_ids = (types != types.shift()).cumsum().to_numpy()

    merged_rows = []
    for _, run in text_df.groupby(run_ids, sort=False):
        # Copy so the assignments below never write into a view of ``df``.
        row = run.iloc[-1].copy()
        row['discourse_start'] = run.iloc[0]['discourse_start']
        row['predictionstring'] = ' '.join(run['predictionstring'])
        merged_rows.append(row)

    return pd.DataFrame(merged_rows, columns=columns)

# Testing on the previous case

In [None]:
df2 = join_types('4B6C254FEE39',df)

In [None]:
df2.head()

In [None]:
print_type_color('4B6C254FEE39',df2)

# Great! Now the separated Claims are merged into one! 😬

# Let's apply it to all the data.

In [None]:
def merge_by_id(df):
    """Apply ``join_types`` to every essay in ``df`` and stack the results.

    Essays are processed in order of first appearance of their ``id``. Each
    call to ``join_types`` receives only that essay's rows, so the full frame
    is not rescanned for every id.

    Args:
        df: DataFrame accepted by ``join_types`` (must have an ``id`` column).

    Returns:
        A DataFrame with the same columns as ``df`` holding the merged rows of
        all essays. It is empty if ``df`` has no rows.
    """
    # Collect the per-essay frames and concatenate once; growing a DataFrame
    # row-block by row-block copies all accumulated data on every step.
    pieces = [
        join_types(essay_id, essay_rows)
        for essay_id, essay_rows in tqdm(df.groupby('id', sort=False))
    ]
    if not pieces:
        return pd.DataFrame(columns=list(df.columns))
    return pd.concat(pieces)

In [None]:
# Merge adjacent same-type discourses for every essay (calls join_types once per id).
clean_df = merge_by_id(df)

In [None]:
len(df)

In [None]:
len(clean_df)

# The function eliminates around 40k rows, wow! 😮

In [None]:
clean_df.head()

# Using the same exploration as before.

In [None]:
# Rows per discourse type after merging, smallest first.
merged_type_counts = clean_df.groupby("discourse_type")['id'].count().sort_values()
plt.figure(figsize=(10, 5))
merged_type_counts.plot.barh()
plt.show()

# Now Evidence is the most common type!

In [None]:
# Plot the mean length per type for the *merged* data (clean_df, not df).
# text_len was computed before merging, so it does not describe the merged
# spans; measure each span from its start/end offsets instead (the same
# offsets print_type_color uses to slice the essay text).
merged_len_df = clean_df.assign(
    text_len=lambda d: d['discourse_end'].astype(float) - d['discourse_start'].astype(float)
)
plt.figure(figsize=(10,5))
merged_len_df.groupby('discourse_type')['text_len'].mean().sort_values().plot(kind='barh')
plt.show()

# Let's write it into a .csv

In [None]:
# Write the merged annotations to clean_data.csv, omitting the DataFrame index.
clean_df.to_csv('clean_data.csv',index=False)

# The function to clean the data took about 23 minutes, feel free to use it. 🤛