# Data Description

The dataset contains argumentative essays written by U.S. students in grades 6-12. The essays were annotated by expert raters for elements commonly found in argumentative writing. The task is to predict the human annotations. You will first need to segment each essay into discrete rhetorical and argumentative elements (i.e., discourse elements) and then classify each element as one of the following:

**Lead** - an introduction that begins with a statistic, a quotation, a description, or some other device to grab the reader’s attention and point toward the thesis

**Position** - an opinion or conclusion on the main question

**Claim** - a claim that supports the position

**Counterclaim** - a claim that refutes another claim or gives an opposing reason to the position

**Rebuttal** - a claim that refutes a counterclaim

**Evidence** - ideas or examples that support claims, counterclaims, or rebuttals.

**Concluding Statement** - a concluding statement that restates the claims

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk every directory under the Kaggle input root. The inner loop body is
# `pass`, so no file names are printed or collected here.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        pass

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import nltk

# Load Data

In [None]:
# Load the competition's training CSV into a DataFrame and preview the
# first rows.
train_csv_path = "/kaggle/input/feedback-prize-2021/train.csv"
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Display the DataFrame's (row count, column count) tuple.
df.shape

# Libraries For Plot

In [None]:
import seaborn as sb
import matplotlib.pyplot as plt

In [None]:
# Bar chart of how many rows of `df` carry each discourse_type value.
plt.figure(figsize=(10, 5))
sb.set_style('whitegrid')
ax = sb.countplot(data=df, x='discourse_type', palette='rainbow')

# Re-apply the existing tick labels, tilted and right-aligned.
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=40, ha="right")

plt.xlabel('Discourse Type')
plt.tight_layout()
plt.show()

# Creating List And Array

In [None]:
# Training inputs: the text of every discourse element, as a plain list.
train_texts = list(df["discourse_text"])

In [None]:
# Training targets: the discourse type of every element, as a NumPy array.
train_labels = np.array(df["discourse_type"])

# Import Libraries

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Use TF-IDF Vectorizer and SelectKBest

In [None]:
# Turn the training texts into TF-IDF n-gram features, then keep only the
# `top_k` features that score highest under the ANOVA F-test (f_classif)
# against the training labels.

# n_gram range for tokenizing text (unigrams and bigrams).
ngram_range = (1, 2)

# Upper bound on the number of features kept after selection.
top_k = 20000

# Whether text should be split into word or character n-grams.
token_mode = 'word'

# Document/corpus frequency below which a token will be discarded.
MIN_df = 2

# Arguments for tf-idf vectorizer.
kwargs = {
    'ngram_range': ngram_range,
    # TF-IDF weights are fractional, so the dtype must be a float type.
    # A non-float dtype such as 'int32' makes scikit-learn emit a UserWarning
    # and use float64 anyway; requesting float64 gives the same values
    # without the warning.
    'dtype': np.float64,
    'strip_accents': 'unicode',
    'decode_error': 'replace',
    'analyzer': token_mode,  # Split text into word tokens.
    'min_df': MIN_df,
}

vectorizer = TfidfVectorizer(**kwargs)

# Learn vocabulary from training texts and vectorize training texts.
x_train = vectorizer.fit_transform(train_texts)

# Select top 'k' of the vectorized features; `min` guards against a
# vocabulary that is smaller than `top_k`.
selector = SelectKBest(f_classif, k=min(top_k, x_train.shape[1]))
selector.fit(x_train, train_labels)
x_train = selector.transform(x_train).astype('float32')

# Importing LinearSVC

In [None]:
from sklearn.svm import LinearSVC

# Fit a linear support-vector classifier, constructed with no arguments,
# on the selected TF-IDF features and their discourse-type labels.
model = LinearSVC()

model.fit(X=x_train, y=train_labels)

In [None]:
def create_test_list(test_dir="../input/feedback-prize-2021/test"):
    """Read every ``.txt`` essay in *test_dir* into a list of records.

    Args:
        test_dir: Directory holding the essay files. Defaults to the
            competition's test folder, so existing no-argument calls keep
            reading the same location.

    Returns:
        A list of dicts, one per ``.txt`` file, with keys ``'text'`` (the
        file's full contents) and ``'id'`` (the file name without its
        extension). Entries are ordered by file name.
    """
    total_list = []

    # os.listdir returns names in arbitrary order; sort so repeated runs
    # produce the essays (and therefore the output rows) in the same order.
    for filename in sorted(os.listdir(test_dir)):
        file_path = os.path.join(test_dir, filename)
        essay_id, extension = os.path.splitext(filename)

        # Skip sub-directories and anything that is not a .txt file.
        if extension != ".txt" or not os.path.isfile(file_path):
            continue

        # Decode explicitly instead of relying on the platform's locale
        # default. NOTE(review): assumes the essays are UTF-8 - confirm.
        with open(file_path, encoding="utf-8") as f:
            total_list.append({'text': f.read(), 'id': essay_id})

    return total_list

In [None]:
# Read the test essays; each entry holds one essay's text and id.
test_texts = create_test_list()

In [None]:
# Display the loaded test essays for a quick visual check.
test_texts

# Prediction

In [None]:
def _merge_sentence_predictions(sentences, labels):
    """Collapse runs of equal consecutive labels into word-index spans.

    Args:
        sentences: Sentence strings in essay order.
        labels: Predicted label for each sentence, aligned with *sentences*.

    Returns:
        A list of ``(label, predictionstring)`` tuples, one per run of
        consecutive sentences that share a label. ``predictionstring`` holds
        the space-separated indices of the whitespace-delimited words in
        that run, counted from 0 at the first word of the first sentence.
    """
    runs = []  # [label, [word indices]] for each run, in essay order
    next_word_index = 0
    for sentence, label in zip(sentences, labels):
        word_count = len(sentence.split())
        word_indices = range(next_word_index, next_word_index + word_count)
        next_word_index += word_count
        if runs and runs[-1][0] == label:
            # Same label as the previous sentence: extend the open run.
            runs[-1][1].extend(word_indices)
        else:
            runs.append([label, list(word_indices)])
    return [(label, " ".join(map(str, indices))) for label, indices in runs]


# One dict per predicted discourse element, across all test essays.
pred_dicts_list = []

for test_text in test_texts:

    # Classify the essay sentence by sentence.
    tokenized_sentences = nltk.sent_tokenize(test_text["text"])
    if not tokenized_sentences:
        # No sentences means nothing to classify. NOTE(review): the
        # scikit-learn transformers are expected to reject zero samples,
        # so skip rather than call them - confirm.
        continue

    x_test = vectorizer.transform(tokenized_sentences)
    x_test = selector.transform(x_test).astype('float32')
    preds = model.predict(x_test)  # One label per sentence.

    # Adjacent sentences with the same predicted class become one element.
    pred_dicts_list.extend(
        {
            "id": test_text["id"],
            "class": label,
            "predictionstring": prediction_string,
        }
        for label, prediction_string in _merge_sentence_predictions(
            tokenized_sentences, preds
        )
    )

# Submission

In [None]:
# Build the submission table. Naming the columns explicitly keeps the
# id/class/predictionstring header even when no predictions were produced;
# a DataFrame built from an empty list alone would have no columns.
submit = pd.DataFrame(pred_dicts_list, columns=["id", "class", "predictionstring"])

In [None]:
# Write the predictions to submission.csv, omitting the DataFrame index column.
submit.to_csv("submission.csv", index=False)