#Import Libraries

In [None]:
import pandas as pd
import numpy as np

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import re
from string import punctuation

from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import TfidfVectorizer

#Import kaggle data

In [None]:
# Comments that have to be scored for the submission.
comments = pd.read_csv(
    "../input/jigsaw-toxic-severity-rating/comments_to_score.csv"
)

# Comparisons of more and less toxic comments (can be used for evaluation).
val_data = pd.read_csv(
    "../input/jigsaw-toxic-severity-rating/validation_data.csv"
)

# Data from a previous competition (to train and evaluate the model).
previous_train_reg = pd.read_csv(
    "../input/jigsaw-regression-based-data/train_data_version2.csv"
)

#Data Pre-Processing and Analysis

The first step to handle the task is to do basic text-preprocessing. Text preprocessing steps include a few essential tasks to further clean the available text data. It includes tasks like:


1.   **Stop-Word Removal:** In English, words like a, an, the, as, in, on, etc. are considered stop-words. They carry little specific meaning or information, so they can be removed to reduce the vocabulary size.
2.   **Lower Casing:** Convert all words to lower case, because upper or lower case may not make a difference for the problem. Doing so further reduces the vocabulary size.
3.   **URL and Punctuation Removal:** Remove URLs and punctuation because they don't carry any information for the problem. Doing so further reduces the vocabulary size.






In [None]:
# Preview the first rows of the comments that have to be scored.
comments.head()

In [None]:
# Preview the first rows of the validation pairs (less_toxic / more_toxic).
val_data.head()

In [None]:
# Preview the first rows of the training data from the previous competition.
previous_train_reg.head()

##Check for NaNs and remove empty rows (in case they exist)

In [None]:
# Drop, in place, every row that is missing one of the text columns used later.
# Each entry pairs a DataFrame with the columns that must not be NaN.
for _frame, _required_columns in (
    (comments, ['text']),
    (val_data, ['less_toxic', 'more_toxic']),
    (previous_train_reg, ['text']),
):
    _frame.dropna(subset=_required_columns, inplace=True)

##Preprocess Data: remove stopwords, URLs (in case they exist), punctuation...

In [None]:
# English stop words from NLTK. Stored as a set so that the per-word
# `in stop_words` membership test in preprocess_data is a hash lookup
# instead of a linear scan over the whole list.
stop_words = set(stopwords.words('english'))

In [None]:
def clean_text(text):
    """Return *text* as a lowercase string with URLs removed.

    The input is converted with ``str()`` first, so non-string values are
    accepted. Anything starting with ``http://``, ``https://`` or ``www.``
    is deleted up to the next whitespace character; the surrounding
    whitespace is left untouched.
    """
    lowered = str(text).lower()
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', lowered)

def preprocess_data(text):
    """Normalize a comment and drop stop words, numbers and edge punctuation.

    The text is first passed through ``clean_text`` (lowercasing, URL
    removal) and split on whitespace. Leading and trailing punctuation is
    stripped from every token *before* filtering, so tokens such as
    ``"the,"`` or ``"2020."`` are recognised as a stop word / a number and
    removed. Tokens made up only of punctuation become empty and are dropped
    too, so the result never contains empty tokens or doubled spaces.
    Punctuation inside a token (e.g. the apostrophe in ``"don't"``) is kept.

    Args:
        text: The raw comment; any value accepted by ``clean_text``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    stripped_tokens = (
        word.strip(punctuation) for word in clean_text(text).split()
    )
    return ' '.join(
        token
        for token in stripped_tokens
        if token and not token.isdigit() and token not in stop_words
    )

In [None]:
# Run preprocess_data over every raw text column and store the result in a
# new column. Each entry is (DataFrame, source column, target column).
for _frame, _source_column, _target_column in (
    (comments, 'text', 'preprocessed_text'),
    (val_data, 'less_toxic', 'preprocessed_less_toxic'),
    (val_data, 'more_toxic', 'preprocessed_more_toxic'),
    (previous_train_reg, 'text', 'preprocessed_text'),
):
    _frame[_target_column] = _frame[_source_column].apply(preprocess_data)

#Creating an Embedding and Using regression methods to evaluate the comments

##Creating an Embedding Using TfidfVectorizer

"TF-IDF is a statistical measure used to determine the mathematical significance of words in documents. The vectorization process is similar to One Hot Encoding. Alternatively, the value corresponding to the word is assigned a TF-IDF value instead of 1. The TF-IDF value is obtained by multiplying the TF and IDF values." (Word Embedding Techniques: Word2Vec and TF-IDF Explained by Adem Akdogan (Jul 22, 2021))

In [None]:
# TF-IDF over character n-grams instead of whole words.
vec = TfidfVectorizer(
    analyzer='char_wb',    # character n-grams taken from inside word boundaries
    max_df=0.7,            # ignore n-grams found in more than 70% of the documents
    min_df=1,              # keep n-grams that occur in at least one document
    ngram_range=(2, 5),    # n-gram lengths from 2 to 5 characters
)

In [None]:
# Learn vocabulary and IDF from the preprocessed training text and, in the
# same call, transform that text into the TF-IDF feature matrix X.
X = vec.fit_transform(previous_train_reg['preprocessed_text'])

In [None]:
# Regression target: the 'y' column of the training data.
y = previous_train_reg['y']

In [None]:
# Ridge regression (regularization strength alpha=0.1), fitted on the TF-IDF
# features X and the targets y.
reg = Ridge(alpha=0.1)
reg.fit(X, y)

##Create Submission File

In [None]:
# Vectorize the comments to score with the vectorizer fitted on the training
# text (transform only, so the vocabulary and IDF are not re-learned).
eval_vec_reg = vec.transform(comments['preprocessed_text'])

In [None]:
# Create predictions: one regression output per comment to score.
eval_preds_reg = reg.predict(eval_vec_reg)

In [None]:
# Create the submission file.
# NOTE: `sub` is the same DataFrame object as `comments` (no copy is made),
# so the 'score' column is added to `comments` as well.
sub = comments
sub['score'] = eval_preds_reg

# Write only the id and the predicted score, without the index column.
sub.to_csv("submission.csv", columns=['comment_id', 'score'], index=False)