In [None]:
from clean_text import clean_text
from process import process

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import re  # library for regular expression operations
import string
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import string  # for string operations

from nltk.corpus import stopwords  # module for stop words that come with NLTK
from nltk.stem import PorterStemmer  # module for stemming
from nltk.tokenize import regexp_tokenize  # module for tokenizing strings
from nltk.tokenize import TreebankWordTokenizer
import nltk  # Python library for NLP

import sys
import warnings

# Suppress every warning for the remainder of the session.
warnings.simplefilter(action="ignore")

# Allow call stacks up to 3000 frames deep.
sys.setrecursionlimit(3000)

In [None]:
import re  # library for regular expression operations

import string  # for string operations

def clean_text(text):
    """Normalise a raw comment/tweet string for downstream tokenisation.

    Steps, in order:
      1. strip a leading retweet marker ("RT" followed by whitespace),
      2. lowercase,
      3. drop text in square brackets,
      4. drop stock tickers such as ``$GE``,
      5. drop URLs (``http(s)://...`` and ``www....``),
      6. drop the ``#`` sign (the hashtag word itself is kept),
      7. drop HTML-like tags,
      8. drop ASCII punctuation,
      9. turn newlines into spaces,
      10. drop any word containing a digit.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned, lowercased text. Whitespace left behind by removed
        fragments is not collapsed.
    """
    # The retweet marker is upper-case, so it has to be stripped *before*
    # lowercasing; afterwards the pattern could never match.
    text = re.sub(r'^RT\s+', '', text)
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    # remove stock market tickers like $GE
    text = re.sub(r'\$\w*', '', text)
    # remove hyperlinks -- only the URL token itself, not the rest of the line
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # remove hashtags
    # only removing the hash # sign from the word
    text = re.sub(r'#', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space so words on adjacent lines are not glued
    # together ("hello\nworld" must not become "helloworld").
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)

    return text




In [None]:
   def stem_and_stopwords(comment):
       """Tokenise a comment, drop stop words and punctuation, and stem the rest.

       Parameters
       ----------
       comment : str
           The text to process.

       Returns
       -------
       list of str
           Porter-stemmed tokens, in their original order, excluding English
           stop words and tokens made up only of punctuation characters.
       """
       # Build the NLTK helpers once and reuse them on later calls: this
       # function is applied row by row, and re-reading the stop-word corpus
       # for every row is needless work.
       resources = getattr(stem_and_stopwords, "_resources", None)
       if resources is None:
           resources = (
               PorterStemmer(),
               frozenset(stopwords.words('english')),  # set -> O(1) membership
               TreebankWordTokenizer(),
           )
           stem_and_stopwords._resources = resources
       stemmer, stopwords_english, tokenizer = resources

       punctuation = set(string.punctuation)
       comments_clean = []
       for word in tokenizer.tokenize(comment):
           # Compare case-insensitively so capitalised stop words ("The")
           # are removed as well.
           if word.lower() in stopwords_english:
               continue
           # Drop tokens made up solely of punctuation; a substring test
           # against string.punctuation misses tokens like "''" or "...".
           if all(char in punctuation for char in word):
               continue
           comments_clean.append(stemmer.stem(word))  # stemming word
       return comments_clean

In [None]:

# Read the three competition CSV files from the working directory.
train, test, test_subm = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Fetch the NLTK stop-word corpus used by the stemming / stop-word helpers.
nltk.download('stopwords')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Clean train and test text identically: the tokenizer is fitted on the
# training comments and then applied to the test comments, so both must go
# through the same preprocessing.
train['comment_text'] = train['comment_text'].apply(clean_text)
test['comment_text'] = test['comment_text'].apply(clean_text)

In [None]:
# Join every training comment into one space-separated string.
words = ' '.join(train["comment_text"])

# Rendering options for the cloud.
cloud_options = dict(
    width=1600,
    height=800,
    margin=0,
    max_words=100,        # Maximum number of words to draw
    min_word_length=3,    # Shortest word (in letters) allowed in the cloud
    max_font_size=150,    # Font size range: upper bound
    min_font_size=30,     # Font size range: lower bound
    background_color="white",
)
word_cloud = WordCloud(**cloud_options).generate(words)

plt.figure(figsize=(10, 16))
plt.imshow(word_cloud, interpolation="gaussian")
plt.axis("off")
plt.title('Comments and their Nature', fontsize=40)
plt.show()



In [None]:
# Defining a function to remove the stopwords

def remove_stopwords(text):
    """Return the items of ``text`` that are not NLTK English stop words.

    Parameters
    ----------
    text : iterable of str
        The tokens to filter. NOTE: a plain string is iterated character by
        character, so tokenise the text before calling this function.

    Returns
    -------
    list
        The items of ``text`` not found in the stop-word list, in order.
    """
    # Load the stop-word list once per call instead of once per token, and
    # keep it in a set for constant-time membership tests.
    stop_words = set(stopwords.words('english'))
    return [word for word in text if word not in stop_words]



In [None]:
# Peek at the first entries of the comment_text column.
train['comment_text'].head()

In [None]:
 # Apply stem_and_stopwords to the train and test comments; the resulting
 # token lists are stored in a new 'StemStop' column on each frame.
 train['StemStop'] = train['comment_text'].apply(stem_and_stopwords)
 test['StemStop'] = test['comment_text'].apply(stem_and_stopwords)

 # A bare expression is only rendered when it is the last line of a cell,
 # so display both previews explicitly.
 display(train.head(), test.head())


In [None]:
# The six label columns used as training targets.
cols = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
# Label matrix: one row per training comment, one column per entry in cols.
targets = train[cols].values

# Text inputs for the tokenizer (these are Series, despite the _df suffix).
train_df, test_df = train['comment_text'], test['comment_text']

In [None]:
# Column-wise sum of each label column (the per-label count, assuming the
# columns hold 0/1 indicators).
val_counts = train[cols].sum()

plt.figure(figsize=(8,5))
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts the
# data vectors as positional arguments and raises a TypeError.
ax = sns.barplot(x=val_counts.index, y=val_counts.values, alpha=0.8)

plt.title("Comments per Classes")
plt.xlabel("Comment label")
plt.ylabel("Count")

# Write each bar's value just above the bar, centred horizontally.
rects = ax.patches
labels = val_counts.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height+5, label, ha="center", va="bottom")


plt.show()

In [None]:
from wordcloud import WordCloud
# Join every training comment into one space-separated string.
words = ' '.join(train['comment_text'])

# Rendering options for the cloud.
cloud_options = dict(
    width=1600,
    height=800,
    margin=0,
    max_words=500,        # Maximum number of words to draw
    min_word_length=3,    # Shortest word (in letters) allowed in the cloud
    max_font_size=150,    # Font size range: upper bound
    min_font_size=30,     # Font size range: lower bound
    background_color="white",
)
word_cloud = WordCloud(**cloud_options).generate(words)

plt.figure(figsize=(10, 16))
plt.imshow(word_cloud, interpolation="gaussian")
plt.axis("off")
plt.title('Comments and their Nature', fontsize = 40)
plt.show()

In [None]:
# Word cloud for the test set

# Join every test comment into one space-separated string.
words = ' '.join(test['comment_text'])

# Rendering options for the cloud.
cloud_options = dict(
    width=1600,
    height=800,
    margin=0,
    max_words=500,        # Maximum number of words to draw
    min_word_length=3,    # Shortest word (in letters) allowed in the cloud
    max_font_size=150,    # Font size range: upper bound
    min_font_size=30,     # Font size range: lower bound
    background_color="white",
)
word_cloud = WordCloud(**cloud_options).generate(words)

plt.figure(figsize=(10, 16))
plt.imshow(word_cloud, interpolation="bilinear")
plt.axis("off")
plt.title('Comments and their Nature', fontsize = 40)
plt.show()

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
# Vocabulary cap handed to the Keras tokenizer through num_words.
max_features = 22000

# The vocabulary is fitted on the training comments only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_df.tolist())

# Convert both corpora to integer sequences with the same fitted tokenizer.
tokenized_train, tokenized_test = (
    tokenizer.texts_to_sequences(corpus) for corpus in (train_df, test_df)
)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import initializers, optimizers, layers
from sklearn.metrics import roc_auc_score


# Fixed sequence length passed to pad_sequences for both splits.
maxlen = 200
X_train, X_test = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (tokenized_train, tokenized_test)
)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Embedding, Input,  Activation
from tensorflow.keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import initializers, optimizers, layers
from sklearn.metrics import roc_auc_score
# Hyper-parameters: embedding width, input sequence length, vocabulary size.
embed_size, maxlen, max_features = 128, 200, 22000

# Input of maxlen token ids, mapped through an embedding layer.
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size)(inp)

# Remaining layers, applied in this order on top of the embedding output.
# The last layer has 6 sigmoid units.
hidden_stack = (
    LSTM(60, return_sequences=True, name='lstm_layer'),
    GlobalMaxPool1D(),
    Dropout(0.2),
    Dense(50, activation="relu"),
    Dropout(0.2),
    Dense(6, activation="sigmoid"),
)
for layer in hidden_stack:
    x = layer(x)


In [None]:
# Wire the input tensor and the final output tensor into a model.
model = Model(inputs=inp, outputs=x)

# Training configuration, gathered in one place.
compile_settings = {
    'loss': 'binary_crossentropy',
    'optimizer': 'adam',
    'metrics': ['accuracy'],
}
model.compile(**compile_settings)

# Print the layer-by-layer overview.
model.summary()

In [None]:
# Training schedule: mini-batch size and number of passes over the data.
batch_size, epochs = 64, 5

# validation_split=0.1 asks Keras to hold out a tenth of the data for validation.
model.fit(
    X_train,
    targets,
    validation_split=0.1,
    epochs=epochs,
    batch_size=batch_size,
)

In [None]:
# Run the trained model on the padded test sequences; the trailing bare name
# makes the notebook render the result.
prediction = model.predict(X_test)
prediction