In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Print the entries of the "../input" directory so the available datasets are visible.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Importing Libraries
import nltk
import unicodedata
import inflect
import re
from nltk.corpus import stopwords
import string
import gc


In [None]:
# Load the annotated comments; the file is tab-separated.
df = pd.read_csv("../input/exmachina/toxicity_annotated_comments.tsv", sep="\t")

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Preprocessing:

# Removing all the non-ASCII characters
def remove_non_ascii(words):
    """Return a new list in which every token has its non-ASCII characters removed.

    Tokens are NFKD-normalized before being encoded to ASCII with errors
    ignored, so characters that decompose into an ASCII base plus combining
    marks keep their base letter; anything else that is not ASCII is dropped.
    The result has the same length as ``words`` (tokens may become empty).
    """
    def _ascii_only(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]

# Replacing all with lower-cases
def to_lowercase(words):
    """Return a new list with every token converted to lower case."""
    return [token.lower() for token in words]

# Removing all the punctuations 
def remove_punctuation(words):
    """Strip punctuation from each token and drop tokens that end up empty.

    Every character that is neither a word character (``\\w``, which includes
    the underscore) nor whitespace is deleted. Tokens consisting solely of
    such characters disappear from the returned list.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

# Replacing all the numbers with words
def replace_numbers(words):
    """Return a new list where all-digit tokens are spelled out in words.

    A token is converted with ``inflect``'s ``number_to_words`` when
    ``str.isdigit()`` is true for it; every other token is passed through
    unchanged.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

# Removing all the stopwords
def remove_stopwords(words):
    """Return a new list without the tokens found in NLTK's English stop-word list.

    Matching is exact (case-sensitive), so tokens are expected to be
    lower-cased already.

    The stop-word list is fetched once per call and turned into a set.
    Previously it was re-read and scanned linearly for every single token.
    """
    english_stopwords = set(stopwords.words('english'))
    return [token for token in words if token not in english_stopwords]

# Stemming
def stem_words(words):
    """Return a new list holding the Lancaster stem of every token.

    Not part of ``normalize``; call it explicitly when stemming is wanted.
    """
    # The notebook's import cell never brings LancasterStemmer into scope, so
    # the original body raised NameError on first use. Import it locally.
    from nltk.stem import LancasterStemmer

    stemmer = LancasterStemmer()
    return [stemmer.stem(token) for token in words]

# Lemmatizing verbs
def lemmatize_verbs(words):
    """Return a new list with every token lemmatized as a verb (``pos='v'``).

    Not part of ``normalize``; call it explicitly when lemmatization is wanted.
    """
    # The notebook's import cell never brings WordNetLemmatizer into scope, so
    # the original body raised NameError on first use. Import it locally.
    from nltk.stem import WordNetLemmatizer

    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, pos='v') for token in words]

# Grouping all of them into one function
def normalize(words):
    """Run a token list through the cleaning steps and return the result.

    Steps, in order: non-ASCII removal, lower-casing, punctuation removal,
    number-to-words replacement, stop-word removal. Stemming and
    lemmatization are not applied here.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        replace_numbers,
        remove_stopwords,
    )
    for step in pipeline:
        words = step(words)
    return words

# Patterns for runs of repeated punctuation.
# Each one matches one or more consecutive occurrences ({1,}) of a single
# character: '.', ',', '?' and '!' respectively. Later cells use them with
# .sub(' ', text) to replace every such run with a single space.
# The (misspelled) names are referenced by those cells, so they are kept as-is.

consequitivedots = re.compile(r'\.{1,}')
consequitivecommas = re.compile(r'\,{1,}')
consequitivequestionmarks = re.compile(r'\?{1,}')
consequitiveexclaimations = re.compile(r'\!{1,}')


In [None]:
# Tokenization
# Turn every comment into a normalized list of tokens.
# A single column-level assignment replaces the row-by-row
# `df["comment"][i] = ...` loop: chained-indexing assignment triggers
# SettingWithCopyWarning, is silently ignored under pandas copy-on-write, and
# only works when the index labels are exactly 0..n-1. The per-row
# `print(i)` is dropped because it floods the cell output.
df["comment"] = df["comment"].apply(
    lambda text: normalize(nltk.word_tokenize(text))
)


In [None]:
# attaching
# Join each token list back into one space-separated string.
# Assigning the whole column at once avoids chained-indexing assignment
# (`df["comment"][i] = ...`), which pandas may apply to a temporary copy.
df["comment"] = df["comment"].apply(lambda tokens: ' '.join(map(str, tokens)))

In [None]:
# Replace every run of dots with a single space.
# Column-level assignment instead of chained `df["comment"][j] = ...`, which
# pandas may apply to a temporary copy and which assumes labels 0..n-1.
df["comment"] = df["comment"].apply(lambda text: consequitivedots.sub(' ', text))


In [None]:
# Replace every run of commas with a single space.
# Column-level assignment instead of chained `df["comment"][j] = ...`, which
# pandas may apply to a temporary copy and which assumes labels 0..n-1.
df["comment"] = df["comment"].apply(lambda text: consequitivecommas.sub(' ', text))


In [None]:
# Replace every run of question marks with a single space.
# Column-level assignment instead of chained `df["comment"][j] = ...`, which
# pandas may apply to a temporary copy and which assumes labels 0..n-1.
df["comment"] = df["comment"].apply(lambda text: consequitivequestionmarks.sub(' ', text))
    

In [None]:
# Replace every run of exclamation marks with a single space.
# Column-level assignment instead of chained `df["comment"][j] = ...`, which
# pandas may apply to a temporary copy and which assumes labels 0..n-1.
df["comment"] = df["comment"].apply(lambda text: consequitiveexclaimations.sub(' ', text))

In [None]:
# label encoding

from sklearn.preprocessing import LabelEncoder

# Each column is replaced by integer codes from its own freshly fitted encoder.
for column in ('logged_in', 'ns', 'sample'):
    df[column] = LabelEncoder().fit_transform(df[column])





In [None]:

# Keep only the rows whose cleaned comment is not the empty string, then
# renumber the rows. reset_index() without drop=True moves the previous index
# into a new 'index' column.
df = df.loc[df["comment"].ne('')].reset_index()



In [None]:
# dropping index column
# Use the `columns=` keyword: the positional axis argument of
# DataFrame.drop ("drop('index', 1)") was removed in pandas 2.0.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df = df.drop(columns='index', errors='ignore')

# Sanity Check
df.head()



In [None]:
# Write the preprocessed frame to 'preprocessed.csv' without the index column.
preprocc = pd.DataFrame(data=df)
preprocc.to_csv(path_or_buf='preprocessed.csv', index=False)


In [None]:
# for i in range(len(df)):
#     Y = ' '.join(df["Comment"][i])
#     df["Comment"][i] = Y
    
# df

In [None]:
# preprocc = pd.DataFrame(df)
# preprocc.to_csv('train_preprocessed.csv', index=False)

In [None]:
# df=pd.read_csv("../input/coling/agr_en_train.csv")
# #df=pd.read_csv("dev.csv")

# df.columns=['id','text','tag']
# #easier nonascii removal
# df['text'] = df['text'].apply(lambda x: ''.join([" " if ord(i) < 32 or ord(i) > 126 else i for i in x]))
# df['text']=df['text'].map(lambda x: x.lower())
# df['text'] = df['text'].str.replace('[^\w\s]','')

# #this may take time, so I have commented it out
# # df['text']=df['text'].astype(str).apply(lambda row: re.sub(r'(?<!\S)\d+(?!\S)', lambda x: p.number_to_words(x.group()), row))

# from nltk.corpus import stopwords
# stop = stopwords.words('english')
# #to have both versions in dataframe
# df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
# #i dont like stemming that much

# from nltk.stem import SnowballStemmer
# stemmer = SnowballStemmer('english')
# df['text'] = df['text'].apply(lambda x: ' '.join([stemmer.stem(token) for token in x.split()]))                                       

# def consecutive(text):
#     text = re.compile(r'\.{1,}').sub(' ', text)
#     text = re.compile(r'\,{1,}').sub(' ', text)
#     text = re.compile(r'\?{1,}').sub(' ', text)
#     text = re.compile(r'\!{1,}').sub(' ', text)
#     return text

# from nltk.stem import WordNetLemmatizer
# lemma = WordNetLemmatizer()
# df['text'] = df['text'].apply(lambda x: ' '.join([lemma.lemmatize(word) for word in x.split()]))                                       
# df['text'] =df['text'].apply(consecutive)



# df.to_csv('train_preproc.csv')
# #df.to_csv('dev_preproc.csv')

In [None]:
# train_df = pd.read_csv('../input/preprocessed-csv/preprocessed.csv')
# train_df = train_df.dropna()
# train_df = train_df.reset_index()
# from sklearn.preprocessing import LabelEncoder
# train_df['Agg_Level'] = LabelEncoder().fit_transform(train_df['Agg_Level'])

In [None]:
# type(train_df['Comments'])

In [None]:
# from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
# from sklearn.metrics import make_scorer, f1_score, precision_score, accuracy_score, log_loss
# f1_scorer = make_scorer(f1_score, average="macro")
# precision_scorer = make_scorer(precision_score, average="macro")
# accuracy_scorer = make_scorer(accuracy_score, average="macro")
# log_loss_scorer = make_scorer(log_loss)

# from sklearn.metrics import classification_report, confusion_matrix
# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV

# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# X = train_df.iloc[:, 2].values
# y = train_df.iloc[:, 1].values



In [None]:
# X

In [None]:
# random_state_number = 967898
# x_train, x_test, y_train, y_test = train_test_split(X, y,
#                                                    test_size=0.10, random_state=random_state_number)

# print(x_train.shape, y_train.shape)
# print(x_test.shape, y_test.shape)

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer
# count_vect = CountVectorizer()
# X_train_counts = count_vect.fit_transform(X)
# X_train_counts.shape

In [None]:
# from sklearn.feature_extraction.text import TfidfTransformer
# tfidf_transformer = TfidfTransformer()
# X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# X_train_tfidf.shape

In [None]:
# cvec = CountVectorizer(stop_words='english', min_df=1, max_df=.5, ngram_range=(1,2))
# tfidf = TfidfTransformer()
# cvec
# cvec.fit(X)
# cvec.fit(X)
# len(cvec.vocabulary_)

In [None]:
# x_train_counts = cvec.fit_transform(x_train, y_train)
# print(x_train_counts.shape)
# x_train_tf = tfidf.fit_transform(x_train_counts)
# print(x_train_tf.shape)

In [None]:
# x_test_counts = cvec.fit_transform(x_test, y_test)
# print(x_test_counts.shape)
# x_test_tf = tfidf.fit_transform(x_test_counts)
# print(x_test_tf.shape)

In [None]:
# len(x_test)

In [None]:
# gc.collect()

In [None]:
# from sklearn.naive_bayes import MultinomialNB

# predicted_prob = MultinomialNB(alpha=0.001).fit(x_test, y_test)
# print("log_loss\n", log_loss(y_test, predicted_prob, labels=range(1,10)))
        
# y_pred = model.predict(x_test)
# print("f1_score\n", f1_score(y_test, y_pred, average="macro"))
# print("accuracy_score\n", accuracy_score(y_test, y_pred))
# print("\nclassification_report\n",classification_report(y_test, y_pred))
# print("\nconfusion_matrix\n",confusion_matrix(y_test, y_pred))

In [None]:
# type(x_train_tf)

In [None]:
# from sklearn.naive_bayes import MultinomialNB
# nb_model = MultinomialNB(alpha=0.001)

In [None]:
# # Preprocessing:

# # Removing all thee non ascii characters
# def remove_non_ascii(words): 
#     new_words = []
#     for word in words:
#         new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
#         new_words.append(new_word)
#     return new_words

# # Replacing all with lower-cases
# def to_lowercase(words):
#     new_words = []
#     for word in words:
#         new_word = word.lower()
#         new_words.append(new_word)
#     return new_words

# # Removing all the punctuations 
# def remove_punctuation(words):
#     new_words = []
#     for word in words:
#         new_word = re.sub(r'[^\w\s]', '', word)
#         if new_word != '':
#             new_words.append(new_word)
#     return new_words

# # Replacing all the numbers with words
# def replace_numbers(words):
#     p = inflect.engine()
#     new_words = []
#     for word in words:
#         if word.isdigit():
#             new_word = p.number_to_words(word)
#             new_words.append(new_word)
#         else:
#             new_words.append(word)
#     return new_words

# # Removing all the stopwords
# def remove_stopwords(words):
#     new_words = []
#     for word in words:
#         if word not in stopwords.words('english'):
#             new_words.append(word)
#     return new_words

# # Stemming
# def stem_words(words):
#     stemmer = LancasterStemmer()
#     stems = []
#     for word in words:
#         stem = stemmer.stem(word)
#         stems.append(stem)
#     return stems

# # Lemmatizing verbs
# def lemmatize_verbs(words):
#     lemmatizer = WordNetLemmatizer()
#     lemmas = []
#     for word in words:
#         lemma = lemmatizer.lemmatize(word, pos='v')
#         lemmas.append(lemma)
#     return lemmas

# # Grouping all them in one function
# def normalize(words):
#     words = remove_non_ascii(words)
#     words = to_lowercase(words)
#     words = remove_punctuation(words)
#     words = replace_numbers(words)
#     words = remove_stopwords(words)
#     return words

# train = df[:10000]
# dev = df[10000:]
# train_labels = train["Agg_Level"]
# dev_labels = dev["Agg_Level"]
# dev

# train_df = []
# dev_df = []
# train_words = []
# dev_words = []
# for i in range(10000):
#     train_df.append(train["Comment"][i])
# for i in range(10000, 12000):
#     dev_df.append(dev["Comment"][i])

# consequitivedots = re.compile(r'\.{1,}')
# consequitivecommas = re.compile(r'\,{1,}')
# consequitivequestionmarks = re.compile(r'\?{1,}')
# consequitiveexclaimations = re.compile(r'\!{1,}')

# for j in range(10000):
#     train_df[j] = consequitivedots.sub(' ', train_df[j])
#     train_df[j] = consequitivecommas.sub(' ', train_df[j])
#     train_df[j] = consequitivequestionmarks.sub(' ', train_df[j])
#     train_df[j] = consequitiveexclaimations.sub(' ', train_df[j])
# for j in range(2000):
#     dev_df[j] = consequitivedots.sub(' ', dev_df[j])
#     dev_df[j] = consequitivecommas.sub(' ', dev_df[j])
#     dev_df[j] = consequitivequestionmarks.sub(' ', dev_df[j])
#     dev_df[j] = consequitiveexclaimations.sub(' ', dev_df[j])
    
    
# # Tokenization
# for i in range(10000):
#     train_words.append(nltk.word_tokenize(train_df[i]))
#     train_words[i] = normalize(train_words[i])

# for i in range(len(dev_df)):
#     dev_words.append(nltk.word_tokenize(dev_df[i]))
#     dev_words[i] = normalize(dev_words[i])

# preprocc = pd.DataFrame(train_words)
# preprocc.to_csv('train_preprocessed.csv', index=False)

# train_words_str = []
# dev_words_str = []
# for i in range(len(train_words)):
#     train_words_str.append(', '.join(train_words[i]))
# for i in range(2000):
#     dev_words_str.append(', '.join(dev_words[i]))
# train_words_str
# dev_words_str

# preprocc = pd.DataFrame(train_words_str)
# preprocc.to_csv('train_preprocessed.csv', index=False)

# dev_preprocc = pd.DataFrame(dev_words_str)
# dev_preprocc.to_csv('dev_preprocessed.csv', index=False)




In [None]:
# X = all_text_train_df.iloc[:, 0].values
# y = all_text_train_df.iloc[:, 1].values

# # Splitting the dataset into the Training set and Test set
# from sklearn.cross_validation import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
# from sklearn.feature_extraction.text import CountVectorizer
# count_vect = CountVectorizer()
# X_train_counts = count_vect.fit_transform(X_train)
# from sklearn.feature_extraction.text import TfidfTransformer
# tfidf_transformer = TfidfTransformer()
# X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# from sklearn.naive_bayes import MultinomialNB
# clf = MultinomialNB().fit(X_train_tfidf, y_train)
# from sklearn.pipeline import Pipeline

# text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

# text_clf = text_clf.fit(X_train, y_train)
# predicted = text_clf.predict(X_test)
# np.mean(predicted == y_test)

In [None]:
# import numpy as np
# import pandas as pd
# import string
# import re

# df=pd.read_csv("../input/coling/agr_en_train.csv")
# #df=pd.read_csv("dev.csv")

# df.columns=['id','text','tag']
# #easier nonascii removal
# df['text'] = df['text'].apply(lambda x: ''.join([" " if ord(i) < 32 or ord(i) > 126 else i for i in x]))
# df['text']=df['text'].map(lambda x: x.lower())
# df['text'] = df['text'].str.replace('[^\w\s]','')

# #this may take time, so I have commented it out
# # df['text']=df['text'].astype(str).apply(lambda row: re.sub(r'(?<!\S)\d+(?!\S)', lambda x: p.number_to_words(x.group()), row))

# from nltk.corpus import stopwords
# stop = stopwords.words('english')
# #to have both versions in dataframe
# df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
# #i dont like stemming that much

# from nltk.stem import SnowballStemmer
# stemmer = SnowballStemmer('english')
# df['text'] = df['text'].apply(lambda x: ' '.join([stemmer.stem(token) for token in x.split()]))                                       

# def consecutive(text):
#     text = re.compile(r'\.{1,}').sub(' ', text)
#     text = re.compile(r'\,{1,}').sub(' ', text)
#     text = re.compile(r'\?{1,}').sub(' ', text)
#     text = re.compile(r'\!{1,}').sub(' ', text)
#     return text

# from nltk.stem import WordNetLemmatizer
# lemma = WordNetLemmatizer()
# df['text'] = df['text'].apply(lambda x: ' '.join([lemma.lemmatize(word) for word in x.split()]))                                       
# df['text'] =df['text'].apply(consecutive)



# df.to_csv('train_preproc.csv')
# #df.to_csv('dev_preproc.csv')