## **Installing and Importing Packages**

In [None]:
!pip install stop-words
!pip install nltk
!pip install scikit-multilearn
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stop-words
  Downloading stop-words-2018.7.23.tar.gz (31 kB)
Building wheels for collected packages: stop-words
  Building wheel for stop-words (setup.py) ... [?25l[?25hdone
  Created wheel for stop-words: filename=stop_words-2018.7.23-py3-none-any.whl size=32910 sha256=0de4c7646e4d7ae4234e614dd342de70a3438f0058fdea509101a94d5c1fd30d
  Stored in directory: /root/.cache/pip/wheels/fb/86/b2/277b10b1ce9f73ce15059bf6975d4547cc4ec3feeb651978e9
Successfully built stop-words
Installing collected packages: stop-words
Successfully installed stop-words-2018.7.23
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
[K     |██████████████████████████████

In [None]:
import pandas as pd;
import numpy as np;
import matplotlib.pyplot as plt;
import seaborn as sns;
import nltk
import copy
import re
import contractions
import tensorflow
from tensorflow import keras
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.multiclass import OneVsRestClassifier
# from sklearn.model_selection import cv
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline,make_union
from sklearn.preprocessing import MultiLabelBinarizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk import word_tokenize, pos_tag;
from nltk.corpus import stopwords, wordnet;
from nltk.tokenize import WhitespaceTokenizer
nltk.download('omw-1.4')
#from stop_words import get_stop_words
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
 # Mount Google Drive at /content/drive/ (Colab only); the data files read
 # below live under /content/drive/MyDrive/ML_Project.
 from google.colab import drive
 drive.mount('/content/drive/')

## **Initial data analysis**

In [None]:
# Load the raw train and test CSVs from the mounted Drive project folder.
train = pd.read_csv('/content/drive/MyDrive/ML_Project/train.csv')
test = pd.read_csv('/content/drive/MyDrive/ML_Project/test.csv')
#train.head()
# Preview the first rows of the test frame.
test.head()

In [None]:
# Every column after the first two is treated as a label column.
targets = list(train.columns[2:])
df_targets = train[targets].copy()

# Boolean mask: True for rows that carry at least one label.
toxic_rows = df_targets.sum(axis=1) > 0

# Track that mask as an extra pseudo-label so it is tallied like the others.
targets.append('any_label')
df_targets['any_label'] = toxic_rows

# count_dic[label] = [('total', rows with label), (other_label, rows with both), ...]
count_dic = {}
for comment_type in targets:
    df_selection = df_targets[df_targets[comment_type] == 1]
    counts = [('total', len(df_selection))]
    counts.extend(
        (other, df_selection[other].sum())
        for other in targets
        if other != comment_type
    )
    count_dic[comment_type] = counts


# Drop the last per-label selection; it is only a loop temporary.
del df_selection

def heatmap(df, title):
    """Show an annotated heatmap of the pairwise column correlations of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``df.corr()`` matrix is plotted.
    title : str
        Title drawn above the plot.

    The figure named 'heatmap' is (re)used, displayed with ``plt.show()``,
    and nothing is returned.
    """
    plt.figure('heatmap', figsize=[10,10])
    plt.title(title)
    correlations = df.corr()
    # (Masking to the upper triangle with np.triu was tried and left disabled.)
    sns.heatmap(
        correlations,
        vmax=0.6,
        square=True,
        annot=True,
        cmap='YlOrRd',
    )
    # Tilt both sets of tick labels so long column names stay readable.
    plt.xticks(rotation = 45)
    plt.yticks(rotation = 45)
    plt.show()

heatmap(df_targets, 'Comment Type Heatmap')


print('Training Data Comment Breakdown')
print('=====\n')

# Headline figure: how many rows carry at least one label.
print('%d out of %d comments, or %.2f%%, are classified as toxic.' % 
     (np.sum(toxic_rows), len(train), (np.sum(toxic_rows)/len(train))*100))

# Per-label breakdown. Each count_dic value is
# [('total', n), (other_label, overlap), ...].
totals = []
for key, value in count_dic.items():
    label_total = value[0][1]
    totals.append(label_total)
    print('\n%d %s comments. (%.2f%% of all data.)' % (label_total, key, (label_total/len(train))*100))
    for other_label, overlap in value[1:]:
        print('- %d or %.2f%% were also %s.' % (overlap, (overlap/label_total)*100, other_label))
    

# Bar chart of the per-label totals collected above.
plt.figure('Comment Type Counts', figsize=[8,6])
plt.title('Comment Type Counts')
sns.barplot(x=list(count_dic), y=totals)
plt.show()

In [None]:
def feature_engineering(dataframe, sparse=0): 
    """Add three min-max scaled text-shape features to ``dataframe``.

    The columns ``length``, ``caps`` and ``word_length`` are derived from the
    ``text`` column, written onto ``dataframe`` itself (in place), and each is
    rescaled to the range [0, 1] using that frame's own minimum and maximum.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a string column named ``text``. It is modified in place.
    sparse : int, optional
        Unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, with the three feature columns added.

    Notes
    -----
    * A comment with no purely alphabetic token has no defined mean word
      length; its scaled ``word_length`` is set to 0.0 instead of NaN.
    * A feature that is constant over the whole frame (max == min) is scaled
      to 0.0 instead of dividing by zero.
    * Scaling uses the statistics of the frame passed in, so two frames
      processed separately are not on a shared scale.
    """

    def pct_caps(s):
        # Upper-case characters relative to alphabetic ones. The +1 in the
        # denominator keeps letter-free strings from dividing by zero.
        n_upper = sum(1 for c in s if c.isupper())
        n_alpha = sum(1 for c in s if c.isalpha())
        return n_upper / (n_alpha + 1)

    def word_length(s):
        # Only space-separated tokens made purely of letters count as words.
        lengths = [len(w) for w in s.split(' ') if w.isalpha()]
        # np.mean([]) would emit a RuntimeWarning; return NaN explicitly and
        # let the fill step after scaling turn it into 0.0.
        return float(np.mean(lengths)) if lengths else np.nan

    # Comment length in characters.
    dataframe['length'] = dataframe['text'].apply(len)
    # Capitalization ratio.
    dataframe['caps'] = dataframe['text'].apply(pct_caps)
    # Mean word length.
    dataframe['word_length'] = dataframe['text'].apply(word_length)

    # # Average number of exclamation points 
    # dataframe['exclamation'] = dataframe.text.apply(lambda s: len([c for c in s if c == '!']))

    # # Average number of question marks 
    # dataframe['question'] = dataframe.text.apply(lambda s: len([c for c in s if c == '?']))

    # Min-max normalize each feature (min/max skip NaN values).
    for label in ['length', 'caps', 'word_length']:
        column = dataframe[label].astype(float)
        minimum = column.min()
        diff = column.max() - minimum
        if diff > 0:
            scaled = (column - minimum) / diff
        else:
            # Constant, empty or all-NaN column: there is no range to scale by.
            scaled = column * 0.0
        dataframe[label] = scaled.fillna(0.0)
    return dataframe

In [None]:
output = train.iloc[:, 2:8]   #Splitting the data into input and output data. 
# The input frames get new columns added in place by feature_engineering(), so
# take explicit copies: writing to a bare slice of `train` / `test` raises
# SettingWithCopyWarning and leaves it unclear which object is modified.
# NOTE: `input` shadows the Python builtin; the name is kept because later
# cells refer to it.
input = train.iloc[:, 0:2].copy()
testInput = test.iloc[:, 0:2].copy()

In [None]:
# Adds the length / caps / word_length columns to `input` (in place) and
# rebinds `train` to that same frame.
# NOTE(review): this replaces the raw frame loaded from train.csv, so cells that
# read label columns from `train` cannot be re-run after this one without reloading.
train = feature_engineering(input)

In [None]:
# Same feature engineering for the test inputs; `test` is rebound to the
# augmented `testInput` frame.
test = feature_engineering(testInput)

In [None]:
# Preview the first rows of `test`.
test.head()

## **Preprocessing**

In [None]:
# def summarizer(text): #Takes text as a comment and finds the summary of the text and returns the summary as tokens.
#   def tokenize(text):
#     return word_tokenize(text)   #To tokenize the words.
#   def removeURL(text):
#     return re.sub(r"http\S+", "", text)
#   def toLower(text):            #To convert all words into lowercase
#     return [text.lower() for text in text]
#   def removePunc(text):          #To remove punctuation marks from the text.
#     w = []
#     for word in text:
#       if(word.isalnum()):
#         w.append(word)
#     return w
#   def modify(text):  #Remove words of length< 2
#     w = []
#     for word in text:
#       if(len(word) > 2):
#         w.append(word)
#     return w
#   def stopwordRem(text):         #To remove stopwords from the text.
#     stopword = stopwords.words('english')
#     text = list(filter(lambda word: word not in stopword, text))
#     return text
#   def remove_numbers(text):
#       pattern = r'[^a-zA-z.,!?/:;\"\'\s]' 
#       return re.sub(pattern, '', text)
#   def remove_spaces&singlechar(text):
#       sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
#       sentence = re.sub(r'\s+', ' ', sentence)
#       return sentence
#   def lemmatize(text):          #To lemmatize the text using the parts of speech tag.
#     def PartOfSpeechTag(text):
#       tag = pos_tag([text])
#       pos = tag[0][1]
#       tag_to_map = {"JJ": wordnet.ADJ,
#                   "NN": wordnet.NOUN,
#                   "VBG": wordnet.VERB,
#                   "RB": wordnet.ADV}
#       return tag_to_map.get(pos, wordnet.NOUN)
#     lemma = WordNetLemmatizer()
#     lemmatized_text = ""
#     for w in text:
#       pos = PartOfSpeechTag(w)
#       lemmatized_text += lemma.lemmatize(w, pos) + " "
#     return lemmatized_text
    
#   text = removeURL(text)
#   # text = remove_numbers(text)
#   text = tokenize(text)
#   # text = modify(text)
#   text = toLower(text)
#   text = removePunc(text)
#   # text = stopwordRem(text)
#   # text = lemmatize(text)
#   return text

In [None]:
# preprocessed_comments = []  #Calling the summarizer function and passing the text into the summarizer Written by: Tejas Sharma
# for i in range(len(X_train)):
#   if(i%1000 == 1):
#     print(i)
#   text = X_train._get_value(i, 'text', takeable=False)
#   preprocessed_comments.append(summarizer(text))
# preprocessed_comments

In [None]:
# preprocessed_test_comments = []
# for i in range(len(X_test)):
#   if((i%1000) == 1):
#     print(i)
#   text = X_test._get_value(i+71451, 'text', takeable=False)
#   preprocessed_test_comments.append(summarizer(text))
# preprocessed_test_comments

In [None]:
# preprocessed_comments = []  #Calling the summarizer function and passing the text into the summarizer for training comments
# input['clean_text'] = input.text.apply(lambda text: summarizer(text))
# input.head()

In [None]:
# testInput['clean_text'] = testInput.text.apply(lambda x: summarizer(x))
# testInput.head()

In [None]:
# input.to_csv("preprocessed_Data.csv")
# testInput.to_csv("preprocessed_VData.csv")
# !cp preprocessed_Data.csv /content/drive/MyDrive/ML_Project
# !cp preprocessed_VData.csv /content/drive/MyDrive/ML_Project

In [None]:
# preprocessed_Data = pd.DataFrame(preprocessed_comments)
# preprocessed_Data.to_csv("preprocessed_Data.csv")

In [None]:
# Load the cached, already-preprocessed test comments from Drive.
inputTest = pd.read_csv("/content/drive/MyDrive/ML_Project/preprocessed_VData.csv")
# Assign the filled column back. fillna(inplace=True) on a column selection acts
# on a temporary object and is deprecated / ineffective under pandas copy-on-write.
inputTest["clean_text"] = inputTest["clean_text"].fillna("no text")
# One list entry per row, in row order.
test_Comments = inputTest["clean_text"].tolist()

In [None]:
# Load the cached, already-preprocessed training comments from Drive.
# NOTE: `input` shadows the Python builtin; the name is kept because later
# cells refer to it.
input = pd.read_csv("/content/drive/MyDrive/ML_Project/preprocessed_Data.csv")
# Assign the filled column back. fillna(inplace=True) on a column selection acts
# on a temporary object and is deprecated / ineffective under pandas copy-on-write.
input["clean_text"] = input["clean_text"].fillna("no text")
# One list entry per row, in row order.
Comments = input["clean_text"].tolist()
# Sanity check: number of missing clean_text values left (expected 0).
input["clean_text"].isna().sum()

In [None]:
#Performing the train-test split. For the sake of testing the model, we used 80-20 split.
# Rows before SPLIT_INDEX train the model and rows from SPLIT_INDEX onward are
# held out. Both slices use the same boundary so that no row is dropped
# (slicing [:71450] and [71451:] left row 71450 in neither set).
SPLIT_INDEX = 71450
X_train = input.iloc[:SPLIT_INDEX]
X_test = input.iloc[SPLIT_INDEX:]
Y_train = output.iloc[:SPLIT_INDEX]
Y_test = output.iloc[SPLIT_INDEX:]
# X_test = inputTest.iloc[:] #For running it on the actual test dataset.

In [None]:
# X_train = X_train.iloc[:,7-8].fillna(' ')
# X_test = X_test.iloc[:7-8].fillna(' ')

In [None]:
# Display the last column of X_test, the column later passed to the TF-IDF vectorizer.
X_test.iloc[:,-1]

## **Training the model**

In [None]:
# import gensim
# print("gensim version:", gensim.__version__)

# word2vec_path = "/content/drive/MyDrive/ML_Project/GoogleNews-vectors-negative300.bin"

# # we only load 200k most common words from Google News corpus 
# word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True, limit=200000) 

In [None]:
# def get_average_vec(tokens_list, vector, generate_missing=False, k=300):
#     """
#         Calculate average embedding value of sentence from each word vector
#     """
#     # for word in tokens_list:
#     #   print(word)
#     if len(tokens_list)<1:
#         return np.zeros(k)
    
#     if generate_missing:
#         vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
#     else:
#         vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    
#     length = len(vectorized)
#     summed = np.sum(vectorized, axis=0)
#     averaged = np.divide(summed, length)
#     return averaged

# def get_embeddings(vectors, text, generate_missing=False, k=300):
#     """
#         create the sentence embedding
#     """
#     print(len(text))
#     emdeddings = []
#     # print(text[4])
#     for i in range(len(text)):
#       # text[i] = word_tokenize(text[i])
#       emdeddings.append(get_average_vec(text[i], vectors, generate_missing=generate_missing, k=k))
#     return emdeddings

In [None]:
# embeddings_word2vec = get_embeddings(word2vec_model, preprocessed_test_comments, k=300)
# embeddings_word2vec_train = get_embeddings(word2vec_model, preprocessed_comments, k=300)
# print("The sentence: \"%s\" got embedding values: " % preprocessed_test_comments[0])
# print(embeddings_word2vec[0])

In [None]:
# Names of the engineered feature columns (positions 3-5 of the frames).
new_features = list(X_train.columns[3:6])
new_test_features = list(X_test.columns[3:6])
# Keep the column-name list in `new_features`: merge_features() takes column
# names as its `engineered_features` argument, so rebinding this name to the
# sparse matrix would break those calls. The matrix gets its own name.
new_features_matrix = csr_matrix(X_train[new_features].values.astype(float))
# new_tf = csr_matrix[X_test[new_test_features].values]
print(new_features_matrix)

In [None]:
# Word-level TF-IDF over uni- and bi-grams: turns each text into a sparse row of
# term weights built from the vocabulary of the fitted dataset.
wordVectorizer = TfidfVectorizer(sublinear_tf = True,strip_accents = 'unicode', min_df = 2, max_df = 0.5,max_features = 20000, lowercase = True, analyzer = "word", ngram_range=(1,2), 
                                 stop_words = "english", dtype=np.float32)
# Character-level TF-IDF over 3- to 5-character n-grams. `stop_words` is not
# passed here: it only applies when analyzer == "word" and otherwise just
# produces an "unused parameter" warning.
charVectorizer = TfidfVectorizer(sublinear_tf = True,strip_accents = 'unicode', min_df = 2, max_df = 0.5, max_features = 10000, lowercase = True, analyzer = "char", ngram_range=(3,5), 
                                 dtype=np.float32)
# Concatenate both representations column-wise.
Vectorizer = make_union(wordVectorizer, charVectorizer, n_jobs=-1)
# Fit the vocabulary / IDF weights on the training text only, then reuse them for the held-out text.
X_train_TFrep = Vectorizer.fit_transform(X_train.iloc[:,-1])
X_test_TFrep= Vectorizer.transform(X_test.iloc[:, -1])
X_test_TFrep.shape

In [None]:
# Display the repr of the transformed held-out matrix.
X_test_TFrep

In [None]:
# Vectorizer = TfidfVectorizer(max_features = 10000, lowercase = True, analyzer = "char", ngram_range=(3,7), stop_words = "english", dtype=np.float32) #Takes words can returns a matrix representing the text in terms of the occurences and frequencies of the vocabulary of the whole dataset.
# X_train_2 = Vectorizer.fit_transform(X_train.iloc[:,7-8]) #first fitting our estimator to the data to get a matrix representation->transforming count-matrix to tf-idf rep.
# X_test_2 = Vectorizer.transform(X_test.iloc[:, 7-8])
# X_test_2.shape

In [None]:
# X_train_TFrep = hstack([X_train_word, X_train_2])
# X_test_TFrep = hstack([X_test_word, X_test_2])

In [None]:
# X_train_word[4]

In [None]:
def merge_features(text, data, engineered_features,frame, sparse = 0):
    """Append engineered feature columns to a text feature matrix.

    Parameters
    ----------
    text : sparse matrix
        Text representation with one row per sample.
    data : unused
        Not read by this function.
    engineered_features : list of str
        Names of the columns of ``frame`` to append.
    frame : pandas.DataFrame
        Source of the engineered feature columns; same row order as ``text``.
    sparse : unused
        Not read by this function.

    Returns
    -------
    sparse matrix
        ``text`` with the selected ``frame`` columns stacked on its right.
    """
    extra = csr_matrix(frame[engineered_features].values)
    stored_values = extra.data
    # Only when a NaN is actually present are the stored values cleaned up.
    if np.isnan(stored_values).any():
        extra.data = np.nan_to_num(stored_values)
    return hstack([text, extra])

In [None]:
# merge_features(X_train_TFrep, X_train, new_features, X_train)

In [None]:
# Select the column at position 1 of X_test as a one-column frame and display it.
df = X_test.iloc[:, 1:2]
df

## **Testing and evaluating the model**

In [None]:
# One-vs-rest wrapper around logistic regression. Y_train has one column per
# label, so one binary classifier is fitted per label column (e.g. harsh vs.
# not harsh), and predict_proba returns one probability column per label.

classifier = OneVsRestClassifier(LogisticRegression(solver = 'saga'))

classifier.fit(X_train_TFrep, Y_train)
probability = classifier.predict_proba(X_test_TFrep)
prediction = classifier.predict(X_test_TFrep)

# Alternative run with the engineered features appended (disabled):
# classifier.fit(merge_features(X_train_TFrep, X_train, new_features, X_train), Y_train)
# probability = classifier.predict_proba(merge_features(X_test_TFrep, X_test, new_features, X_test))
# prediction = classifier.predict(merge_features(X_test_TFrep, X_test, new_features, X_test))


In [None]:
# Pair the column at position 1 of X_test (presumably the comment id; confirm
# against the CSV) with the predicted per-label probabilities and save them.
df = X_test.iloc[:, 1:2]
# Build the probability frame on X_test's own index: pd.concat(axis=1) aligns
# rows by index label, and X_test keeps its original row labels while a bare
# DataFrame(probability) would start at 0, so the rows would not line up.
df2 = pd.DataFrame(probability, index=df.index)
df = pd.concat([df, df2], axis=1, ignore_index=True)
df.to_csv("2tfidf.csv")
df.head()

In [None]:
# ROC AUC ranks samples by score, so it must be given the predicted
# probabilities; thresholded 0/1 predictions collapse the curve to a single
# point. The printed label also names the metric actually computed.
print('Test ROC AUC is {}'.format(roc_auc_score(Y_test, probability)))

In [None]:
!cp 2tfidf5.csv /content/drive/MyDrive/ML_Project

In [None]:
# Random-forest baseline on the same TF-IDF features, fitted on all label
# columns of Y_train at once.
from sklearn.ensemble import RandomForestClassifier

# 50 trees, entropy split criterion, fixed seed for repeatable results.
model = RandomForestClassifier(n_estimators=50, criterion= "entropy", random_state = 0)

model.fit(X_train_TFrep, Y_train)
# model.fit(merge_features(X_train_TFrep, X_train, new_features, X_train), Y_train)

In [None]:
prediction = model.predict(X_test_TFrep)
# prediction = model.predict(merge_features(X_test_TFrep, X_test, new_features, X_test))
# ROC AUC needs scores, not thresholded labels. For multi-output targets
# predict_proba returns one (n_samples, n_classes) array per label column;
# column 1 of each is P(label == 1), assuming both classes 0 and 1 occurred in
# that label's training data.
rf_probability = np.column_stack(
    [label_proba[:, 1] for label_proba in model.predict_proba(X_test_TFrep)]
)
print("ROC AUC score is: " + str(roc_auc_score(Y_test, rf_probability)))

In [None]:
# Gradient-boosted trees (XGBoost, default settings) wrapped one-vs-rest, so
# one booster is fitted per label column of Y_train.
from xgboost import XGBClassifier
# from sklearn.preprocessing.MultiLabelBinarizer import MultiLabelBinarizer
clf = OneVsRestClassifier(XGBClassifier())

clf.fit(X_train_TFrep, Y_train)
# clf.fit(merge_features(X_train_TFrep, X_train, new_features, X_train), Y_train)

In [None]:
prediction = clf.predict(X_test_TFrep)
# prediction = clf.predict(merge_features(X_test_TFrep, X_test, new_features, X_test))
# ROC AUC is computed from the per-label probabilities; thresholded 0/1
# predictions would reduce each curve to a single point.
print("ROC AUC score is: " + str(roc_auc_score(Y_test, clf.predict_proba(X_test_TFrep))))

In [None]:
# #OVR is a multiclass classifier. It creates a classifier for each column like for eg: [harsh, rest] and LR is used for each classifier to find the predicted probabilities.
# classifier = OneVsRestClassifier(LogisticRegression(solver = 'saga'))
# categories = ["harsh", "extremely_harsh", "vulgar", "disrespect", "threatening", "targeted_hate"]
# classifier.fit(embeddings_word2vec_train, Y_train)
# probability = classifier.predict_proba(embeddings_word2vec)
# probability = np.round_(probability, decimals = 2)
# prediction = classifier.predict(embeddings_word2vec)
# print('Test accuracy is {}'.format(roc_auc_score(Y_test, prediction)))
# #After fitting and predicting the data, we dump the outputs into a .csv
# # print(probability)
# prediction.shape
# df = pd.DataFrame()
# df = X_test.iloc[:, 0:1]
# df2 = pd.DataFrame(probability)
# df = pd.concat([df, df2], axis=1, ignore_index=True)
# # df.info()
# df.to_csv("data1.csv")
# df.head()

The same one-vs-rest (OVR) classifier, but with multinomial naive Bayes (MNB) as the base estimator, used to predict the per-label probabilities.

In [None]:
# classifier2 = OneVsRestClassifier(MultinomialNB(fit_prior = True, class_prior = None))
# categories = ["harsh", "extremely_harsh", "vulgar", "disrespect", "threatening", "targeted_hate"]

# classifier2.fit(merge_features(X_train_TFrep, X_train, new_features, X_train), Y_train)
# # compute the testing accuracy
# prediction = classifier2.predict(merge_features(X_test_TFrep, X_test, new_features, X_test))
# print('Test accuracy is {}'.format(roc_auc_score(Y_test, prediction)))
# print(prediction)

An ensemble of the two models implemented above.

The ensemble combines the predictions of multiple base estimators (here LR and MNB) with the aim of improving overall accuracy and giving a better result than either model alone.

In [None]:
# Classifier chain: one logistic regression per label, fitted in a random
# (seeded) label order.
# NOTE: this cell overwrites the `probability` and `prediction` variables set
# by the one-vs-rest logistic-regression cell.
from sklearn.multioutput import ClassifierChain

base_lr = LogisticRegression(solver = "saga")
chain = ClassifierChain(base_lr, order='random', random_state=0)
chain.fit(X_train_TFrep, Y_train)
probability = chain.predict_proba(X_test_TFrep)
prediction = chain.predict(X_test_TFrep)

# Alternative run with the engineered features appended (disabled):
# chain.fit(merge_features(X_train_TFrep, X_train, new_features, X_train), Y_train)
# prediction = chain.predict(merge_features(X_test_TFrep, X_test, new_features, X_test))
# probability = np.round_(probability, decimals = 2)

# print('Test accuracy is {}'.format(roc_auc_score(Y_test, prediction)))

In [None]:
# Soft-voting ensemble of logistic regression (LR) and multinomial naive Bayes
# (MNB), fitted separately for each label column.
# Both members must be plain binary classifiers here: the ensemble is fitted on
# a single 1-D label column, and a multi-label estimator such as ClassifierChain
# cannot be fitted on that (it needs a 2-D label matrix).
ensemble = VotingClassifier(
    estimators=[('lr', LogisticRegression(solver='saga')), ('mnb', MultinomialNB())],
    voting='soft',
)
categories = ["harsh", "extremely_harsh", "vulgar", "disrespect", "threatening", "targeted_hate"]
for category in categories:
  ensemble.fit(X_train_TFrep, Y_train[category])
  # Column 1 of predict_proba is P(label == 1); ROC AUC is scored on it rather
  # than on the thresholded predictions.
  a = ensemble.predict_proba(X_test_TFrep)
  prediction = ensemble.predict(X_test_TFrep)
  print("for category: " + category + " " + str(roc_auc_score(Y_test[category], a[:, 1])))
# print("acc score: " + str(roc_auc_score(Y_test, prediction)))
#create our voting classifier, inputting our models
# ensemble = VotingClassifier(estimators, voting='hard')

In [None]:
# from sklearn.multioutput import ClassifierChain
# base_lr = LogisticRegression(solver = "saga")
# chain = ClassifierChain(base_lr, order='random', random_state=0)
# # chain.fit(embeddings_word2vec_train, Y_train)
# chain.fit(merge_features(embeddings_word2vec_train, X_train, new_features, X_train), Y_train)
# prediction = chain.predict(merge_features(embeddings_word2vec, X_test, new_features, X_test))
# # probability = chain.predict_proba(embeddings_word2vec)
# # probability = np.round_(probability, decimals = 2)
# # prediction = chain.predict(embeddings_word2vec)
# print('Test accuracy is {}'.format(roc_auc_score(Y_test, prediction)))