In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import re 
import scipy
from scipy import sparse
from IPython.display import display
from pprint import pprint
from matplotlib import pyplot as plt 
import time
import scipy.optimize as optimize
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn.linear_model import Ridge
import zipfile
import string
import nltk
import string
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
# English stop words as a set. `stopwords` here is the NLTK corpus reader
# imported above; the same name is rebound to a helper function further
# down the notebook, so this line must run before that definition.
stop_words = {*stopwords.words("english")}
# Single WordNet lemmatizer instance, used by the cleaning cell.
lemmatizer = WordNetLemmatizer()

In [None]:
!pip install seaborn==0.11.0

## Data collection

In [None]:
# Unpack the zipped training CSV into the current working directory.
train_csv_zip_path = '../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'
with zipfile.ZipFile(train_csv_zip_path) as archive:
    archive.extractall(path='./')


In [None]:
# File locations used by the notebook. The three severity-rating files live
# in the same input directory, so they are built from one shared prefix.
severity_input_dir = '../input/jigsaw-toxic-severity-rating'
train_csv_path = './train.csv'
sample_sub_path = severity_input_dir + '/sample_submission.csv'
comments_to_score_path = severity_input_dir + '/comments_to_score.csv'
val_path = severity_input_dir + '/validation_data.csv'

In [None]:
# Read through the path variables defined in the previous cell so each file
# location is written down exactly once.
df_train = pd.read_csv(train_csv_path)
df_sub = pd.read_csv(comments_to_score_path)

In [None]:
df_train.head()

In [None]:
df_sub.head()

In [None]:
# For every toxicity label, print a dotted banner and show two random
# comments that carry that label.
label_names = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
for label_name in label_names:
    banner = label_name.center(40, '.')
    print(banner)
    flagged_rows = df_train.loc[df_train[label_name] == 1, ['comment_text', label_name]]
    display(flagged_rows.sample(2))

## Data preprocessing

The cleaning steps below follow this guide: https://medium.com/analytics-vidhya/text-cleaning-in-natural-language-processing-nlp-bea2c27035a6

Data preprocessing must include the following steps:

Remove HTML tags and non-ASCII characters

Convert Text to Lowercase

Remove punctuation

Remove Stop words

Tokenization

Stemming vs Lemmatization


In [None]:
def clean_text(text):
    """Normalise a raw comment into lowercase, space-separated alphabetic words.

    Steps, in order:
      1. replace HTML-like ``<...>`` tags with a space;
      2. delete ASCII punctuation outright (so "don't" becomes "dont");
      3. replace every remaining character that is not an ASCII letter
         (digits, newlines, non-ASCII symbols) with a space;
      4. lowercase and collapse runs of whitespace.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned text; an empty string when no letters remain.
    """
    # Tags become a space so the words on either side stay separated.
    text = re.sub('<.*?>', ' ', text)
    # Punctuation is removed without inserting a space.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Letters only: digits and newlines are turned into spaces here, so no
    # separate newline replacement is needed afterwards.
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Lowercase, then split/join to squeeze whitespace and trim the ends.
    return ' '.join(text.lower().split())

def stopwords(input_text, stop_words):
    """Remove stop words from ``input_text``.

    The text is tokenised with ``word_tokenize``; tokens contained in
    ``stop_words`` are dropped and the rest are joined with single spaces.

    Note: defining this function rebinds the module-level name ``stopwords``
    that the import cell bound to ``nltk.corpus.stopwords``. Re-running the
    import cell rebinds it back, after which this cell must be run again.

    Parameters
    ----------
    input_text : str
        Text to filter.
    stop_words : collection of str
        Tokens to drop; only membership tests are performed on it.

    Returns
    -------
    str
        The remaining tokens joined by a single space.
    """
    # Filter once; the original built the same list twice and discarded one.
    kept_tokens = [tok for tok in word_tokenize(input_text) if tok not in stop_words]
    return ' '.join(kept_tokens)



In [None]:
unrelevant_words = ['wiki','wikipedia','page']

# Clean steps 1, 2 and 3: normalise the raw comment, then drop the
# domain-specific words. The filter has to run over whitespace-separated
# words; iterating the string itself yields single characters, which never
# match an entry of `unrelevant_words`.
df_train['comment_text'] = df_train['comment_text'].apply(
    lambda x: ' '.join(w for w in clean_text(x).split() if w not in unrelevant_words)
)

# Clean step 4: remove English stop words (the helper already returns a string).
df_train['comment_text'] = df_train['comment_text'].apply(lambda x: stopwords(x, stop_words))

# Clean step 5: lemmatize token by token. The lemmatizer works on a single
# word, so passing it a whole comment leaves multi-word text unchanged.
df_train['comment_text'] = df_train['comment_text'].apply(
    lambda x: ' '.join(lemmatizer.lemmatize(w) for w in x.split())
)

df_train.head()

reference:

https://medium.com/analytics-vidhya/text-cleaning-in-natural-language-processing-nlp-bea2c27035a6

In [None]:
# Same per-label spot check as earlier in the notebook, run again after the
# cleaning cell has rewritten `comment_text`.
for col in ('toxic','severe_toxic','obscene','threat','insult','identity_hate'):
    print(col.center(40, '.'))
    is_flagged = df_train[col].eq(1)
    display(df_train.loc[is_flagged, ['comment_text', col]].sample(2))

In [None]:
# Collapse the six label columns into one binary target: y = 1 when the
# row-wise sum of the labels is positive, otherwise 0.
label_sum = df_train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1)
df_train['y'] = label_sum.gt(0).astype(int)

# Keep only the text and the target, exposing the text column as 'text'.
df_train_binary = df_train.loc[:, ['comment_text', 'y']].rename(columns={'comment_text': 'text'})
df_train_binary.sample(5)

In [None]:
 display(df_train_binary.query('y == 1')[['text', 'y']].sample(5))  # five random rows with y == 1

In [None]:
 display(df_train_binary.query('y == 0')[['text', 'y']].sample(5))  # five random rows with y == 0

In [None]:
# Rows with y == 0 (no toxicity label set); used for the first word cloud.
non_toxic_mask = df_train_binary['y'].eq(0)
df_lt = df_train_binary[non_toxic_mask]
df_lt.head()

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# One long string holding every y == 0 comment.
text_lt = " ".join(df_lt['text'])
# Count whitespace-separated words; len(text_lt) would be the number of
# characters, which is not what the message reports.
print ("There are {} words in the combination of all less_toxic reviews.".format(len(text_lt.split())))

# Generate a word cloud image
wordcloud = WordCloud(stopwords=STOPWORDS,background_color="white").generate(text_lt)

# Display the generated image on an explicit axes object.
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
plt.show()

In [None]:
# Rows with y == 1 (at least one toxicity label set); used for the second word cloud.
toxic_mask = df_train_binary['y'].eq(1)
df_mt = df_train_binary[toxic_mask]
df_mt.head()

In [None]:
# One long string holding every y == 1 comment.
text_mt = " ".join(df_mt['text'])
# Count whitespace-separated words; len(text_mt) would be the number of
# characters, which is not what the message reports.
print ("There are {} words in the combination of all more_toxic reviews.".format(len(text_mt.split())))

# Generate a word cloud image
wordcloud = WordCloud(stopwords=STOPWORDS,background_color="black").generate(text_mt)

# Display the generated image on an explicit axes object.
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
plt.show()

**Unbalanced dataset**

In [None]:
df_train_binary['y'].value_counts(normalize=True)

In [None]:
# Number of rows with y == 1; used as the size of the negative sample below.
toxic_len = df_train_binary['y'].eq(1).sum()
print(toxic_len)

In [None]:
# Undersample the y == 0 rows down to the number of y == 1 rows. The fixed
# random_state makes the drawn subset, and every result derived from it,
# identical across runs.
df_train_balanced = df_train_binary[df_train_binary['y'] == 0].sample(n=toxic_len, random_state=0)
df_train_balanced['y'].value_counts(normalize=True)

**Balanced dataset**

In [None]:
# Stack all y == 1 rows on top of the sampled y == 0 rows.
toxic_rows = df_train_binary[df_train_binary['y'] == 1]
df_train_b = pd.concat([toxic_rows, df_train_balanced])
df_train_b['y'].value_counts()

**TF-IDF**

In [None]:
# vec = TfidfVectorizer()

In [None]:
# X = vec.fit_transform(df_train_b['text'])
# X

In [None]:
# from sklearn.naive_bayes import MultinomialNB
# model = MultinomialNB()
# model.fit(X, df_train_b['y'])

In [None]:
# df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
# df_val.head()

In [None]:
# unrelevant_words = ['wiki','wikipedia','page']
# #Clean step 1, 2 and 3
# df_val['less_toxic'] = df_val['less_toxic'].apply(lambda x: ''.join([w for w in clean_text(x) if w not in unrelevant_words]))

# #Clean Step 4
# df_val['less_toxic'] = df_val['less_toxic'].apply(lambda x: ''.join([w for w in stopwords(x,stop_words)]))

# #Clean Step 5
# df_val['less_toxic'] = df_val['less_toxic'].apply(lambda x: ''.join([w for w in lemmatizer.lemmatize(x)]))

# df_val.head()

In [None]:
# df_val['more_toxic'] = df_val['more_toxic'].apply(lambda x: ''.join([w for w in clean_text(x) if w not in unrelevant_words]))

# #Clean Step 4
# df_val['more_toxic'] = df_val['more_toxic'].apply(lambda x: ''.join([w for w in stopwords(x,stop_words)]))

# #Clean Step 5
# df_val['more_toxic'] = df_val['more_toxic'].apply(lambda x: ''.join([w for w in lemmatizer.lemmatize(x)]))

# df_val.head()

In [None]:
# X_less_toxic = vec.transform(df_val['less_toxic'])
# X_more_toxic = vec.transform(df_val['more_toxic'])

In [None]:
# p1 = model.predict_proba(X_less_toxic)
# p2 = model.predict_proba(X_more_toxic)

In [None]:
# (p1[:, 1] < p2[:, 1]).mean()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF with sublinear term frequency; terms appearing in
# fewer than 5 documents are dropped.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
# Keep the result as a scipy sparse matrix. A dense copy of an n-gram matrix
# costs rows x columns floats of memory for no benefit: the estimators fitted
# on `features` in this notebook accept sparse input, and the test features
# returned by tfidf.transform() are sparse as well.
features = tfidf.fit_transform(df_train_b['text'])
labels = df_train_b['y']
features.shape

In [None]:
# features_less_toxic = (df_val['less_toxic'])
# tfidf.transform(features_less_toxic).toarray()
# X_val_less_toxic =tfidf.fit_transform(features_less_toxic).toarray()
# # features_more_toxic = tfidf.fit_transform(df_val['more_toxic']).toarray()
# X_val_less_toxic.shape

In [None]:
# from io import StringIO
# col = ['y', 'text']
# df = df_train_binary[col]
# df = df[pd.notnull(df['text'])]
# df.columns = ['y', 'text']
# df['category_id'] = df['y'] .factorize()[0]
# category_id_df = df[['y', 'category_id']].drop_duplicates().sort_values('category_id')
# category_to_id = dict(category_id_df.values)
# id_to_category = dict(category_id_df[['category_id', 'y']].values)
# df.head()


In [None]:
# from sklearn.feature_selection import chi2
# import numpy as np
# N = 2
# for y,category_id in sorted(category_to_id.items()):
#   features_chi2 = chi2(features, labels == category_id)
#   indices = np.argsort(features_chi2[0])
#   feature_names = np.array(tfidf.get_feature_names())[indices]
#   unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
#   bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
#   print("# '{}':".format(y))
#   print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
#   print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
import seaborn as sns

# Candidate classifiers. Every estimator that takes a random_state is seeded
# so the comparison is repeatable.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(random_state=0),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]
CV = 5  # number of cross-validation folds

# One (model, fold, accuracy) record per fold; the frame is built once at the end.
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    entries.extend((model_name, fold_idx, accuracy) for fold_idx, accuracy in enumerate(accuracies))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

# Box plot of the fold accuracies with the individual folds overlaid.
fig, ax = plt.subplots()
sns.boxplot(x='model_name', y='accuracy', data=cv_df, ax=ax)
sns.stripplot(x='model_name', y='accuracy', data=cv_df,
              size=8, jitter=True, edgecolor="gray", linewidth=2, ax=ax)
ax.set_title('Cross-validated accuracy by model')
plt.show()

In [None]:
# Mean accuracy over the folds for each model.
cv_df.groupby('model_name')['accuracy'].mean()

In [None]:

# features_less_toxic = tfidf.transform(df_val['less_toxic'])
# features_less_toxic =features_less_toxic.fit_transform(president)


In [None]:

from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

# LinearSVC has no predict_proba; wrapping it in CalibratedClassifierCV
# provides the class probabilities that the scoring cell relies on.
# random_state pins the SVM's internal shuffling so refits are repeatable.
svm = LinearSVC(random_state=0)
clf = CalibratedClassifierCV(svm)
clf.fit(features, labels)




In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

# NOTE: predictions are made on the same rows `clf` was fitted on, so this
# matrix is an in-sample view and will look better than held-out performance.
y_pred = clf.predict(features)
conf_mat = confusion_matrix(labels, y_pred)

# Draw on the explicitly created axes and label the figure so it can be read
# on its own.
fig, ax = plt.subplots(figsize=(10,10))
cmap = "tab20"
sns.heatmap(conf_mat, annot=True, fmt='d', cmap=cmap, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
ax.set_title('Confusion matrix (training data)')
plt.show()

In [None]:
# Comments to be scored for the submission; read via the path variable
# defined near the top of the notebook rather than a repeated literal.
df_test = pd.read_csv(comments_to_score_path)
df_test.head()

In [None]:
df_test.info()

In [None]:
# Raw text column of the comments to score (X_test is reassigned to the
# TF-IDF matrix in the next cell).
X_test = df_test.loc[:, 'text']
X_test.head()

In [None]:
def _preprocess_comment(text):
    """Run one raw comment through the cleaning steps listed for the training text."""
    # Steps 1-3: normalise, then drop the domain-specific words.
    words = [w for w in clean_text(text).split() if w not in unrelevant_words]
    # Step 4: stop-word removal.
    words = stopwords(' '.join(words), stop_words).split()
    # Step 5: lemmatize each remaining token.
    return ' '.join(lemmatizer.lemmatize(w) for w in words)

# The vectorizer was fitted on cleaned comments, so the test comments must be
# cleaned before they are vectorised; raw text tokenises differently (for
# example around apostrophes and digits) and would miss the learned vocabulary.
# transform() (not fit_transform()) keeps the vocabulary learned on the training text.
X_test = tfidf.transform(df_test['text'].apply(_preprocess_comment))
X_test.shape

In [None]:
# Class probabilities for the test comments from the calibrated model fitted
# earlier; the displayed value is the number of scored rows.
y_test = clf.predict_proba(X_test)
y_test.shape[0]

In [None]:
# Use the second probability column (index 1) as the comment's score.
positive_class_proba = y_test[:, 1]
df_test['score'] = positive_class_proba
df_test['score'].head()

In [None]:
df_test.head()

In [None]:
# Write the id/score pairs to submission.csv without the index column.
submission = df_test.loc[:, ['comment_id', 'score']]
submission.to_csv("submission.csv", index=False)

In [None]:
# model = LogisticRegression(random_state=0)
# # X_train, y_train = train_test_split(features, labels)
# model.fit(features, labels)
# y_pred = model.predict(X_test)
# len(y_pred)