In [1]:
import numpy as np
import pandas as pd 
import tensorflow as tf
import tensorflow.keras as keras
from sklearn.model_selection import train_test_split
import re
import string
import unicodedata
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import seaborn as sns

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vassi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In order to download the 'train.csv' and 'test.csv' files, follow the link: https://drive.google.com/drive/folders/1GtFMaqWYieUR9A6D3gO_u8QjsBVkIi8H?usp=sharing

In [2]:
location = 'C:/Users/vassi/Desktop/'
train = pd.read_csv(location+'train.csv')
test = pd.read_csv(location+'test.csv')
train = train[~((train['toxic']==0) & (train['severe_toxic']==0) & (train['obscene']==0) & (train['threat']==0) & (train['insult']==0) & (train['identity_hate']==0))]
train = train.reset_index().drop(['index', 'id'], axis=1)
test = test.reset_index()

In [3]:
def remove_stopwords(text):
    sw_nltk = stopwords.words('english')
    sw_nltk.append('st')
    sw_nltk.append('nd')
    sw_nltk.append('rd')
    sw_nltk.append('th')
    sw_nltk.append('rt')
    words = [word for word in text.split() if word.lower() not in sw_nltk]
    new_text = " ".join(words)
    return new_text

def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation'''
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    text = ''.join(i for i in text if not i.isdigit())
    return text

In [5]:
# delete nan messages and clean the messages

train=train[~train['comment_text'].isna()]
train['comment_text']=train['comment_text'].apply(lambda x: clean_text(x))

#remove empty comments and stopwords
train = train[~train['comment_text'].isin(['', ' ', np.nan, None, []])]
train['comment_text']=train['comment_text'].apply(lambda x: remove_stopwords(x))

#remove empty comments after the removal of stopwords
train = train[~train['comment_text'].isin(['', ' ', np.nan, None, []])]

In [6]:
X_train, X_test = train_test_split(train, random_state=420, test_size=0.2, shuffle=True)
y_train = X_train.drop(['comment_text'],axis=1)
X_train = X_train.comment_text
y_test = X_test.drop(['comment_text'],axis=1)
X_test = X_test.comment_text

In [7]:
df_toxic = train.drop(['comment_text'], axis=1)
counts = []
categories = list(df_toxic.columns.values)

In [9]:
# Define a pipeline combining a text feature extractor with multi lable classifier

NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])
# train the model using X_dtm & y
NB_pipeline.fit(X_train, y_train)
# compute the testing accuracy
prediction = NB_pipeline.predict(X_test)
print('Test accuracy is {}'.format(accuracy_score(y_test, prediction)))

Test accuracy is 0.41479198767334363


In [8]:
SVC_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
            ])

# train the model using X_dtm & y
SVC_pipeline.fit(X_train, y_train)
# compute the testing accuracy
prediction = SVC_pipeline.predict(X_test)
print('Test accuracy is {}'.format(accuracy_score(y_test, prediction)))

Test accuracy is 0.4366718027734977


In [10]:
LogReg_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
            ])

# train the model using X_dtm & y
LogReg_pipeline.fit(X_train, y_train)
# compute the testing accuracy
prediction = LogReg_pipeline.predict(X_test)
print('Test accuracy is {}'.format(accuracy_score(y_test, prediction)))

Test accuracy is 0.47889060092449925


In [11]:
from sklearn.ensemble import RandomForestClassifier

RanFor_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(RandomForestClassifier(), n_jobs=1)),
            ])

# train the model using X_dtm & y
RanFor_pipeline.fit(X_train, y_train)
# compute the testing accuracy
prediction = RanFor_pipeline.predict(X_test)
print('Test accuracy is {}'.format(accuracy_score(y_test, prediction)))

Test accuracy is 0.462557781201849


In [12]:
df2 = pd.read_csv(location+'Chat.csv')

In [14]:
X_train, X_test = train_test_split(df2, random_state=420, test_size=0.2, shuffle=True)
y_train = X_train.most_common_report_reason
X_train = X_train.message
y_test = X_test.most_common_report_reason
X_test = X_test.message

In [15]:
# Define a pipeline combining a text feature extractor with multi lable classifier

NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])
# train the model using X_dtm & y
NB_pipeline.fit(X_train, y_train)
# compute the testing accuracy
prediction = NB_pipeline.predict(X_test)
print('Test accuracy is {}'.format(accuracy_score(y_test, prediction)))

Test accuracy is 0.33561192448344035


In [16]:
SVC_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
            ])

# train the model using X_dtm & y
SVC_pipeline.fit(X_train, y_train)
# compute the testing accuracy
prediction = SVC_pipeline.predict(X_test)
print('Test accuracy is {}'.format(accuracy_score(y_test, prediction)))

Test accuracy is 0.33273891825069674


In [17]:
LogReg_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
            ])

# train the model using X_dtm & y
LogReg_pipeline.fit(X_train, y_train)
# compute the testing accuracy
prediction = LogReg_pipeline.predict(X_test)
print('Test accuracy is {}'.format(accuracy_score(y_test, prediction)))

Test accuracy is 0.3347633754161982


In [None]:
from sklearn.ensemble import RandomForestClassifier

RanFor_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(RandomForestClassifier(), n_jobs=1)),
            ])

# train the model using X_dtm & y
RanFor_pipeline.fit(X_train, y_train)
# compute the testing accuracy
prediction = RanFor_pipeline.predict(X_test)
print('Test accuracy is {}'.format(accuracy_score(y_test, prediction)))