In [None]:
# Importing necessary libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

## Project 1 - NLP and Text Classification

For this project you will need to classify some angry comments into their respective category of angry. The process that you'll need to follow is (roughly):
<ol>
<li> Use NLP techniques to process the training data. 
<li> Train model(s) to predict which class(es) each comment is in.
    <ul>
    <li> A comment can belong to any number of classes, including none. 
    </ul>
<li> Generate predictions for each of the comments in the test data. 
<li> Write your test data predictions to a CSV file, which will be scored. 
</ol>

You can use any models and NLP libraries you'd like. Think about the problem, look back to see if there's anything that might help, give it a try, and see if that helps. We've regularly said we have a "toolkit" of things that we can use, we generally don't know which ones we'll need, but here you have a pretty simple goal - if it makes it more accurate, it helps. There's not one specific solution here, there are lots of things that you could do. 

## Training Data

Use the training data to train your prediction model(s). Each of the classification output columns (toxic to the end) is a human label for the comment_text, assessing if it falls into that category of "rude". A comment may fall into any number of categories, or none at all. Membership in one output category is <b>independent</b> of membership in any of the other classes (think about this when you plan on how to make these predictions - it may also make it easier to split work amongst a team...). 

In [None]:
# Read the labelled training comments; pandas infers the zip compression from the file name.
train_df = pd.read_csv(filepath_or_buffer="train.csv.zip")
train_df.head(5)

In [None]:
# Flag the comments for which none of the label columns at positions 2-7 is set.
# A vectorised row-sum replaces the per-row Python lambda: same 0/1 result,
# without calling a Python function once per row.
train_df["non_toxic"] = (train_df.iloc[:, 2:8].sum(axis=1) == 0).astype(int)
train_df.head(10)

In [None]:
# Count the comments that carry each label (every column from position 2 on).
# - Summing an equality mask gives 0 for a label without positives, whereas
#   value_counts()[1] raises KeyError in that case.
# - 'classes' (added further down) is skipped so that re-running this cell
#   later does not add a non-label entry to `total`.
total = {}
for col in train_df.columns[2:]:
    if col == "classes":
        continue
    total[col] = int((train_df[col] == 1).sum())
    print(f"{col}: {total[col]}\n")

In [None]:
# 'classes' = how many of the label columns (positions 2-7) are set for each comment.
label_block = train_df.iloc[:, 2:8]
train_df['classes'] = label_block.sum(axis=1)
train_df

In [None]:
# Number of comments for each distinct value of 'classes'.
# value_counts() is computed once instead of once per loop iteration; the keys
# keep the order of first appearance, exactly as produced by unique().
class_counts = train_df['classes'].value_counts()
label_totals = {label: class_counts[label] for label in train_df['classes'].unique()}

label_totals

In [None]:
# x / height series for the two bar charts below.
keys = list(label_totals.keys())
values = list(label_totals.values())
# Second series leaves out the unlabeled bucket (key 0). Filtering by key rather
# than popping position 0 stays correct whatever order the dictionary was built in.
keys1 = [k for k in keys if k != 0]
values1 = [label_totals[k] for k in keys1]

In [None]:
# Two stacked bar charts: comments per number-of-labels, with and without the
# unlabeled (0) bucket. The bars' x positions are label counts and their heights
# are comment counts, so the axis captions are assigned accordingly (they were
# swapped before).
fig, (fr, se) = plt.subplots(2, 1, figsize=(20, 15))

fr.bar(keys, values)
fr.set_xlabel("Number of Classes comments belong to", fontsize=15)
fr.set_ylabel("Total Number of Comments", fontsize=15)
fr.set_title("Plot including unlabeled(0) comments ", fontsize=17)

se.bar(keys1, values1)
se.set_xlabel("Number of Classes comments belong to", fontsize=15)
se.set_ylabel("Total Number of Comments", fontsize=15)
se.set_title("Plot excluding unlabeled(0) comments", fontsize=17)

plt.show()

In [None]:
# Bar charts of comments per class, with and without the 'non_toxic' bucket.
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'non_toxic']
# Heights are looked up by name, so a bar can never be paired with the wrong
# caption if `total` holds extra keys or a different order; a concrete list is
# passed to bar() instead of a dict view.
rude = [total[name] for name in class_names]

fig, (fr, se) = plt.subplots(2, 1, figsize=(20, 15))

fr.bar(class_names, rude)
fr.set_xlabel("Classes", fontsize=15)
fr.set_ylabel("Total Number of Comments", fontsize=15)
fr.set_title("distribution of classes with non toxic values", fontsize=17)

# 'non_toxic' is the last entry of class_names, so [:-1] removes exactly that bar.
se.bar(class_names[:-1], rude[:-1])
se.set_xlabel("Classes", fontsize=15)
se.set_ylabel("Total Number of Comments", fontsize=15)
se.set_title("distribution of classes without non toxic values", fontsize=17)

plt.show()

In [None]:
# Independent copy, so the text cleaning below leaves train_df untouched.
processed_data = train_df.copy(deep=True)

In [None]:
# Preview a handful of raw comments; converting the whole column to a list
# writes every comment of the frame into the cell output.
processed_data['comment_text'].head(10).tolist()

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch every NLTK resource this notebook uses, not only the stop-word list:
#   stopwords          -> stopwords.words()
#   punkt / punkt_tab  -> word_tokenize() (which of the two is needed depends on
#                         the installed NLTK release; an unknown name is skipped)
#   wordnet            -> WordNetLemmatizer inside lemmaTokenizer
for resource in ("stopwords", "punkt", "punkt_tab", "wordnet"):
    nltk.download(resource, quiet=True)

stop_words = set(stopwords.words('english'))

In [None]:
# Preprocess the text data
def preprocess(text, stopword_set=None):
    """Clean one raw comment and remove stop words.

    Steps, in order: drop dotted-quad IP addresses, drop URL tokens (anything
    containing ``http:``/``https:`` or ``www.``, together with characters glued
    to it such as brackets), turn every run of non-letter characters into a
    single space, split on whitespace and discard stop words.

    Parameters
    ----------
    text : str
        Raw comment. Any non-string value (e.g. None or NaN) yields "".
    stopword_set : collection of str, optional
        Lower-case words to discard. Defaults to the notebook-level
        ``stop_words``.

    Returns
    -------
    str
        The remaining words joined by single spaces, original letter case kept.
    """
    if not isinstance(text, str):
        return ""
    if stopword_set is None:
        stopword_set = stop_words

    # Removing IP address
    text = re.sub(r"([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})", "", text)

    # Getting rid of the URLs, even if they are in parentheses. The second
    # alternative uses \S (non-space) on both sides so the whole "www." token is
    # removed, not just the "www." prefix.
    text = re.sub(r"\S*https?:\S*|\S*www\.\S*", "", text)

    # Replace special characters and numbers by a space. Quotation marks are
    # non-letters too, so no separate quote handling is needed.
    text = re.sub(r"[^a-zA-Z]+", " ", text)

    # Remove stopwords. Only letters and spaces are left at this point, so a
    # plain whitespace split is sufficient. The comparison is done in lower case
    # because the stop-word list is lower case, and the filtered words are what
    # gets returned (previously they were computed and then thrown away).
    words = [w for w in text.split() if w.lower() not in stopword_set]
    return " ".join(words)

In [None]:
# Clean every training comment element-wise with preprocess().
cleaned_comments = processed_data['comment_text'].map(preprocess)
processed_data['comment_text'] = cleaned_comments

In [None]:
# Build `working_data`, the down-sampled frame that the plots and models below
# read: every labelled comment plus at most N_NON_TOXIC_KEPT unlabeled ones.
# This step used to be commented out, which left `working_data` undefined and
# made the following cells fail with NameError on a fresh kernel.
N_NON_TOXIC_KEPT = 5000
working_data = processed_data.copy()
C_indexs = working_data[working_data["classes"] == 0].index
# The number of rows to drop is derived from the data instead of being hard-coded.
n_drop = max(len(C_indexs) - N_NON_TOXIC_KEPT, 0)
drop_ind = np.random.RandomState(42).choice(C_indexs, size=n_drop, replace=False)
working_data = working_data.drop(index=drop_ind)
working_data["classes"].value_counts()

In [None]:
# Disabled experiment: keep 5000 of the non_toxic comments and drop the rest.
# NOTE(review): size=138346 is hard-coded - presumably len(C_indexs) - 5000; confirm before re-enabling.
#drop_ind=np.random.choice(C_indexs, size=138346, replace=False)
#working_data.drop(drop_ind, inplace=True)


In [None]:
def Graph(A,B):
    """Draw a 2x2 grid of bar charts comparing two frames.

    Top row: number of comments per class (the six labels plus ``non_toxic``)
    for ``A`` (left) and ``B`` (right). Bottom row: number of comments for each
    value of the ``classes`` column, ``A`` left and ``B`` right.

    Parameters
    ----------
    A, B : pandas.DataFrame
        Frames containing the six label columns, ``non_toxic`` and ``classes``.
        The titles present ``A`` as "before" and ``B`` as "after".

    Returns
    -------
    None
    """
    class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'non_toxic']

    def class_frequencies(frame):
        # Columns are addressed by name. The former positional slice 1:8 started
        # one column too early, so the bars did not line up with these captions
        # and 'non_toxic' was never counted.
        return [int((frame[name] == 1).sum()) for name in class_names]

    def label_count_frequencies(frame):
        # One value_counts() per frame instead of one per distinct value.
        counts = frame['classes'].value_counts().sort_index()
        return counts.index.tolist(), counts.tolist()

    key_A, values_A = label_count_frequencies(A)
    key_B, values_B = label_count_frequencies(B)

    # The figure size is set on this figure only, not through global rcParams.
    fig, axes = plt.subplots(2, 2, figsize=(20, 20))
    panels = [
        (axes[0, 0], class_names, class_frequencies(A),
         "Distribution of classes before \ndeleting non toxic samples"),
        (axes[0, 1], class_names, class_frequencies(B),
         "Distribution of classes After \ndeleting non toxic samples"),
        (axes[1, 0], key_A, values_A,
         "Distribution of total labeled comments before \ndeleting non toxic samples"),
        (axes[1, 1], key_B, values_B,
         "Distribution of total labeled comments after \ndeleting non toxic samples"),
    ]
    for ax, x_values, heights, title in panels:
        ax.bar(x_values, heights)
        ax.set_title(title, fontsize=22)
        ax.set_xlabel("Classes", fontsize=15)
        ax.set_ylabel("Frequency", fontsize=15)
    

# A = full cleaned frame ("before"), B = down-sampled frame ("after").
Graph(A=processed_data, B=working_data)

In [None]:
# Draw a reproducible random subset of at most SAMPLE_SIZE rows and keep only
# the id, the text and the six label columns.
# - random_state makes the subset identical on every run.
# - min(...) avoids the ValueError sample() raises when fewer rows are available.
# - errors='ignore' lets the cell be re-run after the helper columns are gone.
SAMPLE_SIZE = 5000
working_data = (
    working_data
    .sample(n=min(SAMPLE_SIZE, len(working_data)), random_state=42)
    .reset_index(drop=True)
    .drop(columns=['non_toxic', 'classes'], errors='ignore')
)
working_data.head()

In [None]:
#separating the data into test and train sets
#from sklearn.model_selection import train_test_split

#X=working_data.iloc[:,1]
#y=working_data.iloc[:,2:8]

#X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.3)

In [None]:
class lemmaTokenizer(object):
    """Callable tokenizer: word-tokenize, drop stop words, strip punctuation, lemmatize.

    Parameters
    ----------
    stop_words : collection of str
        Tokens to discard; each raw token is tested for membership as-is.
    """

    # Raw string: '\W' in a normal string literal is an invalid escape sequence.
    _NON_WORD = re.compile(r'\W+')

    def __init__(self, stop_words):
        self.stop_words = stop_words
        from nltk.stem import WordNetLemmatizer
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemmatized tokens of ``doc`` that are at least two characters long."""
        filtered_tok = []
        for tok in word_tokenize(doc):
            # Use the list given to the constructor; the notebook-level
            # `stop_words` was consulted here before, so the argument had no effect.
            if tok in self.stop_words:
                continue
            tok = self._NON_WORD.sub('', tok)  # Punctuation strip
            tmp = self.lemmatizer.lemmatize(tok)
            if len(tmp) >= 2:
                filtered_tok.append(tmp)
        return filtered_tok

In [None]:
class stemTokenizer(object):
    """Callable tokenizer: word-tokenize, drop stop words, Snowball-stem the rest.

    Parameters
    ----------
    stop_words : collection of str
        Tokens to discard; each raw token is tested for membership as-is.
    """

    def __init__(self, stop_words):
        self.stop_words = stop_words
        from nltk.stem import SnowballStemmer
        self.stemmer = SnowballStemmer(language='english')

    def __call__(self, doc):
        """Return the stems of all tokens of ``doc`` that are not stop words."""
        # self.stop_words (the constructor argument) is used; the notebook-level
        # `stop_words` was consulted here before, so the argument had no effect.
        return [
            self.stemmer.stem(tok)
            for tok in word_tokenize(doc)
            if tok not in self.stop_words
        ]

In [None]:
class swTokenizer(object):
    """Callable tokenizer: word-tokenize and drop stop words, nothing else.

    Parameters
    ----------
    stop_words : collection of str
        Tokens to discard; each raw token is tested for membership as-is.
    """

    def __init__(self, stop_words):
        self.stop_words = stop_words

    def __call__(self, doc):
        """Return the tokens of ``doc`` that are not stop words, in order."""
        # self.stop_words (the constructor argument) is used; the notebook-level
        # `stop_words` was consulted here before, so the argument had no effect.
        return [tok for tok in word_tokenize(doc) if tok not in self.stop_words]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

#TF-IDF

# Targets are selected by name so the cell does not depend on column positions.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
y = working_data[label_columns]
X = working_data["comment_text"]

# Split the raw text first and fit the vectorizer on the training part only.
# Fitting on all rows before splitting lets the hold-out comments influence the
# vocabulary and the IDF weights, which inflates the hold-out scores.
text_train, text_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,3), stop_words="english", strip_accents="unicode",max_features=5000)
X_train = vectorizer.fit_transform(text_train).toarray()
X_test = vectorizer.transform(text_test).toarray()

In [None]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC, LinearSVC



# Train a support-vector classifier (SVC) for each toxicity label and report
# its accuracy on the hold-out split.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
for label in labels:
    y_train_label = y_train[label]
    y_test_label = y_test[label]
    pipe_steps1 = [("model", SVC())]
    pipe_test1 = Pipeline(steps=pipe_steps1)
    pipe_test1.fit(X_train, y_train_label)
    # Predict once and derive the accuracy from those predictions; the former
    # extra score() call ran a second prediction pass and discarded the result.
    y_pred1 = pipe_test1.predict(X_test)
    accuracy1 = accuracy_score(y_test_label, y_pred1)
    print(f"Accuracy score of {label}: {accuracy1}")

In [None]:
from sklearn.decomposition import TruncatedSVD


# TruncatedSVD (20 components) followed by an SVC, one model per label.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
for label in labels:
    y_train_label = y_train[label]
    y_test_label = y_test[label]
    # random_state pins the randomized SVD so the scores are repeatable.
    svd_tmp2 = TruncatedSVD(n_components=20, random_state=42)
    pipe_steps2 = [("svd", svd_tmp2), ("model", SVC())]
    pipe_test2 = Pipeline(steps=pipe_steps2)
    pipe_test2.fit(X_train, y_train_label)
    # Single prediction pass; the discarded score() call was redundant work.
    y_pred2 = pipe_test2.predict(X_test)
    accuracy2 = accuracy_score(y_test_label, y_pred2)
    print(f"Accuracy score of {label}: {accuracy2}")


In [None]:
from sklearn.ensemble import RandomForestClassifier

# TruncatedSVD (20 components) followed by a random forest, one model per label.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
for label in labels:
    y_train_label = y_train[label]
    y_test_label = y_test[label]
    # Both steps are randomized; fixed seeds make the reported scores repeatable.
    svd_tmp3 = TruncatedSVD(n_components=20, random_state=42)
    pipe_steps3 =[ ('svd', svd_tmp3), ('m', RandomForestClassifier(random_state=42))]
    pipe_test3 = Pipeline(steps=pipe_steps3)
    pipe_test3.fit(X_train, y_train_label)
    # Single prediction pass; the discarded score() call was redundant work.
    y_pred3 = pipe_test3.predict(X_test)
    accuracy3 = accuracy_score(y_test_label, y_pred3)
    print(f"Accuracy score of {label}: {accuracy3}")

In [None]:
# Baseline: one default LogisticRegression per label, scored by hold-out accuracy.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
for label in labels:
    clf4 = LogisticRegression().fit(X_train, y_train[label])
    y_pred4 = clf4.predict(X_test)
    accuracy4 = accuracy_score(y_test[label], y_pred4)
    print(f"Accuracy score of {label}: {accuracy4}")


In [None]:
# LogisticRegression wrapped in GridSearchCV with an empty parameter grid and
# 3-fold cross-validation, one search per label.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
for label in labels:
    clf6 = GridSearchCV(
        LogisticRegression(n_jobs=-1, max_iter=10000),
        param_grid={},
        cv=3,
        n_jobs=-1,
    )
    clf6.fit(X_train, y_train[label])
    y_pred6 = clf6.predict(X_test)
    accuracy6 = accuracy_score(y_test[label], y_pred6)
    print(f"Accuracy score of {label}: {accuracy6}")

In [None]:
# Grid-search the TF-IDF settings (vocabulary size, tokenizer, norm) in front of
# an SVC, one search per label.
# - The "vect__" prefix addresses the pipeline step named "vect", so the search
#   runs on a Pipeline; a bare SVC has no such parameters.
# - The vectorizer is part of the search, so the pipeline is fed the raw text of
#   the same rows that make up y_train / y_test, not the vectorized X_train.
# - The missing "]" after the norm list (a SyntaxError) is restored.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
text_train = working_data.loc[y_train.index, "comment_text"]
text_test = working_data.loc[y_test.index, "comment_text"]
params7 = {"vect__max_features": [100, 500, 1000, 1500, 2000, 2500],
           "vect__tokenizer": [swTokenizer(stop_words), stemTokenizer(stop_words), lemmaTokenizer(stop_words)],
           "vect__norm": ["l1", "l2"]}
for label in labels:
    y_train_label = y_train[label]
    y_test_label = y_test[label]
    # token_pattern=None: a custom tokenizer is always supplied by the grid.
    model7 = Pipeline(steps=[("vect", TfidfVectorizer(token_pattern=None)), ("model", SVC())])

    clf = GridSearchCV(estimator=model7, param_grid=params7, scoring="balanced_accuracy",
                       cv=5, n_jobs=-1)
    clf.fit(text_train, y_train_label)
    y_pred = clf.predict(text_test)
    accuracy = accuracy_score(y_test_label, y_pred)
    print(f"Accuracy score of {label}: {accuracy}")

## Test Data

In [None]:
# Read the comments that need predictions.
test_df = pd.read_csv(filepath_or_buffer="test.csv")
test_df.head(5)

## Output Details, Submission Info, and Example Submission

For this project, please output your predictions in a CSV file. The structure of the CSV file should match the structure of the example below. 

The output should contain one row for each row of test data, complete with the columns for ID and each classification.

Into Moodle please submit:
<ul>
<li> Your notebook file(s). I'm not going to run them, just look. 
<li> Your sample submission CSV. This will be evaluated for accuracy against the real labels; only a subset of the predictions will be scored. 
</ul>

It is REALLY, REALLY, REALLY important that the structure of your output matches the specifications. The accuracies will be calculated by a script, and it is expecting a specific format. 

### Sample Evaluator

The file prediction_evaluator.ipynb contains an example scoring function, scoreChecker. This function takes a submission and an answer key, loops through, and evaluates the accuracy. You can use this to verify the format of your submission. I'm going to use the same function to evaluate the accuracy of your submission, against the answer key (unless I made some mistake in this counting function).

In [None]:
# Train the final classifiers: one LogisticRegression per label, keyed by label name.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
classifiers = {}
for label in labels:
    clf = LogisticRegression()
    clf.fit(X_train, y_train[label])
    # Store the fitted estimator itself. The former second assignment indexed
    # the NumPy array X_train with 'id' (which raises) and would have replaced
    # the estimator by a tuple that has no predict().
    classifiers[label] = clf

In [None]:
# Apply the same text cleaning to the test comments as to the training comments.
cleaned_test_comments = test_df['comment_text'].map(preprocess)
test_df['comment_text'] = cleaned_test_comments

In [None]:
# Vectorize the test comments with the vectorizer that was fitted on the
# training text. Fitting a fresh TfidfVectorizer here would produce a different
# vocabulary, so its columns would not correspond to the features the
# classifiers were trained on.
# The result is left as a sparse matrix (predict() accepts it); a dense copy of
# the full test set with 5000 columns would need a very large amount of memory.
X_test = vectorizer.transform(test_df['comment_text'])

In [None]:
# Predict the toxicity labels for the test data using the trained classifiers.
# Only the prediction is needed here; the former extra line indexed the NumPy
# array X_train with 'id', which raises, and overwrote the stored classifier.
pred_labels = {label: classifiers[label].predict(X_test) for label in labels}
    

In [None]:
# Store the predicted labels in Pre2.csv, one row per test comment.
out_df = pd.DataFrame(pred_labels, columns=labels)
# The submission must carry the comment ID next to the six predictions.
# NOTE(review): assumes test.csv names that column 'id' - confirm against the example submission.
out_df.insert(0, "id", test_df["id"].to_numpy())
out_df.to_csv('Pre2.csv', index=False)

## Grading

The grading for this is split between accuracy and well written code:
<ul>
<li> 75% - Accuracy. The most accurate will get 100% on this, the others will be scaled down from there. 
<li> 25% - Code quality. Can the code be followed and made sense of - i.e. comments, sections, titles. 
</ul>