# YouTube Comments Spam Classification

### Importing Required Libraries

In [8]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score, mean_absolute_error

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB

## Reading the dataset

In [2]:
# Load the comments CSV; the path is relative to the notebook's working directory.
DATASET_PATH = "./Youtube01-Psy.csv"
data = pd.read_csv(DATASET_PATH)

In [3]:
# Preview the first five rows to see which columns were loaded.
data.head(n=5)

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU,Julius NM,2013-11-07T06:20:48,"Huh, anyway check out this you[tube] channel: ...",1
1,LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A,adam riyati,2013-11-07T12:37:15,Hey guys check out my new channel and our firs...,1
2,LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8,Evgeny Murashkin,2013-11-08T17:34:21,just for test I have to say murdev.com,1
3,z13jhp0bxqncu512g22wvzkasxmvvzjaz04,ElNino Melendez,2013-11-09T08:28:43,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1
4,z13fwbwp1oujthgqj04chlngpvzmtt3r3dw,GsMega,2013-11-10T16:05:38,watch?v=vtaRGgvGtWQ Check this out .﻿,1


## Filtering the data for the desired columns

We do not need the `COMMENT_ID`, `AUTHOR` and `DATE` columns in order to classify a comment as spam or not. Hence we can remove them from the dataset, since irrelevant columns would only complicate the model training process.

In [6]:
# Keep only the comment text and its label. `.copy()` makes the result an
# independent frame, so later column assignments do not raise
# SettingWithCopyWarning.
data = data[["CONTENT", "CLASS"]].copy()

# Fixed seed so the previewed rows are the same on every run.
data.sample(5, random_state=42)

Unnamed: 0,CONTENT,CLASS
65,"969,210 dislikes like dislike themselves﻿",0
138,https://www.tsu.co/KodysMan plz ^^﻿,1
8,You should check my channel for Funny VIDEOS!!﻿,1
264,If you pause at 1:39 at the last millisecond y...,0
295,2 billion views wow not even baby by justin be...,0


### Mapping the Class Labels to Spam/Not Spam

In [7]:
# Replace the numeric labels with readable names.
# A plain `map(dict)` turns every value missing from the dict into NaN, so
# running this cell a second time would wipe out all labels. Falling back to
# the existing value makes a re-run a no-op. `assign` builds a new frame
# instead of writing into one that was derived from a column selection.
CLASS_LABELS = {0: "Not Spam", 1: "Spam"}
data = data.assign(
    CLASS=data["CLASS"].map(lambda label: CLASS_LABELS.get(label, label))
)

# Fixed seed so the previewed rows are the same on every run.
data.sample(5, random_state=42)

Unnamed: 0,CONTENT,CLASS
161,Incmedia.org where the truth meets you.﻿,Spam
190,Have you tried a new social network TSU? This ...,Spam
238,2:05. Hahahahah ﻿,Not Spam
248,"Why the fuck this keeps updated? Comments :""5 ...",Not Spam
59,Subscribe ME!﻿,Spam


## Transforming and Splitting the Data

In [19]:
# Raw comment text and labels as arrays.
comments = np.array(data["CONTENT"])
y = np.array(data["CLASS"])

# Split the raw text BEFORE vectorising so the vocabulary is learned from the
# training comments only. Fitting the vectoriser on all comments leaks
# test-set words into the features and can inflate the held-out score.
# The same test_size/random_state keeps the row partition unchanged.
comments_train, comments_test, y_train, y_test = train_test_split(
    comments, y, test_size=0.2, random_state=42
)

cv = CountVectorizer()
X_train = cv.fit_transform(comments_train)  # learn vocabulary + count
X_test = cv.transform(comments_test)        # count only, vocabulary is frozen

# Count matrix for all comments under the training vocabulary, so the name
# `X` defined by this cell stays available.
X = cv.transform(comments)

## Training the Model

In [20]:
# Fit a Bernoulli naive Bayes classifier on the training split, then print
# its score on the held-out split.
model = BernoulliNB().fit(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print(test_accuracy)

0.9857142857142858


## Model Evaluation

In [28]:
def check_spam(sample):
    """Print the model's predicted label for a single comment.

    Args:
        sample: Text of the comment to classify.

    The text is vectorised with the already-fitted module-level ``cv`` and
    passed to the module-level ``model``. The prediction array is printed
    and nothing is returned.
    """
    features = cv.transform([sample]).toarray()
    prediction = model.predict(features)
    print(prediction)

In [29]:
# Spot check: a short promotional-sounding comment.
sample = "Reach me for free content"
check_spam(sample)

['Spam']


In [32]:
# Spot check: a comment that says "check out" but is not self-promotion.
sample = "Check out for the amazing resources on Machine Learning"
check_spam(sample=sample)

['Not Spam']
