In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

data = pd.read_csv("Youtube01-Psy.csv")
print(data.sample(5))

                                COMMENT_ID                    AUTHOR  \
44     z13stv3brxe1snv2i225fnvganneudej004            MFkin PRXPHETZ   
215    z124g1oy2wfitb3tw23bw1po2ozhurojq04             LaunchPad Mad   
201  z13qgpqwlqbgj5jqy04cgnjqruv0gpdaino0k                    css403   
186      z133xzqp3ubgyf2ko22osxkrfzvbd5b2y  TheInfectedDoge Gameplay   
324    z135zz0xwx2jetyt523mejn40qifw5hjo04          Jennifer Isaksen   

                    DATE                                            CONTENT  \
44   2014-01-20T09:08:39  if you like raw talent, raw lyrics, straight r...   
215  2014-11-07T18:11:56  Hello! I'm kind of new to Youtube, And soon i'...   
201  2014-11-07T14:25:48                      i am 2,126,492,636 viewer :D﻿   
186  2014-11-07T05:04:28                    most viewed video in the world﻿   
324  2014-11-12T15:12:47  Hahah, juyk! I allways laugh at the part 1:57....   

     CLASS  
44       1  
215      1  
201      0  
186      0  
324      0  


In [2]:
data = data[["CONTENT", "CLASS"]]
print(data.sample(5))

                                               CONTENT  CLASS
182                             OPPA GANGNAM STYLE!!!﻿      0
14   please like :D https://premium.easypromosapp.c...      1
57   Subscribe and like to me for more how to video...      1
8      You should check my channel for Funny VIDEOS!!﻿      1
12                https://twitter.com/GBphotographyGB﻿      1


In [3]:
data["CLASS"] = data["CLASS"].map({0: "Not Spam",
                                   1: "Spam Comment"})
print(data.sample(5))

                                               CONTENT         CLASS
93   Does anyone here use gift cards like Amazon, i...  Spam Comment
134                              ❤️ ❤️ ❤️ ❤️ ❤️❤️❤️❤️﻿      Not Spam
142  pls http://www10.vakinha.com.br/VaquinhaE.aspx...  Spam Comment
1    Hey guys check out my new channel and our firs...  Spam Comment
129  Like getting Gift cards..but hate spending the...  Spam Comment


### Training a Classification Model

In [4]:
x = np.array(data["CONTENT"])
y = np.array(data["CLASS"])

cv = CountVectorizer()
x = cv.fit_transform(x)
xtrain, xtest, ytrain, ytest = train_test_split(x, y, 
                                                test_size=0.2, 
                                                random_state=42)

model = BernoulliNB()
model.fit(xtrain, ytrain)
print(model.score(xtest, ytest))

0.9857142857142858


### Now let’s test the model by giving spam and not spam comments as input:

In [5]:
sample = "Check this out: https://thecleverprogrammer.com/" 
data = cv.transform([sample]).toarray()
print(model.predict(data))

['Spam Comment']


In [6]:
sample = "Lack of information!" 
data = cv.transform([sample]).toarray()
print(model.predict(data)) 

['Not Spam']
