In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#for text pre-processing
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used further down. A resource that is already
# installed is reported as up-to-date rather than downloaded again.
nltk.download('punkt')                       # tokenizer models for word_tokenize
nltk.download('averaged_perceptron_tagger')  # tagger model for nltk.pos_tag
nltk.download('wordnet')                     # lexical database for WordNetLemmatizer
nltk.download('stopwords')                   # word lists for stopwords.words('english')

#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#for word embedding
import gensim
from gensim.models import Word2Vec

[nltk_data] Downloading package punkt to /Users/zdravko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/zdravko/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/zdravko/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [51]:
# Load the comments table; the first row of the CSV supplies the column names.
data = pd.read_csv('comments.csv', sep=',', header=0)
# Not rendered: a cell only displays its last expression.
data.head()

# Spot-check a window of ten rows among the comments flagged as replies.
data[data['Is Reply'] == True][6600:6610]

Unnamed: 0,Comments,Comment ID,Reply Count,Is Reply,Author,AuthorID
13040,👆👆Congratulation you have been selected among ...,UgwZdOmybEXnE59gWDJ4AaABAg.9a5phea4RK-9a5pje4ibaj,0,True,Telegram me@mrbeast453,UCpd3kplR9EdsvjbCAtp1JPw
13042,This real?,UgyplM0Mxw9MGOaTufB4AaABAg.9a5pQPe__Yg9a5r_cjVepc,0,True,LuchoXDD,UCu7MB0zqeYeC1nbqwJpT9sQ
13043,Thanks for watching and commenting send a dire...,UgyplM0Mxw9MGOaTufB4AaABAg.9a5pQPe__Yg9a5pfEbP4cd,0,True,WhatsApp👉+①⑦①④⑤⑧⑦⑤⑥⑦⑦,UCLlW4UJkWDqW9Ht4u8LRztw
13044,🆙Congratulations! You've won the iPhone 13 pro...,UgyplM0Mxw9MGOaTufB4AaABAg.9a5pQPe__Yg9a5pROf4bhJ,0,True,Telegram Me 🔗IamMattJones,UClLJBDGFBNEvfuqO7mnISDA
13046,Hay bro,Ugx9BcyffyeUY04bSZV4AaABAg.9a5pPwGaBsn9a6Iw7cAVSq,0,True,Alpixi,UCyVbB_mQsLtyDsXx7hWjqyg
13047,Soy un boi este no es mi cueta,Ugx9BcyffyeUY04bSZV4AaABAg.9a5pPwGaBsn9a5r_EO0WI3,0,True,Alpixi,UCyVbB_mQsLtyDsXx7hWjqyg
13048,envíame un mensaje ahora mismo,Ugx9BcyffyeUY04bSZV4AaABAg.9a5pPwGaBsn9a5py8YfWe0,0,True,WhatsApp👉+①⑦①④⑤⑧⑦⑤⑥⑦⑦,UCblmGPQJ1_O_gBMmGRiVM9Q
13049,Soy de Ecuador,Ugx9BcyffyeUY04bSZV4AaABAg.9a5pPwGaBsn9a5pl-mh0ZR,0,True,Alpixi,UCyVbB_mQsLtyDsXx7hWjqyg
13050,Sabe a lar ingles,Ugx9BcyffyeUY04bSZV4AaABAg.9a5pPwGaBsn9a5ph9T8lwS,0,True,Alpixi,UCyVbB_mQsLtyDsXx7hWjqyg
13052,No hablo ingles no entiendo el comentario,UgwjMvpoD-z6UFvKd7R4AaABAg.9a5pGzGQPfP9a5qsh45tU2,0,True,Super drenx,UCde_fu2FRIZLQwrtzlD69sg


In [53]:
# Channel IDs treated as scam authors; isScam() tests data['AuthorID'] values
# for membership in this list. Several IDs are listed more than once, which
# has no effect on the membership test.
scamChannels = ['UCwgMHGG8IDcYjjSXtYbE9rQ', 'UC8Oy99fOvCjHfbvTImDXpkg',
'UCswOElw6g7pEeAA5Mu15p3Q', 'UCIknJ8HTOMLSfIAmtsm84vA', 'UCsXIMkerN0ofYQVIvNWr7Dg', 'UC5iFMKp-Tuf2RX8wTDIA00w',
'UC4BTXtDeOzz85XMShFWY1VA', 'UCIIab0_13sTLxQY66QjpM-g', 'UCZIIrgNdsq0MPdbG6utOLmw', 'UCf3mY9fz0oesuFS4fUnhjsg',
'UCii92DZYgGqYquT71D9cWSQ', 'UCSkoxrUobYERf5FI1F0KDTg', 'UCbdM2ysSVcPxHghp50tiPQw', 'UC1Wi-CaZ_u111EET3es-jzA',
'UCJB7ZvEbo-xRZum_3Zmq9sw', 'UCe5DcGeti6adyFedf49MRuA', 'UCfOYoX3Cwcn0A8t32c3cM5g', 'UCIAYvaVP15Cel4NZ1ib_ADA',
'UC8Oy99fOvCjHfbvTImDXpkg', 'UCgvlSYolCHq5JiiosIvM6lw', 'UCcxxhVKa9wVvHXscwMXC2gQ', 'UCs5NZ-lIbR_rvYDaKvBrVFw',
'UCHCtAgNRSrcBMRqU5SfHplw', 'UC07gLBHCOrcOyE0bw8Lv8Jg', 'UCnOW6JnwbE46hKOPyDYxc_w', 'UCLlW4UJkWDqW9Ht4u8LRztw',
'UCoPBjpeflrKudIDsvDEV0aw', 'UCcoKvgQgWLXosFP5giUeI4g', 'UChYGnlCGDagZT0F8GYhDY2w', 'UCapA77PmpW9dEZQxxJY7TIg',
'UCfdoVOJ9krvIZReKoLWAelA', 'UCU5WWthBfCSYPt-YRNRZYPQ', 'UCDDbjiG_3PHqn4_8BRMr9yA', 'UCVnjpK8HoaeYGQCJUjtiq6g',
'UCkJePyY8lKMvWSF9S-X_hpQ', 'UCcoKvgQgWLXosFP5giUeI4g', 'UCcoKvgQgWLXosFP5giUeI4g', 'UCwRfjTiJGrONLlNHFQ0mQ2A',
'UCJoDuLaZaOmmTLk9IVst6OA', 'UCk4bBXoqPm8j2FjyStas7jQ', 'UC6M_UOk3D8ZG0J2HmuK6J7g', 'UCkJePyY8lKMvWSF9S-X_hpQ',
'UCk4bBXoqPm8j2FjyStas7jQ', 'UCgHM7Mjvat503rAX9jlFf3w', 'UCzywEh2M-ReJTvlPh-jA1-w', 'UC1Wi-CaZ_u111EET3es-jzA',
'UClmn1s3aSJcqsGoQG7ZV5rg', 'UCIAYvaVP15Cel4NZ1ib_ADA', 'UC8Oy99fOvCjHfbvTImDXpkg', 'UCcxxhVKa9wVvHXscwMXC2gQ', 
'UCHCtAgNRSrcBMRqU5SfHplw', 'UC07gLBHCOrcOyE0bw8Lv8Jg', 'UCHCtAgNRSrcBMRqU5SfHplw', 'UCk4bBXoqPm8j2FjyStas7jQ', 
'UClmXrUcfU0A3BWSCU1XouNg', 'UCDhBrg4uKxdc0u5mv2x9GAA', 'UCwgMHGG8IDcYjjSXtYbE9rQ', 'UChYGnlCGDagZT0F8GYhDY2w', 
'UCcoKvgQgWLXosFP5giUeI4g', 'UC4A5Ov8NNZ8K1Lq5cfcESUg', 'UC7qpkq26VEDSe6H3ZwTMWJg', 'UCgvlSYolCHq5JiiosIvM6lw',
'UCk44Jx55IZkD57On49NFtiw', 'UCPDuYbsUJxNexaCBruG-jkw', 'UCOqxwnL0H8oCTHlBNO9uSGQ', 'UCnOW6JnwbE46hKOPyDYxc_w', 
'UCblmGPQJ1_O_gBMmGRiVM9Q', 'UC4IerKEP2BDFd2VVXLaeU7w', 'UCVnjpK8HoaeYGQCJUjtiq6g', 'UChoTld7IoAinX5E7nYTZJVg',
'UCwgMHGG8IDcYjjSXtYbE9rQ', 'UC4IerKEP2BDFd2VVXLaeU7w', 'UCYbUqEvxVhVh71wBbrgvyGA', 'UCEmzN-PVaElRDmdlBFbMW1A', 
'UCNBOZg8v0cFe2RYvSgJ_K3A', 'UCpTKksadLBwHiFOdtxVq5Sg', 'UCkGvXvq42A152M8S5xOMbBw', 'UC4kPR2QLYHmURMMr5ERPAcA', 
'UCJ4a59zmaTKVThYBU-ZZdFA', 'UCV_Qh1S9gCCXm2XBQRy0v8Q', 'UCsXIMkerN0ofYQVIvNWr7Dg', 'UCgP6tTK8LrlhK7LMhExs6eQ', 
'UCRqHZqWr81nXW0Qr2MVcD-g', 'UCpTKksadLBwHiFOdtxVq5Sg', 'UC1Wi-CaZ_u111EET3es-jzA', 'UCfH5_gBNYMszkN-6P1SAC1Q', 
'UCfH5_gBNYMszkN-6P1SAC1Q', 'UC8Oy99fOvCjHfbvTImDXpkg', 'UCYGbbdOfUAtBOFoiifA4cpw', 'UCFSHMdKANs7Ee_xAfvfX7ug', 
'UChYuxekYJM1HXY6ThthPR8A', 'UCXLbuE7gWN6CVyliQHKP62w', 'UC1Wi-CaZ_u111EET3es-jzA', 'UCYGbbdOfUAtBOFoiifA4cpw',
'UCpd3kplR9EdsvjbCAtp1JPw', 'UCOLzBAcpeCiue5PnlPSvczQ', 'UCpd3kplR9EdsvjbCAtp1JPw', 'UCMhUkjPmAi9xrbskMrdNPWA',
'UCLlW4UJkWDqW9Ht4u8LRztw'
]


def isScam(id):
    """Return 1 when `id` is listed in `scamChannels`, otherwise 0."""
    return 1 if id in scamChannels else 0


# Label every comment: 1 when its author's channel ID is in scamChannels, else 0.
data['Is Scam'] = data['AuthorID'].apply(isScam)

# Rows 0..29999 carry the label and are used for model building; the next
# 5,000 rows are kept without the label column for the final prediction step.
# The hold-out slice starts at row 30000 so that no row is dropped between the
# two sets (a 30001 start would silently skip row 30000).
# .copy() is applied last so both frames are independent objects and adding
# columns to them later does not raise SettingWithCopyWarning.
train = data.iloc[0:30000][['Comments', 'Is Reply', 'Author', 'Is Scam']].copy()
test = data.iloc[30000:35000][['Comments', 'Is Reply', 'Author']].copy()

In [54]:
# Class balance of the training rows: number of comments per 'Is Scam' value.
x = train['Is Scam'].value_counts()
x

0    22979
1     7021
Name: Is Scam, dtype: int64

In [None]:
# Whitespace-separated token count per comment (non-string values are stringified first).
train['Word Count'] = train['Comments'].apply(lambda comment: len(str(comment).split()))

# Mean comment length, scam class first, then non-scam.
for label in (1, 0):
    print(train.loc[train['Is Scam'] == label, 'Word Count'].mean())

In [None]:
# PLOTTING WORD-COUNT
# Side-by-side histograms of words per comment, one panel per class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
panels = (
    (ax1, 1, 'red', 'Scam Comments'),
    (ax2, 0, 'green', 'Non-scam Comments'),
)
for axis, label, colour, title in panels:
    train_words = train[train['Is Scam'] == label]['Word Count']
    axis.hist(train_words, color=colour)
    axis.set_title(title)
fig.suptitle('Words per comment')
plt.show()

In [None]:
#convert to lowercase, strip and remove punctuations
def preprocess(text):
    """Normalise a raw comment into lowercase, space-separated word characters.

    Steps, in order: stringify and lowercase the input, drop ``<...>`` tags,
    turn ASCII punctuation into spaces, delete any remaining character that is
    neither a word character nor whitespace, turn decimal digits into spaces,
    and finally collapse whitespace runs and trim both ends.

    Parameters
    ----------
    text : any
        Value to clean; it is passed through ``str()`` first, so non-string
        inputs (e.g. ``None`` or NaN) are cleaned as their string form.

    Returns
    -------
    str
        The cleaned text, with no leading or trailing whitespace.
    """
    text = str(text).lower().strip()
    text = re.sub(r'<.*?>', '', text)
    # Punctuation becomes a space (not nothing) so "a,b" stays two words.
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    # Whatever non-word symbols are left are removed outright.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d', ' ', text)
    # Collapse last: the substitutions above can create runs of spaces and
    # leave a space at either end (e.g. when the text ends in digits).
    return re.sub(r'\s+', ' ', text).strip()

 
# STOPWORD REMOVAL
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return the NLTK English stop-word list as a frozenset, loading it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def stopword(string):
    """Remove English stop words from a whitespace-separated string.

    Parameters
    ----------
    string : str
        Text to filter; it is split on whitespace.

    Returns
    -------
    str
        The remaining words joined by single spaces, in their original order.
    """
    # Look the stop-word set up once per call instead of once per token.
    english = _english_stopwords()
    return ' '.join(word for word in string.split() if word not in english)

#LEMMATIZATION
# Single WordNetLemmatizer instance, shared by every lemmatizer() call
wl = WordNetLemmatizer()
 
# Helper that maps NLTK part-of-speech tags onto WordNet categories
def get_wordnet_pos(tag):
    """Translate a POS tag into the matching `wordnet` part-of-speech constant.

    Tags starting with 'J', 'V' and 'R' map to adjective, verb and adverb;
    every other tag (including those starting with 'N') maps to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    return wordnet.NOUN
# Tokenize the sentence, tag each token, then lemmatize it
def lemmatizer(string):
    """Lemmatize every token of `string` and return them joined by spaces.

    The text is tokenized with `word_tokenize` and tagged with `nltk.pos_tag`;
    each token is then lemmatized by `wl` using the WordNet category that
    `get_wordnet_pos` derives from its tag.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(string))
    lemmas = []
    for token, pos_tag in tagged_tokens:
        lemmas.append(wl.lemmatize(token, get_wordnet_pos(pos_tag)))
    return " ".join(lemmas)

def finalpreprocess(string):
    """Run the full cleaning chain: preprocess, then stopword, then lemmatizer."""
    cleaned = preprocess(string)
    without_stopwords = stopword(cleaned)
    return lemmatizer(without_stopwords)
# Cleaned version of every training comment, stored next to the raw text.
train['Clean Comment'] = train['Comments'].apply(finalpreprocess)

In [None]:
# Preview the first training rows after the preceding cells have added their columns.
train.head()

In [None]:
SEED = 42  # fixed so the train/validation split (and the metrics) repeat across runs

# Token lists of the cleaned comments; these are the sentences Word2Vec learns from.
train['Clean text tokens'] = [nltk.word_tokenize(i) for i in train['Clean Comment']]
# min_count=1 keeps every token, including those that occur only once.
# NOTE(review): the embedding is fitted on all rows of `train`, including the
# ones that end up in X_test below — confirm this is intended.
model = Word2Vec(train['Clean text tokens'], min_count=1)
# Plain dict token -> vector, the form MeanEmbeddingVectorizer expects.
w2v = dict(zip(model.wv.index_to_key, model.wv.vectors))

# 60/40 shuffled split; random_state pins the shuffle so results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train['Clean Comment'],
    train['Is Scam'],
    test_size=0.4,
    shuffle=True,
    random_state=SEED,
)

# Token lists for both splits (input for the Word2Vec-based features).
X_train_tok = [nltk.word_tokenize(i) for i in X_train]
X_test_tok = [nltk.word_tokenize(i) for i in X_test]

In [None]:
#Tf-Idf
# The vocabulary and IDF weights are learned from X_train only; X_test is
# transformed with that already-fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) 
X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test)

#building Word2Vec model
class MeanEmbeddingVectorizer(object):
    """Turn token lists into fixed-size vectors by averaging word embeddings.

    Parameters
    ----------
    word2vec : dict
        Mapping from token to its embedding vector. All vectors are expected
        to have the same length; that length is read from the first entry.

    Raises
    ------
    ValueError
        If `word2vec` is empty, because the embedding size cannot be inferred.
    """

    def __init__(self, word2vec):
        if not word2vec:
            raise ValueError(
                "word2vec mapping is empty; cannot infer the embedding dimension"
            )
        self.word2vec = word2vec
        # Dimensionality of the zero vector returned for texts without any
        # known token; taken from the first embedding in the mapping.
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y=None):
        """Do nothing and return `self`; there is nothing to learn.

        `y` is optional so the object can be called as `fit(X)` as well as
        `fit(X, y)`.
        """
        return self

    def transform(self, X):
        """Return one averaged embedding per token list in `X`.

        Tokens missing from the mapping are ignored. A token list with no
        known token (or an empty one) yields a vector of zeros of length
        `self.dim`.
        """
        zero_vector = np.zeros(self.dim)
        rows = []
        for words in X:
            known = [self.word2vec[w] for w in words if w in self.word2vec]
            rows.append(np.mean(known, axis=0) if known else zero_vector)
        return np.array(rows)



modelw = MeanEmbeddingVectorizer(w2v)

# converting text to numerical data using Word2Vec
# (train tokens first, then test tokens, each averaged into one vector per comment)
X_train_vectors_w2v, X_test_vectors_w2v = (
    modelw.transform(token_lists) for token_lists in (X_train_tok, X_test_tok)
)

In [None]:
#FITTING THE CLASSIFICATION MODEL using Logistic Regression(tf-idf)
lr_tfidf = LogisticRegression(penalty='l2', C=10, solver='liblinear')
lr_tfidf.fit(X_train_vectors_tfidf, y_train)  #model

# Scores on the validation split: probability of class 1 and the predicted labels.
y_prob = lr_tfidf.predict_proba(X_test_vectors_tfidf)[:, 1]
y_predict = lr_tfidf.predict(X_test_vectors_tfidf)

print(classification_report(y_test, y_predict))
print('Confusion Matrix:', confusion_matrix(y_test, y_predict))

# Area under the ROC curve, computed from the class-1 probabilities.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

In [None]:
#Pre-processing the new dataset
test['Clean Comments'] = test['Comments'].apply(finalpreprocess)  #preprocess the data

# Dedicated names are used here so the validation objects X_test, y_predict
# and y_prob keep their values; overwriting them would make the evaluation
# cell compare hold-out predictions against y_test if it were re-run.
X_new = test['Clean Comments']
#converting words to numerical data using tf-idf (vectorizer fitted on X_train)
X_vector = tfidf_vectorizer.transform(X_new)
#use the best model to predict 'target' value for the new dataset
new_predict = lr_tfidf.predict(X_vector)
new_prob = lr_tfidf.predict_proba(X_vector)[:, 1]  # probability of class 1

test['predict_prob'] = new_prob
test['Is Scam'] = new_predict
# Compact result table with a fresh 0..n-1 index.
final = test[['Clean Comments', 'predict_prob', 'Is Scam']].reset_index(drop=True)

In [None]:
# Display the prediction table: cleaned comment, predicted probability and predicted label.
final