In [1]:
# Supress warnings

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Importing required libraries

import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

In [3]:
# Loading the training data set
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
print(df.shape)
df.head()

(1157, 5)


Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU,Julius NM,2013-11-07T06:20:48,"Huh, anyway check out this you[tube] channel: ...",1
1,z13jhp0bxqncu512g22wvzkasxmvvzjaz04,ElNino Melendez,2013-11-09T08:28:43,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1
2,z13fwbwp1oujthgqj04chlngpvzmtt3r3dw,GsMega,2013-11-10T16:05:38,watch?v=vtaRGgvGtWQ Check this out .﻿,1
3,z13lfzdo5vmdi1cm123te5uz2mqig1brz04,ferleck ferles,2013-11-27T21:39:24,Subscribe to my channel ﻿,1
4,z12avveb4xqiirsix04chxviiljryduwxg0,BeBe Burkey,2013-11-28T16:30:13,and u should.d check my channel and tell me wh...,1


In [4]:
# Checking for null values across columns
df.isnull().sum()

COMMENT_ID      0
AUTHOR          0
DATE          138
CONTENT         0
CLASS           0
dtype: int64

In [5]:
# Null present only in DATE
df[df.isnull().any(axis=1)]

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
693,z12rwfnyyrbsefonb232i5ehdxzkjzjs2,Lisa Wellas,,+447935454150 lovely girl talk to me xxx﻿,1
694,z13vsfqirtavjvu0t22ezrgzyorwxhpf3,Ajkal Khan,,"my sister just received over 6,500 new <a rel=...",1
696,z13xjfr42z3uxdz2223gx5rrzs3dt5hna,Jihad Naser,,Hello I&#39;am from Palastine﻿,1
697,z12zgrw5furdsn0sc233hfwavnznyhicq,kyeman13,,Go check out my rapping video called Four Whee...,1
699,z12gxdortqzwhhqas04cfjrwituzghb5tvk0k,Muhammad Asim Mansha,,Aslamu Lykum... From Pakistan﻿,1
...,...,...,...,...,...
922,LneaDw26bFs2eQd05L_J9wVmiBlCClqLnM0JUQsB-3Q,YungDreams,,Maybe no one will probably read this. But just...,1
930,LneaDw26bFtn6YS8GRfHBxoaZZI5newNd0njNOSeF84,canku alkan,,check out my new EM cover video trailer,1
935,LneaDw26bFvEGhYWZTzIRiff9BXME_JsdLXh4hiJEqo,Daniel Ebrey,,sorry for the spam yall I know it’s annoying. ...,1
937,LneaDw26bFvJWycbUHAiKy7i7L14RqXWenvvcJhwbQE,David Bottenberg,,subscribe to my channel /watch?v=NxK32i0HkDs,1


In [6]:
# COMMENT_ID seems gibberish
df.COMMENT_ID.unique

<bound method Series.unique of 0       LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU
1               z13jhp0bxqncu512g22wvzkasxmvvzjaz04
2               z13fwbwp1oujthgqj04chlngpvzmtt3r3dw
3               z13lfzdo5vmdi1cm123te5uz2mqig1brz04
4               z12avveb4xqiirsix04chxviiljryduwxg0
                           ...                     
1152    _2viQ_Qnc6_RKHVetk9kLzx8ZC62_J7y73FWFSBTe8Q
1153    _2viQ_Qnc68TufyXKiTwky80ewSPbhRiD5XFHrJH9lg
1154    _2viQ_Qnc6-pY-1yR6K2FhmC5i48-WuNx5CumlHLDAI
1155    _2viQ_Qnc6_k_n_Bse9zVhJP8tJReZpo8uM2uZfnzDs
1156    _2viQ_Qnc6_yBt8UGMWyg3vh0PulTqcqyQtdE7d4Fl0
Name: COMMENT_ID, Length: 1157, dtype: object>

In [7]:
# Let's also check the uniqueness of Author
df.AUTHOR.value_counts()

M.E.S                  5
Shadrach Grentz        5
Hidden Love            4
5000palo               4
DanteBTV               4
                      ..
Boshra Abdrhman        1
Mauro Tricolor         1
Grace R                1
sreekrishna praveen    1
Aishlin Maciel         1
Name: AUTHOR, Length: 1094, dtype: int64

In [8]:
''' With reference to above anaylsis, we would need to re-index the comment id, drop columns author and date as out of 1157 data points there are 1094 unique authors which is not generalisable '''

' With reference to above anaylsis, we would need to re-index the comment id, drop columns author and date as out of 1157 data points there are 1094 unique authors which is not generalisable '

In [9]:
# Indexing and dropping AUTHOR and DATE columns
# Train set
df['COMMENT_ID'] = df.index
df.set_index('COMMENT_ID', inplace=True)
df.drop(['AUTHOR','DATE'], axis=1, inplace=True)

# Test set
df_test['COMMENT_ID'] = df_test.index
df_test.set_index('COMMENT_ID', inplace=True)
df_test.drop(['AUTHOR','DATE'], axis=1, inplace=True)

In [10]:
y = df.CLASS

In [11]:
# creating a list of comments and removing all symbols
# train set
comments = df.CONTENT
# converting comments into a list
comments  = [re.sub("[^a-zA-Z0-9\s]","",comment) for comment in comments]

#test set
comments_test  = df_test.CONTENT
# converting comments into a list
comments_test  = [re.sub("[^a-zA-Z0-9\s]","",comment) for comment in comments_test]

In [12]:
stemmer = PorterStemmer()
snowball_stemmed = SnowballStemmer(language='english')

In [13]:
def preprocess(document, stem=True):
    
    # changing comments to lower case
    document = document.lower()
    
    # tokenize into words
    words = word_tokenize(document)
    
    # remove stopwords
    words = [word for word in words if word not in stopwords.words("english")]
    
    if stem:
        words = [stemmer.stem(word) for word in words]
    else:
        words = [snowball_stemmed.stem(word) for word in words]
    
    document = " ".join(words)
    
    return document

In [14]:
# Performing the preprocessing steps
# train set
comments = [preprocess(comment, stem=False) for comment in comments]

# test set
comments_test = [preprocess(comment, stem=False) for comment in comments_test]

In [15]:
print(comments[:5])

['huh anyway check youtub channel kobyoshi02', 'shake sexi ass channel enjoy', 'watchvvtarggvgtwq check', 'subscrib channel', 'u shouldd check channel tell next']


In [16]:
vectorizer = TfidfVectorizer()
# train
tfidfmodel = vectorizer.fit_transform(comments)
#test
tfidfmodel_test = vectorizer.transform(comments_test)

In [17]:
# Converting data into dataframe
# train
tfidf = pd.DataFrame(tfidfmodel.toarray(), columns=vectorizer.get_feature_names())
tfidf_test = pd.DataFrame(tfidfmodel_test.toarray(), columns=vectorizer.get_feature_names())
tfidf

Unnamed: 0,002,04,10,100,1000,10000000,1000000000,100100,1015,105,...,youtubeqq,youtuberbr,yr,ytma,yuliya,yust,zesti,zip,zombi,zonepacom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
print(vectorizer.get_feature_names())

['002', '04', '10', '100', '1000', '10000000', '1000000000', '100100', '1015', '105', '109', '11', '12', '128gb', '13', '1337', '14', '15', '157', '16', '16gb', '17', '1700000000', '17yr', '18', '1855mm', '1884034783', '19', '1990', '1bi', '1billion', '1hmvtxbr', '1it', '1k', '1m', '1manband', '20', '200', '2004', '2009', '200k', '2010', '2011', '2012', '2012bitch', '2013', '2014', '2015', '2015br', '2017', '205', '21', '2124821694', '2126521750', '23', '25', '250', '25000', '27', '279898', '2b', '2billion', '2i', '2x109', '30', '300', '3000', '3000000', '314', '32gb', '33', '346', '35', '365', '39cuz', '3d', '3m', '3rd', '40', '4000', '4000dollar', '41', '447935454150', '46', '490000', '4gb', '4netjobscom', '4s', '4th', '50', '500', '5000', '500k', '500m', '50k', '5277478', '55200mm', '5800', '5c', '5million', '5s', '5th', '60', '600', '600606', '60inch', '629', '6500', '666', '666002018', '6th', '700000000', '79', '7k', '800', '800m', '851247920br', '857482940', '860000000', '868', '

In [19]:
models = []
models.append(('LR', LogisticRegression()))
models.append(('NB', GaussianNB()))
models.append(('DT', DecisionTreeClassifier()))
models.append(('RF', RandomForestClassifier()))
models.append(('ABC', AdaBoostClassifier()))
models.append(('GBC', GradientBoostingClassifier()))
models.append(('XGB', XGBClassifier()))

In [20]:
names = []
for name, model in models:
    cv_result = cross_val_score(model, tfidf, y, cv=10, scoring='roc_auc')
    names.append(name)
    msg = "%s : %f (%f)" % (name, cv_result.mean(), cv_result.std())
    print(msg)

LR : 0.944220 (0.050253)
NB : 0.736027 (0.075261)
DT : 0.888869 (0.088461)
RF : 0.909960 (0.085793)
ABC : 0.920304 (0.046801)
GBC : 0.926231 (0.065044)
XGB : 0.945522 (0.039788)


In [22]:
logreg = LogisticRegression()

C_space = [9.2,9.1,9.0,8.9,8.8]
penalty = ['l1','l2']
solvers = ['newton-cg','lbgfs','liblinear']
grid = dict(solver=solvers,penalty=penalty,C=C_space)
cv = StratifiedKFold(n_splits = 10)
grid_search = GridSearchCV(estimator=logreg, param_grid=grid, n_jobs=-1, cv=cv, scoring='roc_auc')
grid_result = grid_search.fit(tfidf,y)
print("Best : %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best : 0.948920 using {'C': 9.0, 'penalty': 'l2', 'solver': 'newton-cg'}


In [23]:
grid_test_result = grid_result.predict(tfidf_test)
grid_test_result

array([1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,

In [24]:
final_df = pd.DataFrame(grid_test_result, columns=['CLASS'])
final_df['ID'] = final_df.index
final_df.set_index('ID', inplace=True)
final_df

Unnamed: 0_level_0,CLASS
ID,Unnamed: 1_level_1
0,1
1,0
2,1
3,0
4,1
...,...
794,1
795,0
796,0
797,0


In [25]:
final_df.to_csv(r'results.csv', index=True)