In [29]:
import pandas as pd
import eli5
import re
import numpy as np
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.metrics import confusion_matrix, precision_score, precision_recall_curve, recall_score, f1_score

# 1. Import and clean the dataset

In [2]:
#import dataset, remove the 'no match' rows
song_data = pd.read_csv("subsongdata_57650.csv")
song_data = song_data[['artist','song','text','explicit_label']]
song_data = song_data.loc[song_data['explicit_label'] != 'no match']
#remove'\n' from the lyrics
re_drop = re.compile(r'\n')        
song_data[['text']] = song_data[['text']].applymap(lambda x:re_drop.sub(' ',x))

song_data

Unnamed: 0,artist,song,text,explicit_label
1,ABBA,"Andante, Andante","Take it easy with me, please Touch me gently...",False
2,ABBA,As Good As New,I'll never know why I had to go Why I had to...,False
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,False
7,ABBA,Chiquitita,"Chiquitita, tell me what's wrong You're ench...",False
11,ABBA,Dancing Queen,"You can dance, you can jive, having the time o...",False
12,ABBA,Disillusion,"Changing, moving in a circle I can see your ...",False
13,ABBA,Does Your Mother Know,"You're so hot, teasing me So you're blue but...",False
15,ABBA,Dum Dum Diddle,"I can hear how you work, practising hard Pla...",False
18,ABBA,Fernando,Can you hear the drums Fernando? I remember ...,False
21,ABBA,From A Twinkling Star To A Passing Angel,"Twinkle, Twinkle little star How I wonder wh...",False


# 2. Split into training dataset and test dataset

In [4]:
#extract all the rows with explicit_label = True
song_data_1 = song_data.loc[song_data['explicit_label'] == 'True']
#song_data_1

#ramdomly extract 4068 rows with explicit_label = False, which is 3 times as many as song_data_1
song_data_0 = song_data.loc[song_data['explicit_label'] == 'False']
song_data_0 = song_data_0.sample(n=4068, replace=False, random_state=100)
#song_data_0

#ramdomly extract 1356 data as test set, leaving 4068 as training data
x = song_data_0[['artist','song','text']].append(song_data_1[['artist','song','text']])
y = song_data_0[['explicit_label']].append(song_data_1[['explicit_label']])
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y, test_size=1356, random_state=100)
#y_train.loc[y_train['explicit_label']=='True']

In [6]:
train_label = []
for i in range(len(y_train)):
    l = y_train.iloc[i,0]
    if l=="False":
        l = 0
    else:
        l = 1
    train_label.append(l)

test_label = []
for i in range(len(y_test)):
    l = y_test.iloc[i,0]
    if l=="False":
        l = 0
    else:
        l = 1
    test_label.append(l)
    
train_data = []
for i in range(len(x_train)):
    text = x_train.iloc[i,2]
    train_data.append(text)
    
test_data = []
for i in range(len(x_test)):
    text = x_test.iloc[i,2]
    test_data.append(text)

# 4. Customized features

## Import bad words list

In [7]:
file = open('abbo.txt','r')
file = list(file)
bad_words = []
for w in file:
    bad_words.append(re.sub(r'\n','',w))

## Create customized features

In [8]:
class CustomFeats(BaseEstimator, TransformerMixin):
    def __init__(self):
      self.feat_names = set()

    def fit(self, x, y=None):
        return self

    @staticmethod
    def features(review):
      return {
          'num_word': get_num_words(review),
          'bad_word': get_bad_words(review),
          'lda_word': get_lda_words(review)
      }

    def get_feature_names(self):
        return list(self.feat_names)
      
    def transform(self, reviews):
      feats = []
      for review in reviews:
        f = self.features(review)
        [self.feat_names.add(k) for k in f] 
        feats.append(f)
      return feats

#feats = make_pipeline(CustomFeats(), DictVectorizer())
feats = FeatureUnion([
     ('custom', make_pipeline(CustomFeats(), DictVectorizer())),
     ('bag_of_words', TfidfVectorizer(stop_words='english'))
 ])

In [9]:
def get_bad_words(review):
  target_word = bad_words
  count = 0
  threshold = 0
  for t in target_word:
        if review.find(t) != -1:
            count += 1
  return count > threshold

def get_num_words(review):
  threshold = 0
  words = review.split(' ')
  count = len(list(words))
  return count > threshold

def get_lda_words(review):
  target_word = ['chorus','girl','money','baby','nigga','bitch','want','love','wanna','gonna','come','right','shit','feel']
  count = 0
  threshold = 0
  for t in target_word:
        if review.find(t) != -1:
            count += 1
  return count > threshold

# 5. Modelling

In [10]:
def classification(feats, model):  
  train_vecs = feats.fit_transform(train_data)
  test_vecs = feats.transform(test_data)
    
  model.fit(train_vecs, train_label)

  train_preds = model.predict(train_vecs)
  train_f1 = f1_score(train_label, train_preds, average='micro')

  test_preds = model.predict(test_vecs)
  test_f1 = f1_score(test_label, test_preds, average='micro')

  cm = confusion_matrix(test_label, test_preds)
  print("Confusion Matrix : \n", cm, " \n")

  report = classification_report(test_label, test_preds)
  print(report)

  return train_f1,test_f1

## Parameters tunning with grid search

In [66]:
train_vecs = feats.fit_transform(train_data)
test_vecs = feats.transform(test_data)
param_test = {'max_depth':list(range(60,80,1))} #
gsearch = GridSearchCV(estimator = DecisionTreeClassifier(),
                       param_grid = param_test, scoring='recall', iid=False, cv=5)
gsearch.fit(train_vecs, train_label)
gsearch.grid_scores_, gsearch.best_params_, gsearch.best_score_

([mean: 0.75898, std: 0.02110, params: {'max_depth': 60},
  mean: 0.75510, std: 0.03388, params: {'max_depth': 61},
  mean: 0.75413, std: 0.02093, params: {'max_depth': 62},
  mean: 0.74733, std: 0.02037, params: {'max_depth': 63},
  mean: 0.74442, std: 0.01816, params: {'max_depth': 64},
  mean: 0.75219, std: 0.02236, params: {'max_depth': 65},
  mean: 0.74150, std: 0.01547, params: {'max_depth': 66},
  mean: 0.74440, std: 0.02797, params: {'max_depth': 67},
  mean: 0.74830, std: 0.02964, params: {'max_depth': 68},
  mean: 0.74345, std: 0.02489, params: {'max_depth': 69},
  mean: 0.75508, std: 0.02938, params: {'max_depth': 70},
  mean: 0.73859, std: 0.03331, params: {'max_depth': 71},
  mean: 0.75119, std: 0.02418, params: {'max_depth': 72},
  mean: 0.74926, std: 0.02127, params: {'max_depth': 73},
  mean: 0.75023, std: 0.02864, params: {'max_depth': 74},
  mean: 0.75508, std: 0.02027, params: {'max_depth': 75},
  mean: 0.74828, std: 0.02634, params: {'max_depth': 76},
  mean: 0.7424

## Logistic Regression

In [23]:
model_lo= LogisticRegression(C=50)
classification(feats, model_lo)

Confusion Matrix : 
 [[999  30]
 [119 208]]  

             precision    recall  f1-score   support

          0       0.89      0.97      0.93      1029
          1       0.87      0.64      0.74       327

avg / total       0.89      0.89      0.88      1356



(0.9997541789577188, 0.890117994100295)

## Random Forest

In [17]:
model_rf = RandomForestClassifier(n_estimators=110, max_depth=140, min_samples_split=30)
classification(feats, model_rf)

Confusion Matrix : 
 [[1011   18]
 [ 118  209]]  

             precision    recall  f1-score   support

          0       0.90      0.98      0.94      1029
          1       0.92      0.64      0.75       327

avg / total       0.90      0.90      0.89      1356



(0.987708947885939, 0.8997050147492626)

## KNN

In [27]:
model_knn= KNeighborsClassifier(n_neighbors=10) 
classification(feats, model_knn)

Confusion Matrix : 
 [[1014   15]
 [ 206  121]]  

             precision    recall  f1-score   support

          0       0.83      0.99      0.90      1029
          1       0.89      0.37      0.52       327

avg / total       0.85      0.84      0.81      1356



(0.8502949852507374, 0.8370206489675516)

## Decision Tree

In [64]:
model_dt = DecisionTreeClassifier(min_samples_split=0.4, max_depth=77)
classification(feats, model_dt)

Confusion Matrix : 
 [[944  85]
 [ 57 270]]  

             precision    recall  f1-score   support

          0       0.94      0.92      0.93      1029
          1       0.76      0.83      0.79       327

avg / total       0.90      0.90      0.90      1356



(0.9434611602753196, 0.8952802359882006)

## SVM

In [58]:
model_svm = SVC(C = 10000, kernel = 'rbf')
classification(feats, model_svm)

Confusion Matrix : 
 [[1012   17]
 [ 142  185]]  

             precision    recall  f1-score   support

          0       0.88      0.98      0.93      1029
          1       0.92      0.57      0.70       327

avg / total       0.89      0.88      0.87      1356



(0.9397738446411013, 0.8827433628318584)