Importing necessary libraries

In [73]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn warnings (deprecations,
# chained-assignment notices) that may point at real problems - confirm intended.
filterwarnings('ignore')

In [74]:
# Load the blog corpus from the CSV in the working directory and preview
# the first five rows.
df = pd.read_csv("blogtext.csv")
df.head(5)

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [75]:
# Report the dimensions of the loaded frame.
print('Number of rows is:', len(df))
print('Number of colums is:', len(df.columns))

Number of rows is: 681284
Number of colums is: 7


Because the dataset has a very large number of rows and our laptops have limited computational power, we keep only the first 5,000 rows.

In [76]:
# Keep only the first 5000 rows to limit memory and compute cost.
# .copy() makes the subset an independent frame, so the column assignments
# performed on `df` afterwards do not write into a slice of the full dataset
# (which is what pandas' SettingWithCopyWarning complains about).
df = df.head(5000).copy()
print('Number of rows is:',df.shape[0])
print('Number of colums is:',df.shape[1])

Number of rows is: 5000
Number of colums is: 7


# Data Preprocessing 

Importing necessary packages for data preprocessing

In [77]:
import re
from nltk.corpus import stopwords

In [78]:
# Replace every run of non-letter characters with a single space, then trim
# leading/trailing whitespace.
df['text'] = df['text'].apply(lambda x: re.sub("[^A-Za-z]+", " ", x))
df['text'] = df['text'].apply(lambda x: x.strip())

# Build the stop-word set under its own name. Rebinding `stopwords` would
# replace the imported nltk corpus reader with a plain set, so running this
# cell a second time would fail on `stopwords.words(...)`.
stop_words = set(stopwords.words("english"))

# Drop stop words token by token.
# NOTE(review): the membership test is case-sensitive, so capitalised forms
# such as "The" or "I" are kept - confirm whether that is intended.
df["text"] = df["text"].apply(
    lambda x: " ".join([word for word in x.split() if word not in stop_words])
)

In [79]:
# Spot-check one cleaned post (row label 9) to confirm that special
# characters have been stripped from the text.
df.loc[9, "text"]

'I surf English news sites lot looking tidbits Korea foreigners like view Hermit Kingdom also way keep fast moving place Sometimes though one needs check veracity figures put papers especially local ones Here two examples English version Korea Times JoongAng Ilbo Daily The first pretty straightforward urlLink Korea Times said people arrested forging Korean passports urlLink JoongAng Ilbo says accused Huh Another one urlLink JoongAng Ilbo said S P positive Korean banks good thing urlLink Korea Times said S P tad worried bad loans banks extended small medium sized firms I idea simple facts seem presented differently simply translation'

In [80]:
# Combine the four target attributes of each post into one list of string
# labels: [gender, age (as text), topic, sign].
df["label"] = pd.Series(
    [
        [gender, str(age), topic, sign]
        for gender, age, topic, sign in zip(
            df["gender"], df["age"], df["topic"], df["sign"]
        )
    ],
    index=df.index,
)

In [81]:
# Keep only the model input (text) and the combined target (label).
df = df.loc[:, ["text", "label"]]

In [82]:
# Preview the first five rows of the reduced frame.
df.head(n=5)

Unnamed: 0,text,label
0,Info found pages MB pdf files Now wait untill ...,"[male, 15, Student, Leo]"
1,These team members Drewes van der Laag urlLink...,"[male, 15, Student, Leo]"
2,In het kader van kernfusie op aarde MAAK JE EI...,"[male, 15, Student, Leo]"
3,testing testing,"[male, 15, Student, Leo]"
4,Thanks Yahoo Toolbar I capture URLs popups mea...,"[male, 33, InvestmentBanking, Aquarius]"


# Splitting the dataset into Train and Test

In [83]:
from sklearn.model_selection import train_test_split

In [84]:
# Features are the cleaned post text; targets are the per-post label lists.
x, y = df["text"], df["label"]

In [85]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    x.to_numpy(),
    y.to_numpy(),
    test_size=0.30,
    random_state=42,
)
# One shape per line, in the order: xtrain, xtest, ytrain, ytest.
print(xtrain.shape, xtest.shape, ytrain.shape, ytest.shape, sep="\n")

(3500,)
(1500,)
(3500,)
(1500,)


After the train/test split, we convert the text into numeric vectors, because machine-learning models operate on numbers rather than raw text.

In [86]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

In [87]:
# NOTE: despite the variable name, `tfid` is a CountVectorizer (binary
# presence/absence of unigrams and bigrams), not a TF-IDF vectorizer.
tfid = CountVectorizer(ngram_range=(1, 2), binary=True)

# Learn the vocabulary from the training texts and encode them.
xtrain_tf = tfid.fit_transform(raw_documents=xtrain)

In [88]:
# Show the encoded vector of the second training document.
# Slice the row before densifying: calling toarray() on the whole matrix
# allocates a full (documents x vocabulary) dense array just to look at one row.
xtrain_tf[1].toarray()[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [89]:
# Show the first 15 vocabulary entries (unigrams and bigrams).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
# str() keeps the display a plain list of Python strings in both cases.
[
    str(name)
    for name in (
        tfid.get_feature_names_out()
        if hasattr(tfid, "get_feature_names_out")
        else tfid.get_feature_names()
    )[:15]
]

['aa',
 'aa amazing',
 'aa compared',
 'aaa',
 'aaa come',
 'aaa someone',
 'aaa tow',
 'aaaaaaah',
 'aaaaaaah fade',
 'aaaaack',
 'aaaah',
 'aaaah wisdom',
 'aaaahh',
 'aaaarrrgghhhh',
 'aaaarrrgghhhh slightly']

In [90]:
# Encode the test texts with the vocabulary already learned by `tfid`
# (transform only - the vectorizer is not refitted here).
xtest_tf = tfid.transform(raw_documents=xtest)

In [91]:
# Tally how often each individual label value occurs across all rows of `y`.
# Keys appear in first-seen order.
label_count = {}
for labels in y:
    for label in labels:
        label_count[label] = label_count.get(label, 0) + 1

In [92]:
# Display the mapping from each label value to its number of occurrences.
label_count

{'male': 3294,
 '15': 339,
 'Student': 569,
 'Leo': 190,
 '33': 101,
 'InvestmentBanking': 70,
 'Aquarius': 329,
 'female': 1706,
 '14': 170,
 'indUnk': 1381,
 'Aries': 2483,
 '25': 268,
 'Capricorn': 84,
 '17': 331,
 'Gemini': 86,
 '23': 137,
 'Non-Profit': 47,
 'Cancer': 94,
 'Banking': 16,
 '37': 19,
 'Sagittarius': 704,
 '26': 96,
 '24': 353,
 'Scorpio': 408,
 '27': 86,
 'Education': 118,
 '45': 14,
 'Engineering': 119,
 'Libra': 414,
 'Science': 33,
 '34': 540,
 '41': 14,
 'Communications-Media': 61,
 'BusinessServices': 87,
 'Sports-Recreation': 75,
 'Virgo': 41,
 'Taurus': 100,
 'Arts': 31,
 'Pisces': 67,
 '44': 3,
 '16': 67,
 'Internet': 20,
 'Museums-Libraries': 2,
 'Accounting': 2,
 '39': 79,
 '35': 2307,
 'Technology': 2332,
 '36': 60,
 'Law': 3,
 '46': 7,
 'Consulting': 16,
 'Automotive': 14,
 '42': 9,
 'Religion': 4}

# Model Building

In [93]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer

In [94]:
# Fix the column order up front so that train and test indicator matrices
# share the same layout (one column per label value, sorted by name).
mlb = MultiLabelBinarizer(classes=sorted(label_count.keys()))

# Fit the binarizer once, on the training labels.
y_train = mlb.fit_transform(ytrain)

# Only transform the test labels: the held-out split must be encoded with the
# already-fitted binarizer, never used to refit it.
y_test = mlb.transform(ytest)

In [95]:
# Display the binarized training labels (one row per training post).
y_train

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])

In [96]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [97]:
# Base binary classifier; OneVsRestClassifier wraps it so that a separate
# model is fitted for each label column.
lr = LogisticRegression(solver="lbfgs")
ovs = OneVsRestClassifier(estimator=lr)

In [98]:
# Train the one-vs-rest model on the encoded training texts and their
# binarized labels.
ovs.fit(X=xtrain_tf, y=y_train)

OneVsRestClassifier(estimator=LogisticRegression())

In [99]:
# Continuous per-label decision values for the test set ...
predicted_score = ovs.decision_function(X=xtest_tf)
# ... and the corresponding hard label predictions.
predicted_label = ovs.predict(X=xtest_tf)

In [100]:
# Map both indicator matrices back to tuples of label names so predictions
# and ground truth can be compared in readable form.
inverse_pred, inverse_ytest = (
    mlb.inverse_transform(indicator_matrix)
    for indicator_matrix in (predicted_label, y_test)
)

In [101]:
# Print the first five test posts next to their true and predicted labels.
for idx in range(5):
    post_text = xtest[idx]
    true_labels = ','.join(inverse_ytest[idx])
    pred_labels = ','.join(inverse_pred[idx])
    print(
        'Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
            post_text, true_labels, pred_labels
        )
    )

Title:	beautiful
True labels:	35,Aries,Technology,male
Predicted labels:	35,Aries,Technology,male


Title:	good idea johnathan quite tasty
True labels:	35,Aries,Technology,male
Predicted labels:	35,Aries,Technology,male


Title:	I house right
True labels:	35,Aries,Technology,male
Predicted labels:	35,Aries,Technology,male


Title:	Sorry Im person I If I could I wouldnt Too Bad Already I went Harry today Then went sledding quiet awhile We ran Tom Scott went sledding Then got cold hungry decided head back Then goofed ate dinner I realized I needed along time ago Why I way I Why I one stuck depressed time I odd one whole vegetarian scheme If I normal like every ska bunctious kid maybe I wouldnt problem Thats I A problem While im deep thought im going go listen Early November think stuff Nights long Hope never come
True labels:	15,Libra,Student,female
Predicted labels:	female


Title:	There baby named Bo Jangles When born parents told blind Bo blind The doctors foreign believed blind reall

In [102]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

def print_evaluation_scores(y_val, predicted, scores=None):
    """Print micro-averaged evaluation metrics for multi-label predictions.

    Parameters
    ----------
    y_val : array-like
        True labels; passed as the first argument to every metric.
    predicted : array-like
        Hard label predictions compared against ``y_val``.
    scores : array-like, optional
        Continuous per-label scores (for example the output of a classifier's
        ``decision_function``) used for the average-precision metric. When
        omitted, ``predicted`` is used instead, which matches the behaviour of
        the two-argument call.

    Returns
    -------
    None
        The four metrics are written to stdout.
    """
    # Average precision is a ranking metric: thresholded predictions collapse
    # the ranking to two levels, so use real scores whenever the caller has them.
    ap_input = predicted if scores is None else scores

    print('Accuracy score: ', accuracy_score(y_val, predicted))
    print('F1 score: ', f1_score(y_val, predicted, average='micro'))
    print('Average precision score: ', average_precision_score(y_val, ap_input, average='micro'))
    print('Average recall score: ', recall_score(y_val, predicted, average='micro'))

In [103]:
# Report the metrics of the bag-of-words model on the held-out test split.
print("Bag-of-words")
print_evaluation_scores(y_val=y_test, predicted=predicted_label)

Bag-of-words
Accuracy score:  0.5013333333333333
F1 score:  0.7203310137295468
Average precision score:  0.5543724857068261
Average recall score:  0.6383333333333333
