In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

In [2]:
# Load the blog corpus from the working directory; the preview below shows the
# columns id, gender, age, topic, sign, date and text.
df = pd.read_csv('blogtext.csv')

In [3]:
# Preview the first rows to check the columns and the raw text.
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [4]:
# (rows, columns) of the full dataset.
df.shape

(681284, 7)

The dataset is too large to process quickly, so we reduce it to a random sample for faster computation.

Taking a random sample of about 3,400 rows (0.5% of the data)

In [7]:
# Draw a 0.5% random sample of the rows. A fixed random_state makes the sample
# (and every downstream count, split and score) reproducible on Restart & Run All.
df_new = df.sample(frac=0.005, random_state=42)

In [8]:
# (rows, columns) of the sampled subset.
df_new.shape

(3406, 7)

Data pre-processing

In [9]:
# Strip every character that is not an ASCII letter, digit or space from the text.
# Series.map replaces DataFrame.applymap (deprecated in pandas 2.1); str(raw) keeps
# the original coercion of non-string cells, and the pattern is compiled once.
non_alnum = re.compile(r"[^0-9A-Za-z ]+")
df_text = df_new['text'].map(lambda raw: non_alnum.sub("", str(raw))).to_frame()

In [10]:
# Preview the cleaned text column.
df_text.head()

Unnamed: 0,text
343424,1 What food do you like that most peopl...
295263,Mood Content Music Phone ringing ...
385392,ORGANIZING I think I need to u...
646772,It was soo weird seeing Canada 72nd on ...
355964,Yesterday moved quickly I spoke to ...


Combining gender, age, topic and sign into a single column: labels

In [11]:
# Build one ", "-separated label string per row from the four attribute columns,
# concatenating left to right: gender, age, topic, sign.
label_parts = [df_new[column].astype(str) for column in ('gender', 'age', 'topic', 'sign')]
df_labels = label_parts[0]
for part in label_parts[1:]:
    df_labels = df_labels + ", " + part

In [12]:
# Wrap the label strings in a DataFrame with a single 'labels' column.
# Note: df_labels is rebound here from the combined Series to a DataFrame.
df_labels = pd.DataFrame(df_labels, columns = ['labels'])

In [13]:
# Attach the labels to the cleaned text; both frames were derived from df_new,
# so they carry the same row index.
df_final = df_text.join(df_labels)

In [14]:
# Preview the combined text/labels frame.
df_final.head()

Unnamed: 0,text,labels
343424,1 What food do you like that most peopl...,"male, 26, Fashion, Cancer"
295263,Mood Content Music Phone ringing ...,"female, 23, Advertising, Cancer"
385392,ORGANIZING I think I need to u...,"female, 26, indUnk, Taurus"
646772,It was soo weird seeing Canada 72nd on ...,"male, 16, Student, Pisces"
355964,Yesterday moved quickly I spoke to ...,"male, 33, indUnk, Taurus"


In [15]:
# (rows, columns) of the combined frame.
df_final.shape

(3406, 2)

In [18]:
# Tally how often each value occurs in every label column, as {value: count} dicts.
genderCount, ageCount, topicCount, signCount = (
    df_new[column].value_counts().to_dict()
    for column in ('gender', 'age', 'topic', 'sign')
)

In [19]:
# Merge the four per-column tallies into one lookup, in gender, age, topic, sign order.
label_counts = {}
for tally in (genderCount, ageCount, topicCount, signCount):
    label_counts.update(tally)

In [20]:
# Display the merged value -> count lookup (age keys are ints, the rest are strings).
label_counts

{'female': 1721,
 'male': 1685,
 17: 429,
 24: 393,
 23: 370,
 25: 364,
 16: 361,
 26: 269,
 27: 226,
 15: 206,
 14: 131,
 34: 110,
 33: 88,
 35: 79,
 13: 64,
 36: 63,
 37: 48,
 38: 33,
 40: 28,
 39: 21,
 43: 21,
 41: 20,
 45: 19,
 47: 17,
 48: 15,
 42: 15,
 46: 9,
 44: 7,
 'indUnk': 1243,
 'Student': 798,
 'Technology': 220,
 'Arts': 168,
 'Education': 145,
 'Communications-Media': 95,
 'Non-Profit': 73,
 'Internet': 69,
 'Engineering': 64,
 'Law': 47,
 'Publishing': 41,
 'Science': 35,
 'Consulting': 32,
 'Government': 30,
 'BusinessServices': 28,
 'Advertising': 24,
 'Marketing': 23,
 'Religion': 23,
 'Accounting': 22,
 'Fashion': 22,
 'HumanResources': 22,
 'Sports-Recreation': 20,
 'Chemicals': 19,
 'Banking': 18,
 'RealEstate': 15,
 'Military': 14,
 'Telecommunications': 12,
 'Manufacturing': 12,
 'Museums-Libraries': 9,
 'Tourism': 8,
 'Transportation': 8,
 'LawEnforcement-Security': 8,
 'Architecture': 8,
 'Biotech': 7,
 'Construction': 6,
 'Automotive': 6,
 'Agriculture': 4,
 

Separating into features and labels

In [21]:
# Features are the cleaned post text; targets are the combined label strings.
X, y = df_final['text'], df_final['labels']

Splitting into train and test

In [22]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Vectorizing the features

In [23]:
# Bag-of-words counts over unigrams and bigrams, dropping English stop words and
# terms that appear in fewer than two training documents.
vect = CountVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    min_df=2,
)
# Learn the vocabulary from the training split only, then encode both splits with it.
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_test_dtm = vect.transform(X_test)

In [24]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new accessor when present and fall back otherwise
# so the cell runs on both old and new releases.
feature_names_getter = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
# Show the last 15 vocabulary entries as plain strings.
print([str(name) for name in feature_names_getter()[-15:]])

['zeal', 'zealand', 'zen', 'zero', 'zillion', 'zines', 'zip', 'zodiac', 'zoe', 'zombie', 'zombies', 'zone', 'zones', 'zoo', 'zoo visit']


Transforming the labels using MultiLabelBinarizer


In [25]:
# Turn each "gender, age, topic, sign" string into a set of individual labels.
# The strings were joined with ", ", so each piece is stripped; a bare split(',')
# leaves a leading space on every label after the first (' 26', ' Fashion', ...).
y_train_new = [{part.strip() for part in labels.split(',')} for labels in y_train]
y_test_new = [{part.strip() for part in labels.split(',')} for labels in y_test]

In [26]:
from sklearn.preprocessing import MultiLabelBinarizer

# Learn the label vocabulary from the training sets, then encode both splits as
# binary indicator matrices. y_train / y_test are rebound to the encoded arrays.
mlb = MultiLabelBinarizer()
mlb.fit(y_train_new)
y_train = mlb.transform(y_train_new)
y_test = mlb.transform(y_test_new)

Using a One-vs-Rest classifier

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# One logistic-regression classifier per label column, wrapped in a one-vs-rest
# meta-estimator. max_iter is set very high so lbfgs is not cut off early.
model = OneVsRestClassifier(
    LogisticRegression(C=1, solver='lbfgs', max_iter=1000000)
)
model.fit(X_train_dtm, y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=1, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None,
                                                 max_iter=1000000,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [29]:
# Predict the binary label indicators for the held-out documents.
y_pred = model.predict(X_test_dtm)

Accuracy score:

In [53]:
# NOTE(review): with multilabel indicator targets this is presumably exact-match
# (subset) accuracy, i.e. all labels of a row must be right — confirm against the
# scikit-learn docs; that would explain the very low value.
metrics.accuracy_score(y_test, y_pred)

0.002932551319648094

F1 score:

In [54]:
# average='samples': presumably F1 is computed per test row over its label set and
# then averaged across rows — confirm against the scikit-learn docs.
metrics.f1_score(y_test, y_pred, average = 'samples')

0.267867616254713

Precision score:

In [55]:
# Precision with the same average='samples' setting as the F1 score.
metrics.precision_score(y_test, y_pred, average='samples')

0.5035679374389052

Recall score:

In [56]:
# Recall with the same average='samples' setting as the F1 score.
metrics.recall_score(y_test, y_pred, average = 'samples')

0.1935483870967742

In [63]:
# Map the binary indicator rows back to label tuples and print a few test examples.
# The predictions live in y_pred; the previous name y_pred_class is never defined
# in this notebook and raises NameError on a fresh kernel.
y_test_pred_inversed = mlb.inverse_transform(y_pred)
y_test_inversed = mlb.inverse_transform(y_test)
for i in range(350,355):
    print('Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        X_test.iloc[i],
        ','.join(y_test_inversed[i]),
        ','.join(y_test_pred_inversed[i])
    ))

Title:	       Somehow I neglected to mention the phone call I got a couple of weeks ago just after Id begun to see what a jackass my commander is It was from the FBI I filled out an online background form and the recruiter in Albany called to arrange for me to take the Special Agents Exam Which is June 23 at least a week after we leave  So as soon as we get back Ill finish that process She said theyd already done a preliminary background check based on the internet thing and that my military and intelligence experience make me more or less a shooin  Im already making plans for dropping my paperwork Its best to do it from over there when there will be too many distractions for anyone to take much notice If I drop the separation paperwork now it will earn me a great deal of animosity   But you know after last week I dont know that I  care  What are you gonna do to me stamp my mealcard No Dessert Send me to Iraq   Maybe this deployment is exactly what needed to happen give me the impetus 