In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

import warnings
warnings.filterwarnings('ignore')

In [3]:
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords

nltk.download('punkt')
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
# count vectorizer is for bag of words, tfid vectorizer is for tfidf

from string import punctuation

nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [4]:
from sklearn.model_selection import train_test_split
from collections import Counter

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score,classification_report
from sklearn.multiclass import OneVsRestClassifier


## **Step 1. Import and analyse the data set**

In [5]:
df_temp=pd.read_csv('/content/drive/MyDrive/Mod 10- NLP/Project-1/PART1/blogtext.csv')

In [6]:
df=df_temp.iloc[0:10000,]

In [7]:
df.head(7)

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...
5,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",I had an interesting conversation...
6,3581210,male,33,InvestmentBanking,Aquarius,"10,June,2004",Somehow Coca-Cola has a way of su...


In [None]:
df.shape

(10000, 7)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      10000 non-null  int64 
 1   gender  10000 non-null  object
 2   age     10000 non-null  int64 
 3   topic   10000 non-null  object
 4   sign    10000 non-null  object
 5   date    10000 non-null  object
 6   text    10000 non-null  object
dtypes: int64(2), object(5)
memory usage: 547.0+ KB


In [None]:
df.isnull().sum()

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

In [None]:
df.isna().sum()

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

In [None]:
df['id'].nunique()

214

In [None]:
df['id'].value_counts()

589736     2294
883178     1616
2821801     605
1103575     558
766556      532
           ... 
4116577       1
3544864       1
3337329       1
3688178       1
4198080       1
Name: id, Length: 214, dtype: int64

In [None]:
df['topic'].nunique()

26

In [None]:
df['topic'].value_counts()

indUnk                     3287
Technology                 2654
Fashion                    1622
Student                    1137
Education                   270
Marketing                   156
Engineering                 127
Internet                    118
Communications-Media         99
BusinessServices             91
Sports-Recreation            80
Non-Profit                   71
InvestmentBanking            70
Science                      63
Arts                         45
Consulting                   21
Museums-Libraries            17
Banking                      16
Automotive                   14
Law                          11
LawEnforcement-Security      10
Religion                      9
Accounting                    4
Publishing                    4
Telecommunications            2
HumanResources                2
Name: topic, dtype: int64

## **Step 2: Perform data pre-processing on the data**

## **Step 2.1: Data cleansing by removing unwanted characters, spaces, stop words etc. Convert text to lowercase.**

## **Removing special characters from the dataset 'text' using the 're" library**

In [8]:
df['text']=df['text'].apply(lambda x : re.sub('[@,.,^,$,*,?,\,/,\n,\t,<,>,&,:,\(,\),+,\-,!,+,-,\']','',x))

In [None]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004",Info has been found 100 pages and ...
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members Drewes...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoos Toolbar I can no...


# **Converting every character to lower case characters and striping spaces from each feature**

In [9]:
for col in df.columns:
    temp=df[col]
    if temp.dtype == object:
        df[col]=df[col].apply(lambda x : x.lower())
        df[col]=df[col].apply(lambda x : x.strip())

In [None]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,student,leo,"14,may,2004",info has been found 100 pages and 45 mb of pd...
1,2059027,male,15,student,leo,"13,may,2004",these are the team members drewes van der la...
2,2059027,male,15,student,leo,"12,may,2004",in het kader van kernfusie op aarde maak je e...
3,2059027,male,15,student,leo,"12,may,2004",testing testing
4,3581210,male,33,investmentbanking,aquarius,"11,june,2004",thanks to yahoos toolbar i can now capture the...


## **Tokenizing the Text data**

In [10]:
stop_words = stopwords.words('english') + list(punctuation)
 
def tokenize(text):
    words = word_tokenize(text)
    words = [w.lower() for w in words]
    #return [w for w in words if w not in stop_words and not w.isdigit()]
    return [w for w in words if w not in stop_words]

In [11]:
df['Processed_text']=df['text'].apply(lambda x: tokenize(x))

In [None]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,Processed_text
0,2059027,male,15,student,leo,"14,may,2004",info has been found 100 pages and 45 mb of pd...,"[info, found, 100, pages, 45, mb, pdf, files, ..."
1,2059027,male,15,student,leo,"13,may,2004",these are the team members drewes van der la...,"[team, members, drewes, van, der, laag, urllin..."
2,2059027,male,15,student,leo,"12,may,2004",in het kader van kernfusie op aarde maak je e...,"[het, kader, van, kernfusie, op, aarde, maak, ..."
3,2059027,male,15,student,leo,"12,may,2004",testing testing,"[testing, testing]"
4,3581210,male,33,investmentbanking,aquarius,"11,june,2004",thanks to yahoos toolbar i can now capture the...,"[thanks, yahoos, toolbar, capture, urls, popup..."


In [12]:
for i in range(0,df.shape[0]):
    df['Processed_text'][i]=' '.join(df['Processed_text'][i])

In [13]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,Processed_text
0,2059027,male,15,student,leo,"14,may,2004",info has been found 100 pages and 45 mb of pd...,info found 100 pages 45 mb pdf files wait unti...
1,2059027,male,15,student,leo,"13,may,2004",these are the team members drewes van der la...,team members drewes van der laag urllink mail ...
2,2059027,male,15,student,leo,"12,may,2004",in het kader van kernfusie op aarde maak je e...,het kader van kernfusie op aarde maak je eigen...
3,2059027,male,15,student,leo,"12,may,2004",testing testing,testing testing
4,3581210,male,33,investmentbanking,aquarius,"11,june,2004",thanks to yahoos toolbar i can now capture the...,thanks yahoos toolbar capture urls popupswhich...


## **Performing Lemmatization**

In [14]:
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()
df['Processed_text'] = df['Processed_text'].apply(lambda x: ' '.join([lemma.lemmatize(x) for x in x.split()]))

In [None]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,Processed_text
0,2059027,male,15,student,leo,"14,may,2004",info has been found 100 pages and 45 mb of pd...,info found 100 page 45 mb pdf file wait untill...
1,2059027,male,15,student,leo,"13,may,2004",these are the team members drewes van der la...,team member drewes van der laag urllink mail r...
2,2059027,male,15,student,leo,"12,may,2004",in het kader van kernfusie op aarde maak je e...,het kader van kernfusie op aarde maak je eigen...
3,2059027,male,15,student,leo,"12,may,2004",testing testing,testing testing
4,3581210,male,33,investmentbanking,aquarius,"11,june,2004",thanks to yahoos toolbar i can now capture the...,thanks yahoo toolbar capture url popupswhich m...


Dropping Date from the original dataset

In [15]:
df.drop('date',inplace=True,axis=1)

## **Step 2.2: Target/label merger and transformation**

Creating new feature in the original dataset with the name labels, This will contain the list generated by concatenating the follwoing four featurs Gender, Age, Topic,Sign

In [16]:
 # Age is Numeric , Converting it to str. This is required when leveraging  Multibinarizer
df['age']=df['age'].astype('str')

In [17]:
df['labels']=' '

In [18]:
for i in range(0,df.shape[0]):
    new_label=[]
    new_label.append(df['gender'][i])
    new_label.append(df['age'][i])
    new_label.append(df['topic'][i])
    new_label.append(df['sign'][i])
    df['labels'][i]=new_label

In [None]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,text,Processed_text,labels
0,2059027,male,15,student,leo,info has been found 100 pages and 45 mb of pd...,info found 100 page 45 mb pdf file wait untill...,"[male, 15, student, leo]"
1,2059027,male,15,student,leo,these are the team members drewes van der la...,team member drewes van der laag urllink mail r...,"[male, 15, student, leo]"
2,2059027,male,15,student,leo,in het kader van kernfusie op aarde maak je e...,het kader van kernfusie op aarde maak je eigen...,"[male, 15, student, leo]"
3,2059027,male,15,student,leo,testing testing,testing testing,"[male, 15, student, leo]"
4,3581210,male,33,investmentbanking,aquarius,thanks to yahoos toolbar i can now capture the...,thanks yahoo toolbar capture url popupswhich m...,"[male, 33, investmentbanking, aquarius]"


In [19]:
df.drop(['id','gender','age','topic','sign'],inplace=True,axis=1)

In [20]:
df.head()

Unnamed: 0,text,Processed_text,labels
0,info has been found 100 pages and 45 mb of pd...,info found 100 page 45 mb pdf file wait untill...,"[male, 15, student, leo]"
1,these are the team members drewes van der la...,team member drewes van der laag urllink mail r...,"[male, 15, student, leo]"
2,in het kader van kernfusie op aarde maak je e...,het kader van kernfusie op aarde maak je eigen...,"[male, 15, student, leo]"
3,testing testing,testing testing,"[male, 15, student, leo]"
4,thanks to yahoos toolbar i can now capture the...,thanks yahoo toolbar capture url popupswhich m...,"[male, 33, investmentbanking, aquarius]"


This data is highly imbalanced. Lets create a dictionary to get label counts


## **We create dictionary to get frequency count of the unique label data.**

In [None]:
new_dict=dict()
gender=[]
age=[]
occ=[]
sign=[]

In [None]:
for item in df['labels']:
    i=0
    for value in item:
        if i==0:
            gender.append(value)
        if i==1:
            age.append(value)
        if i==2:
            occ.append(value)
        if i==3:
            sign.append(value)
        i+=1

In [None]:
dict_age=Counter(age)
dict_gender=Counter(gender)
dict_occ=Counter(occ)
dict_sign=Counter(sign)

In [None]:
dict_gender

Counter({'female': 4084, 'male': 5916})

In [None]:
dict_occ

Counter({'accounting': 4,
         'arts': 45,
         'automotive': 14,
         'banking': 16,
         'businessservices': 91,
         'communications-media': 99,
         'consulting': 21,
         'education': 270,
         'engineering': 127,
         'fashion': 1622,
         'humanresources': 2,
         'indunk': 3287,
         'internet': 118,
         'investmentbanking': 70,
         'law': 11,
         'lawenforcement-security': 10,
         'marketing': 156,
         'museums-libraries': 17,
         'non-profit': 71,
         'publishing': 4,
         'religion': 9,
         'science': 63,
         'sports-recreation': 80,
         'student': 1137,
         'technology': 2654,
         'telecommunications': 2})

In [None]:
dict_sign

Counter({'aquarius': 571,
         'aries': 4198,
         'cancer': 504,
         'capricorn': 215,
         'gemini': 150,
         'leo': 301,
         'libra': 491,
         'pisces': 454,
         'sagittarius': 1097,
         'scorpio': 971,
         'taurus': 812,
         'virgo': 236})

In [None]:
def merge_two_dicts(a, b, c, d):
    z = a.copy()   # start with x's keys and values
    z.update(b)    # modifies z with y's keys and values & returns None
    z.update(c)
    z.update(d)
    return z

In [None]:
Final_dict=merge_two_dicts(dict_age, dict_gender, dict_occ, dict_sign)

In [None]:
Final_dict

Counter({'13': 42,
         '14': 212,
         '15': 602,
         '16': 440,
         '17': 1185,
         '23': 253,
         '24': 655,
         '25': 386,
         '26': 234,
         '27': 1054,
         '33': 136,
         '34': 553,
         '35': 2315,
         '36': 1708,
         '37': 33,
         '38': 46,
         '39': 79,
         '40': 1,
         '41': 20,
         '42': 14,
         '43': 6,
         '44': 3,
         '45': 16,
         '46': 7,
         'accounting': 4,
         'aquarius': 571,
         'aries': 4198,
         'arts': 45,
         'automotive': 14,
         'banking': 16,
         'businessservices': 91,
         'cancer': 504,
         'capricorn': 215,
         'communications-media': 99,
         'consulting': 21,
         'education': 270,
         'engineering': 127,
         'fashion': 1622,
         'female': 4084,
         'gemini': 150,
         'humanresources': 2,
         'indunk': 3287,
         'internet': 118,
         'investmentban

## **Step 2.3: Train and test split**

In [21]:
X=df['Processed_text']
y=df['labels']

In [None]:
X

0       info found 100 page 45 mb pdf file wait untill...
1       team member drewes van der laag urllink mail r...
2       het kader van kernfusie op aarde maak je eigen...
3                                         testing testing
4       thanks yahoo toolbar capture url popupswhich m...
                              ...                        
9995    take home forever may rest sleep arm forgotten...
9997    kind need holding hand petting hair cry bring ...
9998    blurry outside sound people mingle pas darknes...
9999    body feel broken mind rejoices thought warmth ...
Name: Processed_text, Length: 10000, dtype: object

In [None]:
y

0                      [male, 15, student, leo]
1                      [male, 15, student, leo]
2                      [male, 15, student, leo]
3                      [male, 15, student, leo]
4       [male, 33, investmentbanking, aquarius]
                         ...                   
9995               [female, 25, indunk, pisces]
9996               [female, 25, indunk, pisces]
9997               [female, 25, indunk, pisces]
9998               [female, 25, indunk, pisces]
9999               [female, 25, indunk, pisces]
Name: labels, Length: 10000, dtype: object

In [22]:
X_train,X_test,y_train,y_test=train_test_split(X,y)

In [None]:
X_train.sample(5)

631     think im good mood let workin week 9 5 money w...
4478    day better think thing rough find someone else...
9348    wellyankees lost __ pitiful oh wellthere alway...
6520    personal drama died bit since last friday thou...
78      ive fallen deep fast dont know know feel good ...
Name: Processed_text, dtype: object

In [None]:
X_test.sample(5)

9080    little green hill could see nothing pain dark ...
3013    yethes havin rely ol angie mcmoneybags meal sa...
974     case anyone wondering one probably read anyway...
9912    mind creates world found quote buddha looking ...
3475    thought might enjoy urllink alien language con...
Name: Processed_text, dtype: object

## **Step 2.4: Vectorisation**

In [23]:
vect=CountVectorizer(ngram_range=(1,2))
#vect=TfidfVectorizer(n_gram_range=(1,2))

In [24]:
X_train_dtm=vect.fit_transform(X_train)

In [25]:
X_test_dtm=vect.transform(X_test)

In [26]:
X_train_dtm.shape

(7500, 520302)

In [27]:
X_test_dtm.shape

(2500, 520302)

## **Implementing MultiLabelBinarizer to create binary labels from the list in the labels dataset**

In [None]:
y_train

5624             [female, 27, indunk, taurus]
2301            [male, 35, technology, aries]
3959    [female, 25, businessservices, aries]
9838     [female, 23, museums-libraries, leo]
6040             [female, 27, indunk, taurus]
                        ...                  
3063            [male, 35, technology, aries]
5799             [female, 27, indunk, taurus]
3942             [female, 35, indunk, gemini]
8035            [female, 15, student, pisces]
4323        [female, 34, indunk, sagittarius]
Name: labels, Length: 7500, dtype: object

In [28]:
mlb=MultiLabelBinarizer()
y_train_mlb=mlb.fit_transform(y_train)
y_test_mlb=mlb.transform(y_test)

In [29]:
y_train_mlb

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
y_train_mlb.shape

(7500, 64)

## **Below are the unique labels in the labels features in 10000 dataset**

In [None]:
mlb.classes_

array(['13', '14', '15', '16', '17', '23', '24', '25', '26', '27', '33',
       '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44',
       '45', '46', 'accounting', 'aquarius', 'aries', 'arts',
       'automotive', 'banking', 'businessservices', 'cancer', 'capricorn',
       'communications-media', 'consulting', 'education', 'engineering',
       'fashion', 'female', 'gemini', 'humanresources', 'indunk',
       'internet', 'investmentbanking', 'law', 'lawenforcement-security',
       'leo', 'libra', 'male', 'marketing', 'museums-libraries',
       'non-profit', 'pisces', 'publishing', 'religion', 'sagittarius',
       'science', 'scorpio', 'sports-recreation', 'student', 'taurus',
       'technology', 'telecommunications', 'virgo'], dtype=object)

In [None]:
X_train_dtm.shape

(7500, 519735)

In [None]:
y_train_mlb.shape

(7500, 64)

## **Step 3 :Design, train, tune and test the best text classifier.**

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,recall_score,precision_score,f1_score

We will use the OneVsRestClassifier.Also known as one-vs-all, this strategy consists in fitting one classifier per class

In [None]:
LogReg_pipeline=Pipeline([('clf',OneVsRestClassifier(LogisticRegression(solver='lbfgs')))])

We have tried to tune the model for different values of hyperparameters like solver :‘newton-cg’, ‘saga’ and ‘lbfgs’and C:Inverse of regularization strength.
However, Below hyperparameter values give the best results

In [None]:
LogReg_pipeline.fit(X_train_dtm,y_train_mlb)

Pipeline(memory=None,
         steps=[('clf',
                 OneVsRestClassifier(estimator=LogisticRegression(C=1.0,
                                                                  class_weight=None,
                                                                  dual=False,
                                                                  fit_intercept=True,
                                                                  intercept_scaling=1,
                                                                  l1_ratio=None,
                                                                  max_iter=100,
                                                                  multi_class='auto',
                                                                  n_jobs=None,
                                                                  penalty='l2',
                                                                  random_state=None,
                                                      

In [None]:
prediction = LogReg_pipeline.predict(X_test_dtm)

In [None]:
accuracy_score(y_test_mlb,prediction)

0.306

In [None]:
#  Printing inverse data from the predicted labels
mlb.inverse_transform(prediction[0:5,])

[('16', 'cancer', 'indunk', 'male'),
 ('35', 'aries', 'male', 'technology'),
 ('36', 'aries', 'fashion', 'male'),
 ('35', 'aries', 'male', 'technology'),
 ('34', 'female', 'indunk', 'sagittarius')]

In [None]:
y_test.head(5)

9467           [male, 16, indunk, cancer]
3228        [male, 35, technology, aries]
6518           [male, 36, fashion, aries]
9912           [male, 23, student, aries]
4430    [female, 34, indunk, sagittarius]
Name: labels, dtype: object

In [None]:
recall_score(y_test_mlb,prediction,average='micro')

0.5348139255702281

In [None]:
recall_score(y_test_mlb,prediction,average='macro')

0.1926537265498123

In [None]:
precision_score(y_test_mlb,prediction,average='micro')

0.7687661777394306

In [None]:
precision_score(y_test_mlb,prediction,average='macro')

0.5084594401012175

In [None]:
f1_score(y_test_mlb,prediction,average='micro')

0.6307964601769911

In [None]:
f1_score(y_test_mlb,prediction,average='macro')

0.25431032319561747

## **Step 4: Display and explain detail the classification report**

In [None]:
print(classification_report(y_test_mlb,prediction))

              precision    recall  f1-score   support

           0       1.00      0.15      0.27        13
           1       1.00      0.13      0.23        54
           2       0.65      0.21      0.32       142
           3       0.71      0.17      0.27       121
           4       0.74      0.29      0.42       299
           5       0.00      0.00      0.00        66
           6       0.54      0.09      0.16       149
           7       0.67      0.12      0.20        84
           8       0.75      0.05      0.10        59
           9       0.75      0.33      0.46       245
          10       1.00      0.27      0.43        37
          11       0.97      0.63      0.76       124
          12       0.68      0.65      0.66       603
          13       0.91      0.51      0.65       452
          14       0.00      0.00      0.00        12
          15       0.50      0.11      0.18         9
          16       0.50      0.12      0.20        16
          17       0.00    

We see that y_train has 64 classes as follows:
 ['13', '14', '15', '16', '17', '23', '24', '25', '26', '27', '33',
       '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44',
       '45', '46', 'aquarius', 'aries', 'arts', 'automotive', 'banking',
       'businessservices', 'cancer', 'capricorn', 'communications-media',
       'consulting', 'education', 'engineering', 'fashion', 'female',
       'gemini', 'humanresources', 'indunk', 'internet',
       'investmentbanking', 'law', 'lawenforcement-security', 'leo',
       'libra', 'male', 'marketing', 'museums-libraries', 'non-profit',
       'pisces', 'publishing', 'religion', 'sagittarius', 'science',
       'scorpio', 'sports-recreation', 'student', 'taurus', 'technology',
       'telecommunications', 'virgo']

Classification report gives Precision,Recall ,f1-score and support for each of these classes separately.


*   We see that Precision for some classes like Age-13,14,Virgo,Sports-Recreation is 100% while some classes like telecommunications,religion is 0%. Mostly classes have a high precision greater than 50%. It means that from the total predictions our model has made, approx 50% are correct.
*   Recall for most classes is low. Some classes  like telecommunications,religion,publishing have 0% recall.So, from actual positive classes, Our model was able to find less than 50% of them.


*   Upon closer inspection we see that few classes have both recall and precision 0%. Hence their f1-score is also 0. These are the minority classes in our dataset.
* Also, their support values is very low - a single digit number.
*   Our data was highly imbalanced. Hence the model was biased in predicting the majority classes correctly. The minority classes were under-represented and could not be correclty predicted by our model. Such classes have both Recall and precion as 0.
* But the classes having higher support, have good precision values. Classes having very high support(>600), the majority classes, have good recall as well.

*   f1-score is the harmonic mean of recall and precion and gives values accordingly.











### Support is the number of actual occurrences of the class in the specified dataset. Imbalanced support in the training data may indicate structural weaknesses in the reported scores of the classifier and could indicate the need for stratified sampling or rebalancing.

# **Step 5: Print the true vs predicted labels for any 5 entries from the dataset.**

In [None]:
mlb.inverse_transform(prediction[0:5])

[('16', 'cancer', 'indunk', 'male'),
 ('35', 'aries', 'male', 'technology'),
 ('36', 'aries', 'fashion', 'male'),
 ('35', 'aries', 'male', 'technology'),
 ('34', 'female', 'indunk', 'sagittarius')]

In [None]:
y_test[0:5]

9467           [male, 16, indunk, cancer]
3228        [male, 35, technology, aries]
6518           [male, 36, fashion, aries]
9912           [male, 23, student, aries]
4430    [female, 34, indunk, sagittarius]
Name: labels, dtype: object

## **Decision Tree Classifier**

In [31]:
from sklearn import tree

DT_model= tree.DecisionTreeClassifier(random_state=1)

In [None]:
model2=OneVsRestClassifier(DT_model).fit(X_train_dtm,y_train_mlb)

In [None]:
prediction = model2.predict(X_test_dtm)

In [None]:
accuracy_score(y_test_mlb,prediction)

0.1552

In [None]:
recall_score(y_test_mlb,prediction,average='micro'),recall_score(y_test_mlb,prediction,average='macro')

(0.4953, 0.2839173110631841)

In [None]:
precision_score(y_test_mlb,prediction,average='micro'),precision_score(y_test_mlb,prediction,average='macro')

(0.6067622197721426, 0.4295018131835531)

In [None]:
f1_score(y_test_mlb,prediction,average='micro'),f1_score(y_test_mlb,prediction,average='macro')

(0.5453944832902055, 0.335827368403793)

## **Tuning the model**

In [32]:
from sklearn.model_selection import GridSearchCV
param_grid = {'max_depth':[1,2,3,4,5],
              'min_samples_leaf':[1,2,3,4,5],
              'min_samples_split':[1,2,3,4,5,6]}
grid = GridSearchCV(DT_model, param_grid)


In [1]:
model3=OneVsRestClassifier(grid).fit(X_train_dtm,y_train_mlb)

In [None]:
y_predict_Grid = model3.predict(X_test_dtm)

In [None]:
accuracy_score(y_test_mlb,y_predict_Grid)

In [None]:
recall_score(y_test_mlb,prediction,average='micro'),recall_score(y_test_mlb,prediction,average='macro')

In [None]:
precision_score(y_test_mlb,prediction,average='micro'),precision_score(y_test_mlb,prediction,average='macro')

In [None]:
f1_score(y_test_mlb,prediction,average='micro'),f1_score(y_test_mlb,prediction,average='macro')

In [None]:
mlb.inverse_transform(prediction[0:5])

In [None]:
y_test[0:5]

Tuning this model could not be completed due to insufficient RAM, which was causing session to repeatedly crash.