# Statistical NLP - Author Features Prediction

#### 1.Load the dataset (5 points)
a.Tip: As the dataset is large, use fewer rows. Check what is working well on your machine and decide accordingly.

#### Read the csv using pandas

In [51]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

import warnings
warnings.filterwarnings('ignore')

In [52]:
# Load the blog corpus from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="blogtext.csv")
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


#### Get the names of the columns

In [53]:
# Show the column labels of the loaded frame.
df.columns

Index(['id', 'gender', 'age', 'topic', 'sign', 'date', 'text'], dtype='object')

#### Check if there is any null value, and get the total count.

In [54]:
# Count missing entries per column (isna is the same check as isnull).
df.isna().sum()

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

### Cutting the data 

In [55]:
# Keep only the first 5,000 rows so preprocessing and training stay tractable.
# head() hands back a slice of the full frame; .copy() makes it an independent
# frame so the column assignments in the following cells do not write into a
# slice (the situation pandas reports as SettingWithCopyWarning).
df = df.head(5000).copy()

### 2.Preprocess rows of the “text” column (7.5 points)
a.Remove unwanted characters

b.Convert text to lowercase

c.Remove unwanted spaces

d.Remove stopwords

In [56]:
# a. Replace every run of non-letter characters with a single space.
non_letters = re.compile('[^A-Za-z]+')
df['text'] = df['text'].apply(lambda doc: non_letters.sub(' ', doc))

In [57]:
# b. Lower-case every post.
df['text'] = df['text'].apply(str.lower)

In [58]:
# c. Trim leading and trailing whitespace from every post.
df['text'] = df['text'].apply(str.strip)

In [59]:
# d. Drop English stopwords from every post.
nltk.download('stopwords')
from nltk.corpus import stopwords
# Note: from this line on `stopwords` names the set of English stopwords,
# no longer the NLTK corpus reader imported just above.
stopwords = set(stopwords.words('english'))
df['text'] = df['text'].apply(
    lambda doc: ' '.join(token for token in doc.split() if token not in stopwords)
)

[nltk_data] Downloading package stopwords to C:\Users\Windows
[nltk_data]     10\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Checking the data in the text column  

In [60]:
# Look at the cleaned text of the row labelled 4.
df.loc[4, 'text']

'thanks yahoo toolbar capture urls popups means show cool links korean pop k pop audio video without need relate instructions like go site click pop audio button choose without ado link hour k pop urllink audio urllink video streaming enjoy'

### 3.As we want to make this into a multi-label classification problem, you are required to merge all the label columns together, so that we have all the labels together for a particular sentence (7.5 points)
a.Label columns to merge: “gender”, “age”, “topic”, “sign”

### Merge all the label columns

In [61]:
# Build one list of four labels per row: gender, age (as a string), topic, sign.
merged_labels = [
    [gender, str(age), topic, sign]
    for gender, age, topic, sign in zip(df['gender'], df['age'], df['topic'], df['sign'])
]
# Wrap in a Series on the frame's own index so each list lands in its row.
df['labels'] = pd.Series(merged_labels, index=df.index)

In [62]:
# Keep only the model input and the merged target column.
df = df.loc[:, ['text', 'labels']]

### b.After completing the previous step, there should be only two columns in your data frame i.e. “text” and “labels” as shown in the below image

In [63]:
# Preview the two remaining columns.
df.head(n=5)

Unnamed: 0,text,labels
0,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,testing testing,"[male, 15, Student, Leo]"
4,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]"


### 4. Separate features and labels, and split the data into training and testing (5 points)

In [64]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the posts for testing; the fixed seed makes the split repeatable.
features = df['text'].values
targets = df['labels'].values
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.20,
    random_state=10,
)

### 5.Vectorize the features (5 points)
a.Create a Bag of Words using count vectorizer

i.Use ngram_range=(1, 2)

ii.Vectorize training and testing features

In [65]:
# Binary bag-of-words over unigrams and bigrams. The vocabulary is learned from
# the training posts only and then reused unchanged for the test posts.
vectorizer = CountVectorizer(ngram_range=(1, 2), binary=True)
X_train_bow = vectorizer.fit_transform(raw_documents=X_train)
X_test_bow = vectorizer.transform(raw_documents=X_test)

In [66]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# tolist() keeps the plain-list display of the first five vocabulary entries.
vectorizer.get_feature_names_out()[:5].tolist()

['aa', 'aa amazing', 'aa anger', 'aa compared', 'aa nice']

#### b.Print the term-document matrix

In [67]:
# Show the training document-term matrix as a dense array.
# NOTE(review): toarray() materialises every cell of the matrix; with
# unigram + bigram features this can need a lot of memory. Presumably
# acceptable for the 5,000-row sample, but confirm before using more rows.
X_train_bow.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### 6.Create a dictionary to get the count of every label i.e. the key will be label name and value will be the total count of the label. Check below image for reference (5 points)

In [68]:
# Tally how often each label value occurs across all rows.
# Keys appear in first-seen order.
label_counts = {}

for row_labels in df['labels'].values:
    for tag in row_labels:
        label_counts[tag] = label_counts.get(tag, 0) + 1

In [69]:
# Display the label -> occurrence-count dictionary.
label_counts

{'male': 3294,
 '15': 339,
 'Student': 569,
 'Leo': 190,
 '33': 101,
 'InvestmentBanking': 70,
 'Aquarius': 329,
 'female': 1706,
 '14': 170,
 'indUnk': 1381,
 'Aries': 2483,
 '25': 268,
 'Capricorn': 84,
 '17': 331,
 'Gemini': 86,
 '23': 137,
 'Non-Profit': 47,
 'Cancer': 94,
 'Banking': 16,
 '37': 19,
 'Sagittarius': 704,
 '26': 96,
 '24': 353,
 'Scorpio': 408,
 '27': 86,
 'Education': 118,
 '45': 14,
 'Engineering': 119,
 'Libra': 414,
 'Science': 33,
 '34': 540,
 '41': 14,
 'Communications-Media': 61,
 'BusinessServices': 87,
 'Sports-Recreation': 75,
 'Virgo': 41,
 'Taurus': 100,
 'Arts': 31,
 'Pisces': 67,
 '44': 3,
 '16': 67,
 'Internet': 20,
 'Museums-Libraries': 2,
 'Accounting': 2,
 '39': 79,
 '35': 2307,
 'Technology': 2332,
 '36': 60,
 'Law': 3,
 '46': 7,
 'Consulting': 16,
 'Automotive': 14,
 '42': 9,
 'Religion': 4}

### 7.Transform the labels - (7.5 points)
### As we have noticed before, in this task each example can have multiple tags. To deal with this kind of prediction, we need to transform the labels into binary form, and the prediction will be a mask of 0s and 1s. For this purpose, it is convenient to use `MultiLabelBinarizer` from sklearn.
### a.Convert your train and test labels using MultiLabelBinarizer

In [70]:
# Fix the output columns to the alphabetically sorted label set so the train
# and test masks share one column layout.
all_labels = sorted(label_counts)
mlb = MultiLabelBinarizer(classes=all_labels)
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)

### 8.Choose a classifier - (5 points)
### In this task, we suggest using the One-vs-Rest approach, which is implemented in the `OneVsRestClassifier` class. In this approach k classifiers (= number of tags) are trained. As a basic classifier, use `LogisticRegression`. It is one of the simplest methods, but it often performs well enough in text classification tasks. It might take some time because the number of classifiers to train is large.
### a.Use a linear classifier of your choice, wrap it up in OneVsRestClassifier to train it on every label
### b.As One-vs-Rest approach might not have been discussed in the sessions, we are providing you with the code for that

In [71]:
# One-vs-rest wrapper around a logistic-regression base estimator:
# one binary model is trained per label column.
clf = OneVsRestClassifier(LogisticRegression(solver='lbfgs'))

#### Fit the classifier

In [72]:
# Train on the bag-of-words features against the binarized training labels.
clf.fit(X=X_train_bow, y=y_train)

OneVsRestClassifier(estimator=LogisticRegression())

### 9.Fit the classifier, make predictions and get the accuracy (5 points)
#### a.Print the following
- i.Accuracy score
- ii.F1 score
- iii.Average precision score
- iv.Average recall score
- v.Tip: Make sure you are familiar with all of them. How would you expect the things to work for the multi-label scenario? Read about micro/macro/weighted averaging

In [73]:
def print_evaluation_scores(y_val, predicted, scores=None):
    """Print accuracy, F1, average precision and recall for multi-label predictions.

    F1, average precision and recall are micro-averaged, i.e. computed over all
    (sample, label) decisions pooled together.

    Parameters
    ----------
    y_val
        True labels as a binary indicator matrix (the notebook passes the
        output of ``MultiLabelBinarizer.transform``).
    predicted
        Predicted 0/1 labels in the same layout as ``y_val`` (the notebook
        passes the output of ``clf.predict``).
    scores : optional
        Continuous per-label scores in the same layout, e.g. the output of
        ``clf.decision_function``. ``average_precision_score`` summarises a
        precision-recall curve and is meant to receive such scores. When
        omitted, the hard predictions are used instead, which reproduces the
        previous behaviour of this function.

    Returns
    -------
    None
        The four scores are written to standard output.
    """
    # Fall back to the hard predictions so existing two-argument calls keep
    # printing exactly what they printed before.
    ranking = predicted if scores is None else scores
    print('Accuracy score: ', accuracy_score(y_val, predicted))
    print('F1 score: ', f1_score(y_val, predicted, average='micro'))
    print('Average precision score: ', average_precision_score(y_val, ranking, average='micro'))
    print('Average recall score: ', recall_score(y_val, predicted, average='micro'))

In [74]:
# Predict in this cell so it works on a fresh kernel: `predicted_labels` was
# otherwise first assigned only in a later cell, which makes this call fail
# with NameError under Restart & Run All.
predicted_labels = clf.predict(X_test_bow)

print('Bag-of-words')
print_evaluation_scores(y_test, predicted_labels)

Bag-of-words
Accuracy score:  0.512
F1 score:  0.7306492770059542
Average precision score:  0.5699773430109875
Average recall score:  0.64425


### 10.Print true label and predicted label for any five examples (7.5 points)

In [75]:
# Continuous per-label decision scores for the test posts ...
predicted_scores = clf.decision_function(X_test_bow)
# ... and the corresponding hard 0/1 label predictions.
predicted_labels = clf.predict(X_test_bow)

In [76]:
# Turn the 0/1 masks back into tuples of label names, for the true labels
# and for the predictions.
y_test_inversed = mlb.inverse_transform(y_test)
pred_inversed = mlb.inverse_transform(predicted_labels)

### Print any five example samples

In [77]:
# Print the first five test posts with their true and predicted labels.
example_template = 'Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'

for idx in range(5):
    true_tags = ','.join(y_test_inversed[idx])
    predicted_tags = ','.join(pred_inversed[idx])
    print(example_template.format(X_test[idx], true_tags, predicted_tags))

Title:	urllink slave mind life going towards directions knew wrong yet went ahead love someone even know pain yearn listen voice listen sorrows sacrifice everything happiness slave mind holding us chains overmind even one love betrayed often would cling illusion mind fed us love strange feeling amount science maths measure decipher feels like even words could descibe ones love ones hurt prince shadow
True labels:	15,Aquarius,Student,male
Predicted labels:	Student,female


Title:	munday relaxing weekend little divas went grandma got sleep saturday sunday nothing delicious sleeping weekends hair back rich brown sans gray espresso name box extra hours beauty sleep dip dye feeling fabulous may even paint toenails tonight ooh decadence littlest diva pre k teacher came great way avoiding morning separation anxiety drama subjected two weeks instead prying leg attempt escape classroom teacher meets us cafeteria way smoother transition daughter student know works left little girl blue beige uni