In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [27]:
import sys
sys.version

'3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]'

In [28]:
! pip install nltk



#### 1. Load the dataset (5 points)
a. Tip: As the dataset is large, use fewer rows. Check what is working well on your machine and 
decide accordingly.

In [29]:
df1 = pd.read_csv('blogtext.csv')
df1.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [30]:
data_new.isnull().sum()

id           0
gender       0
age          0
topic        0
sign         0
date         0
text      3000
labels       0
dtype: int64

#### 2. Preprocess rows of the “text” column (7.5 points)
a. Remove unwanted characters
b. Convert text to lowercase
c. Remove unwanted spaces
d. Remove stopwords

In [31]:
df2 = df1[:3000]

In [None]:
df2['text'] = df2['text'].str.replace('[^A-Za-z]',' ')
df2['text'] = df2['text'].str.lower()
df2['text'] = df2['text'].str.strip()
df2['text'] = df2['text'].str.split()  
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
df2.text = df2.text.apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords]))

#### 3. As we want to make this into a multi-label classification problem, you are required to merge all the label 
columns together, so that we have all the labels together for a particular sentence (7.5 points)
a. Label columns to merge: “gender”, “age”, “topic”, “sign”
b. After completing the previous step, there should be only two columns in your data frame i.e. “text” 
and “labels” as shown in the below image

In [None]:
df2['age'] = df2['age'].astype(str)
df2['labels']=df2[['gender','age','topic','sign']].apply(lambda x:','.join(x), axis = 1) 
merged_df=df2.drop(labels =['date','gender', 'age','topic','sign','id'], axis = 1)
merged_df.head()

#### 4. Separate features and labels, and split the data into training and testing (5 points)

In [None]:
X = merged_df['text']
merged_df['labels'] = merged_df['labels'].str.lower()= mergedg_df['labels']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.33, random_state = 43)

#### 5. Vectorize the features (5 points)
a. Create a Bag of Words using count vectorizer
i. Use ngram_range=(1, 2)
ii. Vectorize training and testing features
b. Print the term-document matrix

In [None]:
vectorizer = CountVectorizer(min_df = 2,ngram_range = (1,2),stop_words = "english")
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
vectorizer_labels = CountVectorizer(min_df = 1,ngram_range = (1,1),stop_words = "english")
labels_vector = vectorizer_labels.fit_transform(labels)
label_classes=[]
for  key in vectorizer_labels.vocabulary_.keys():
    label_classes.append(key)
MLB = MultiLabelBinarizer(classes = label_classes)

#### 7. Transform the labels - (7.5 points)
As we have noticed before, in this task each example can have multiple tags. To deal with such kind of 
prediction, we need to transform labels in a binary form and the prediction will be a mask of 0s and 1s. 
For this purpose, it is convenient to use MultiLabelBinarizer from sklearn
a. Convert your train and test labels using MultiLabelBinarizer

In [None]:
y = [["".join(re.findall("\w",f)) for f in lst] for lst in [s.split(",") for s in y]]
labels_trans = mlb.fit(labels) 
y_train = [["".join(re.findall("\w",f)) for f in lst] for lst in [s.split(",") for s in y_train]]
y_train = mlb.transform(y_train)
Y_test = [["".join(re.findall("\w",f)) for f in lst] for lst in [s.split(",") for s in y_test]]
y_test_trans = mlb.transform(y_test) 

#### 8. Choose a classifier - (5 points)
In this task, we suggest using the One-vs-Rest approach, which is implemented in OneVsRestClassifier
class. In this approach k classifiers (= number of tags) are trained. As a basic classifier, use 
LogisticRegression. It is one of the simplest methods, but often it performs good enough in text 
classification tasks. It might take some time because the number of classifiers to train is large.
a. Use a linear classifier of your choice, wrap it up in OneVsRestClassifier to train it on every label
b. As One-vs-Rest approach might not have been discussed in the sessions, we are providing you 
with the code for that

In [None]:
clf = LogisticRegression(solver = 'lbfgs',max_iter = 1000) 
clf = OneVsRestClassifier(clf)
clf.fit(X_train,Y_train)

In [None]:
print("Training Accuracy:",clf.score(X_train,y_train))

#### 9. Fit the classifier, make predictions and get the accuracy (5 points)
a. Print the following
i. Accuracy score
ii. F1 score
iii. Average precision score
iv. Average recall score
v. Tip: Make sure you are familiar with all of them. How would you expect the things to 
work for the multi-label scenario? Read about micro/macro/weighted averaging

In [None]:
y_pred = clf.predict(X_test)
print("Test Accuracy:" + str(accuracy_score(y_test,y_pred)))
print("F1: " + str(f1_score(y_test,y_pred)))
print("F1_macro: " + str(f1_score(y_test,y_pred)))
print("Precision: " + str(precision_score(y_test,y_pred)))

#### 10. Print true label and predicted label for any five examples (7.5 points)

In [None]:
print(" Predicted :",y_pred[24])
print(" Actual :",y_test[24])

In [None]:
print(" Predicted :",y_pred[55])
print(" Actual :",y_test[55])