# Question 1a: Get to know the Data: WordCloud

In [1]:
# Start with loading all necessary libraries
import numpy as np
import pandas as pd
from os import path
import re
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import string
from collections import defaultdict

#sklearn libraries
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import stop_words as sk_sw
from sklearn.model_selection import RepeatedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import random as sparse_random
from sklearn.linear_model import LogisticRegression

#keras libraries for CNN
#import tensorflow
#import keras
#from keras.models import Sequential
#from keras import layers




In [2]:
#library for stop words
import nltk
nltk.download()
from nltk.corpus import stopwords  
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [3]:
dataset = pd.read_csv('train.csv')

df = pd.concat([dataset['Title'], dataset['Content'], dataset['Label']], axis=1)
df

Unnamed: 0,Title,Content,Label
0,"Netflix is coming to cable boxes, and Amazon i...",if you subscribe to one of three rinky-dink (...,Entertainment
1,"Pharrell, Iranian President React to Tehran 'H...","pharrell, iranian president react to tehran '...",Entertainment
2,Wildlife service seeks comments,the u.s. fish and wildlife service has reopen...,Technology
3,Facebook teams up with Storyful to launch 'FB ...,the very nature of social media means it is o...,Technology
4,Caesars plans US$880 mln New York casino,caesars plans us$880 mln new york casino jul ...,Business
...,...,...,...
111790,Microsoft requires Office 2013 licensing for s...,in contrast to the muckle of special licenses...,Technology
111791,Smallpox vials missing since 1950s found in la...,government workers at a research center near ...,Health
111792,Scientists May Have Just Discovered the Key to...,harvard scientists may have just unlocked the...,Health
111793,Justin Bieber to plead guilty to DUI,"justin bieber to plead guilty to duifri, 13 ju...",Entertainment


In [4]:
#Joining Title and Content column and separating them with '---' symbol 
df['United'] = df['Title'] + ' ' + df['Content']
df

Unnamed: 0,Title,Content,Label,United
0,"Netflix is coming to cable boxes, and Amazon i...",if you subscribe to one of three rinky-dink (...,Entertainment,"Netflix is coming to cable boxes, and Amazon i..."
1,"Pharrell, Iranian President React to Tehran 'H...","pharrell, iranian president react to tehran '...",Entertainment,"Pharrell, Iranian President React to Tehran 'H..."
2,Wildlife service seeks comments,the u.s. fish and wildlife service has reopen...,Technology,Wildlife service seeks comments the u.s. fish...
3,Facebook teams up with Storyful to launch 'FB ...,the very nature of social media means it is o...,Technology,Facebook teams up with Storyful to launch 'FB ...
4,Caesars plans US$880 mln New York casino,caesars plans us$880 mln new york casino jul ...,Business,Caesars plans US$880 mln New York casino caes...
...,...,...,...,...
111790,Microsoft requires Office 2013 licensing for s...,in contrast to the muckle of special licenses...,Technology,Microsoft requires Office 2013 licensing for s...
111791,Smallpox vials missing since 1950s found in la...,government workers at a research center near ...,Health,Smallpox vials missing since 1950s found in la...
111792,Scientists May Have Just Discovered the Key to...,harvard scientists may have just unlocked the...,Health,Scientists May Have Just Discovered the Key to...
111793,Justin Bieber to plead guilty to DUI,"justin bieber to plead guilty to duifri, 13 ju...",Entertainment,Justin Bieber to plead guilty to DUI justin bi...


### Stop Wrods

In [5]:
# Create stopword list:
nltk_sw = stopwords.words('english')
sklearn_stop_words = sk_sw.ENGLISH_STOP_WORDS

In [6]:
#Load from text file stop word for better performance
#the stop words file was taking from http://members.unine.ch/jacques.savoy/clef/englishST.txt
sw_list = []
with open('StopWords.txt', 'r') as f:
    [sw_list.append(word) for line in f for word in line.split()]

print("Lenghs of Stop Words from file ", len(sw_list))
print("Lenghs of Stop Words from NLTK Library ", len(nltk_sw))
print("Lenghs of Stop Words from Sklearn Library ", len(sklearn_stop_words))

Lenghs of Stop Words from file  571
Lenghs of Stop Words from NLTK Library  179
Lenghs of Stop Words from Sklearn Library  318


### Cleaning the texts

In [8]:
#Remove blank rows if any.
df['United'].dropna(inplace=True)

# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

corpus = df['United']
                  
#Removing punctuations and numbers.                
corpus = corpus.str.replace('[^\w\s]','') #punctuations
corpus = corpus.str.replace('\d+', '') #numbers

In [9]:
def text_preprocessing(text):
    #Change all the text to lower case.
    text = text.lower()

    #Tokenization : In this each entry in the corpus will be broken into set of words
    text_words_list = word_tokenize(text)

    #Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
    #Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    
    #Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(text_words_list):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in sw_list and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word, tag_map[tag[0]])
            Final_words.append(word_Final)
    return str(Final_words)

In [10]:
df['United'] = corpus.map(text_preprocessing)

In [None]:
df

### Creating Word Cloud

In [None]:
def make_word_cloud(input_text, mask, output_file_name, stopwords=None, extra_stopwords=None,
                    bckgrd_color = "white", max_words=2000):

    # Load the mask image
    mask = np.array(Image.open(mask))
    
    # Load stop word list
    stopwords = set(stopwords)
    
    # Add extra stop words if provided
    if extra_stopwords is not None:
        [stopwords.add(word) for word in extra_stopwords]
    
    # Call WordCloud
    wc = WordCloud(background_color = bckgrd_color, max_words = max_words, collocations = False, relative_scaling=0,
                   mode="RGBA", mask = mask, stopwords = stopwords)

    # Generate word cloud
    wc.generate(input_text)
    
    # create coloring from image
    image_colors = ImageColorGenerator(mask)
    plt.figure(figsize=[7,7])
    plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
    plt.axis("off")

    # store to file
    plt.savefig(output_file_name, format="png")

    plt.show()
    

In [None]:
tech = " ".join(text for text in df[df["Label"]=="Technology"].United)
enter = " ".join(text for text in df[df["Label"]=="Entertainment"].United)
busi = " ".join(text for text in df[df["Label"]=="Business"].United)
heal = " ".join(text for text in df[df["Label"]=="Health"].United)

In [None]:
# Generate a word cloud image for Techlonogy
make_word_cloud(input_text = tech, mask = "img/tech.jpg", output_file_name = "img/result_tech.png", stopwords = sw_list)

print("Word Cloud for Technology Label")

In [None]:
# Generate a word cloud image for Techlonogy
make_word_cloud(input_text = enter, mask = "img/entertainment.png", output_file_name = "img/result_entertainment.png", 
                stopwords = sw_list)

print("Word Cloud for Entertainment Label")

In [None]:
# Generate a word cloud image for Business 
make_word_cloud(input_text = busi, mask = "img/business.jpg", output_file_name = "img/result_business.png", stopwords = sw_list)

print("Word Cloud for Business Label")

In [None]:
# Generate a word cloud image for Health
make_word_cloud(input_text = heal, mask = "img/health.jpg", output_file_name = "img/result_health.png", stopwords = sw_list)

print("Word Cloud for Health Label")

# Question 1b: Classification Task

 ### Split the model into Train and Test Data set and Encoding

In [11]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(df['United'], df['Label'], test_size=0.3)

In [12]:
#Label encode the target variable
#This is done to transform Categorical data of string type in the data set into numerical values
Encoder = LabelEncoder()
Encoder.fit(Train_Y)
Train_Y = Encoder.transform(Train_Y)
Test_Y = Encoder.transform(Test_Y)


In [None]:
y = Encoder.fit_transform(df['Label']) 

In [99]:
df['United']

0         ['netflix', 'come', 'cable', 'box', 'amazon', ...
1         ['pharrell', 'iranian', 'president', 'react', ...
2         ['wildlife', 'service', 'seek', 'comment', 'fi...
3         ['facebook', 'team', 'storyful', 'launch', 'fb...
4         ['caesar', 'plan', 'mln', 'york', 'casino', 'c...
                                ...                        
111790    ['microsoft', 'require', 'office', 'licensing'...
111791    ['smallpox', 'vial', 'miss', 'find', 'lab', 's...
111792    ['scientist', 'discover', 'key', 'reverse', 'a...
111793    ['justin', 'bieber', 'plead', 'guilty', 'dui',...
111794    ['tracy', 'morgan', 'upgrade', 'fair', 'condit...
Name: United, Length: 111795, dtype: object

### Creating the Bag of Words (BoW)

In [42]:
count_vec = CountVectorizer(max_features = 10000)
count_vec.fit(df['United'])
count_vec

CountVectorizer(max_features=10000)

In [14]:
spare_matrix = count_vec.transform(df['United'])

In [45]:
X = spare_matrix

In [16]:
Train_X_Vec = count_vec.transform(Train_X)
Test_X_Vec = count_vec.transform(Test_X)

In [41]:
Train_X_Vec

<78256x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 9198022 stored elements in Compressed Sparse Row format>

### SVM (BoW)

In [17]:
SVMClas_BoW = svm.SVC(C = 1.0, kernel = 'linear', degree = 3, gamma = 'auto')
SVMClas_BoW.fit(Train_X_Vec, Train_Y)

SVC(gamma='auto', kernel='linear')

In [18]:
# predict the labels on validation dataset
predictions_SVM_BoW = SVMClas_BoW.predict(Test_X_Vec)

In [19]:
print(classification_report(Test_Y, predictions_SVM_BoW, target_names = df['Label'].unique()))

               precision    recall  f1-score   support

Entertainment       0.89      0.91      0.90      7519
   Technology       0.98      0.98      0.98     13464
     Business       0.96      0.94      0.95      3674
       Health       0.94      0.93      0.93      8882

     accuracy                           0.95     33539
    macro avg       0.94      0.94      0.94     33539
 weighted avg       0.95      0.95      0.95     33539



### Random Forests (BoW)

In [20]:
RFClas_BoW = RandomForestClassifier(n_estimators = 100)
RFClas_BoW.fit(Train_X_Vec, Train_Y)

RandomForestClassifier()

In [21]:
# predict the labels on validation dataset
predictions_RF_BoW = RFClas_BoW.predict(Test_X_Vec)

In [22]:
print(classification_report(Test_Y, predictions_RF_BoW, target_names = df['Label'].unique()))

               precision    recall  f1-score   support

Entertainment       0.91      0.90      0.91      7519
   Technology       0.96      0.98      0.97     13464
     Business       0.96      0.90      0.93      3674
       Health       0.92      0.92      0.92      8882

     accuracy                           0.94     33539
    macro avg       0.94      0.93      0.93     33539
 weighted avg       0.94      0.94      0.94     33539



### My Method - Logistic Regression 

In [23]:
LGClas_BoW = LogisticRegression(max_iter = 10000, random_state = 0)
LGClas_BoW.fit(Train_X_Vec, Train_Y)

LogisticRegression(max_iter=10000, random_state=0)

In [24]:
predictions_LG_BoW = LGClas_BoW.predict(Test_X_Vec)

In [25]:
print(classification_report(Test_Y, predictions_LG_BoW, target_names = df['Label'].unique()))

               precision    recall  f1-score   support

Entertainment       0.92      0.92      0.92      7519
   Technology       0.98      0.99      0.98     13464
     Business       0.97      0.96      0.96      3674
       Health       0.94      0.94      0.94      8882

     accuracy                           0.96     33539
    macro avg       0.95      0.95      0.95     33539
 weighted avg       0.96      0.96      0.96     33539



### Creating the Singular Value Decomposition (SVD)

In [26]:
svd = TruncatedSVD(n_components = 5, n_iter = 7, random_state = 42)
svd.fit(spare_matrix)

TruncatedSVD(n_components=5, n_iter=7, random_state=42)

In [27]:
svd.transform(spare_matrix)

array([[ 3.00349290e+00, -3.60326463e-01,  3.73416219e-03,
         1.61444397e+00, -5.84706006e-01],
       [ 2.04676370e+00,  4.79730706e-02, -6.36475373e-01,
        -5.01986644e-01,  1.25071841e-01],
       [ 3.99276871e+00,  1.38119389e-01, -8.50946907e-02,
         1.08318056e+00,  1.56007403e+00],
       ...,
       [ 4.73855162e+00, -1.07045394e-02, -1.80324545e+00,
        -2.19784195e+00,  5.40502806e+00],
       [ 1.44973560e+00, -2.18431832e-02, -5.85598230e-01,
        -3.63746721e-01, -2.23043373e-02],
       [ 1.37615154e+00,  2.81094068e-01, -5.73145649e-01,
        -5.01959499e-01,  3.00647377e-01]])

In [28]:
Train_X_svd = svd.transform(Train_X_Vec)
Test_X_svd = svd.transform(Test_X_Vec)

In [None]:
Test_X_svd.shape

In [37]:
Train_Y.shape

(78256,)

### SVM (SVD)

In [29]:
SVMClas_SVD = svm.SVC(C = 1.0, kernel = 'linear', degree = 3, gamma = 'auto')
SVMClas_SVD.fit(Train_X_svd, Train_Y)

SVC(gamma='auto', kernel='linear')

In [30]:
# predict the labels on validation dataset
predictions_SVM_SVD = SVMClas_SVD.predict(Test_X_svd)

In [38]:
print(classification_report(Test_Y, predictions_SVM_SVD, target_names = df['Label'].unique()))

               precision    recall  f1-score   support

Entertainment       0.82      0.73      0.78      7519
   Technology       0.86      0.95      0.90     13464
     Business       0.82      0.75      0.78      3674
       Health       0.75      0.73      0.74      8882

     accuracy                           0.82     33539
    macro avg       0.81      0.79      0.80     33539
 weighted avg       0.82      0.82      0.82     33539



### Random Forests (SVD)

In [31]:
RFClas_SVD = RandomForestClassifier(n_estimators = 100)
RFClas_SVD.fit(Train_X_svd, Train_Y)

RandomForestClassifier()

In [32]:
predictions_RF_SVD = RFClas_SVD.predict(Test_X_svd)

In [None]:
print(classification_report(Test_Y, predictions_RF_SVD, target_names = df['Label'].unique()))

### My Method - Logistic Regression

In [33]:
LGClas_SVD = LogisticRegression(max_iter = 10000, random_state = 0)
LGClas_SVD.fit(Train_X_svd, Train_Y)

LogisticRegression(max_iter=10000, random_state=0)

In [34]:
predictions_LG_SVD = LGClas_SVD.predict(Test_X_svd)

In [None]:
print(classification_report(Test_Y, predictions_LG_SVD, target_names = df['Label'].unique()))

### 5-Fold Cross Validation

In [44]:
from sklearn.model_selection import KFold

kf = KFold(n_splits = 5)
SVMClf = svm.SVC(C = 1.0, kernel = 'linear', degree = 3, gamma = 'auto')
for train_index, test_index in kf.split(df['United'], df['Label']):
    X_train_counts = count_vec.transform(np.array(df['United'])[train_index])
    X_test_counts = count_vec.transform(np.array(df['United'])[test_index])
    
    clf_cv = SVMClf.fit(X_train_counts, np.array(df['Label'])[train_index])
    yPred = clf_cv.predict(X_test_counts)
    yTrue = np.array(df['Label'])[test_index]
    print(classification_report(yPred, yTrue, target_names = df['Label'].unique()))

               precision    recall  f1-score   support

Entertainment       0.91      0.90      0.90      5070
   Technology       0.98      0.98      0.98      8964
     Business       0.95      0.96      0.96      2348
       Health       0.92      0.94      0.93      5977

     accuracy                           0.95     22359
    macro avg       0.94      0.94      0.94     22359
 weighted avg       0.95      0.95      0.95     22359

               precision    recall  f1-score   support

Entertainment       0.92      0.90      0.91      5150
   Technology       0.98      0.98      0.98      8920
     Business       0.94      0.96      0.95      2379
       Health       0.92      0.94      0.93      5910

     accuracy                           0.95     22359
    macro avg       0.94      0.94      0.94     22359
 weighted avg       0.95      0.95      0.95     22359

               precision    recall  f1-score   support

Entertainment       0.91      0.89      0.90      4986
   

Using pipelines

In [61]:
from sklearn.pipeline import Pipeline


SVM_bow = Pipeline([("SVM", svm.SVC(C = 1.0, kernel = 'linear', degree = 3, gamma = 'auto'))])
RF_bow = Pipeline([("Random Forest", RandomForestClassifier(n_estimators = 100))])
SVM_svd = Pipeline([('feat', TruncatedSVD(n_components = 5, n_iter = 7, random_state = 42)), ("SVM", svm.SVC(C = 1.0, kernel = 'linear', degree = 3, gamma = 'auto'))])
RF_svd = Pipeline([('feat', TruncatedSVD(n_components = 5, n_iter = 7, random_state = 42)), ("Random Forest", RandomForestClassifier(n_estimators = 100))])
LR_bow = Pipeline([("Logistic Regression", LogisticRegression(max_iter = 10000, random_state = 0))])

In [78]:
models=[
    ("SVM (BoW)", SVM_bow),
    ("Random Forest (BoW)", RF_bow),
    ("SVM (SVD)", SVM_svd),
    ("Random Forest (SVD)", RF_svd),
    ("My Method", LR_bow)
]

In [85]:
#Model evaluation function
d = {'Statistic Measure': ['Accuracy', 'Precision', 'Recall', 'F-Measure']}
evaluation = pd.DataFrame(data=d).set_index('Statistic Measure')

In [86]:
# Applying K-folds for pipeline
kf = KFold(n_splits=5)
for name, model in models:
	s = pd.Series()
	for train, test in kf.split(spare_matrix):
		X_train, X_test = X[train], X[test]
		y_train, y_test = y[train], y[test]
		train = model.fit(X_train, y_train)
		ypred = model.predict(X_test)
	d = [accuracy_score(y_test, ypred), precision_score(y_test, ypred, average = 'macro'), recall_score(y_test, ypred, average = 'macro'), f1_score(y_test, ypred, average = 'macro')]
	evaluation[name] = d

  s = pd.Series()
  s = pd.Series()
  s = pd.Series()
  s = pd.Series()


In [67]:
d = [accuracy_score(y_test, ypred), precision_score(y_test, ypred, average = 'macro'), recall_score(y_test, ypred, average = 'macro'), f1_score(y_test, ypred, average = 'macro')]
evaluation[name] = d

In [None]:
print(evaluation)

In [90]:
evaluation.to_csv('5fold_1a.csv', sep = ',')

# Preporcessing test dataset

In [102]:
test_data = pd.read_csv('test_without_labels.csv', sep = ',')
test_data['United'] = test_data['Title'] + ' ' + test_data['Content']

#Remove blank rows if any.
test_data['United'].dropna(inplace=True)

# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

corpus_test = test_data['United']
                  
#Removing punctuations and numbers.                
corpus_test = corpus_test.str.replace('[^\w\s]','') #punctuations
corpus_test = corpus_test.str.replace('\d+', '') #numbers

test_data['United'] = corpus_test.map(text_preprocessing)

In [106]:
test  = count_vec.transform(test_data['United'])
id_n = test_data['Id']
print(test)
predictions = RFClas_BoW.predict(test)
prediction_table = pd.DataFrame(data = id_n, columns = ['Id'])
prediction_table['Predicted'] = predictions

for i in range(prediction_table.shape[0]):
    if prediction_table['Predicted'][i] == 0:
        prediction_table['Predicted'][i] = 'Business'
    if prediction_table['Predicted'][i] == 1:
        prediction_table['Predicted'][i] = 'Entertainment'
    if prediction_table['Predicted'][i] == 2:
        prediction_table['Predicted'][i] = 'Health'
    if prediction_table['Predicted'][i] == 3:
        prediction_table['Predicted'][i] = 'Technology'
        
prediction_table.to_csv('testSet_categories_1b.csv', sep =',', index=False)

  (0, 53)	1
  (0, 100)	1
  (0, 510)	1
  (0, 559)	1
  (0, 571)	1
  (0, 651)	2
  (0, 678)	1
  (0, 698)	1
  (0, 713)	1
  (0, 770)	1
  (0, 842)	1
  (0, 1148)	2
  (0, 1466)	1
  (0, 1585)	1
  (0, 1745)	3
  (0, 1808)	1
  (0, 1816)	1
  (0, 1852)	4
  (0, 1887)	1
  (0, 2101)	4
  (0, 2127)	1
  (0, 2134)	1
  (0, 2299)	1
  (0, 2727)	1
  (0, 2728)	1
  :	:
  (47911, 7795)	3
  (47911, 7807)	1
  (47911, 7945)	2
  (47911, 8418)	2
  (47911, 8429)	1
  (47911, 8478)	9
  (47911, 8485)	1
  (47911, 8504)	1
  (47911, 8522)	1
  (47911, 8561)	3
  (47911, 8588)	1
  (47911, 8806)	1
  (47911, 8809)	4
  (47911, 8880)	2
  (47911, 9071)	2
  (47911, 9099)	1
  (47911, 9277)	1
  (47911, 9327)	1
  (47911, 9452)	1
  (47911, 9528)	1
  (47911, 9702)	1
  (47911, 9827)	6
  (47911, 9882)	1
  (47911, 9936)	1
  (47911, 9963)	2
