In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import re
import sys
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from nltk.stem import LancasterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the three CSV exports this notebook works from.
# article_topics.csv and topic_relationships.csv are parsed with pandas' pure-Python engine.
article_topics_df  = pd.read_csv("/kaggle/input/vmondaq-final/article_topics.csv", engine='python')
# header=None: the first row is treated as data, so columns are numbered until they are renamed.
articles_df = pd.read_csv("/kaggle/input/vmondaq-final/articles.csv", header=None)
topic_relationships_df = pd.read_csv("/kaggle/input/vmondaq-final/topic_relationships.csv", engine='python')

In [None]:
article_topics_df.shape, articles_df.shape, topic_relationships_df.shape

In [None]:
article_topics_df.head()

In [None]:
articles_df.head()

In [None]:
articles_df.columns = ['article_id', 'title', 'body','country_desc', 'category']

In [None]:
article_topics_df.nunique()

In [None]:
topic_relationships_df.head()

In [None]:
topic_relationships_df.nunique()

In [None]:
articles_df.nunique()

In [None]:
topic_relationships_df.columns = ['parent_topic_id', 'topic_id']


In [None]:
df = pd.merge(article_topics_df, topic_relationships_df, on='topic_id', how='left')

In [None]:
# Rows whose topic found no match in the left merge have a missing parent_topic_id;
# fall back to the topic's own id so such topics act as their own parent.
df['parent_topic_id'] = df['parent_topic_id'].fillna(df.topic_id)

In [None]:
articles_df = articles_df.dropna()

In [None]:
articles_df.shape, df.shape

In [None]:
articles_df.dtypes, df.dtypes

In [None]:
# Take an explicit copy of the two-column selection so the assignment below
# writes to an independent frame rather than to a slice of `df`
# (this is what triggers pandas' SettingWithCopyWarning).
final_df = df[['article_id', 'parent_topic_id']].copy()
# Constant marker column; it is the value aggregated by the pivot that follows.
final_df['value'] = 1
final_df.head()

In [None]:
# Reshape to one row per article and one column per parent topic.
# pivot_table uses its default aggregation here; since every 'value' is 1, a present
# (article, topic) pair yields 1 and an absent pair is left missing.
# reset_index() turns article_id back into an ordinary column.
result_df = pd.pivot_table(final_df, values = ['value'], index=['article_id'], columns = 'parent_topic_id').reset_index()

In [None]:
result_df.fillna(0, inplace=True)
result_df.head()

In [None]:
# Rebuild the frame from the raw value array: this discards result_df's column labels,
# leaving positional integer column names that are replaced further down.
f_df = pd.DataFrame(result_df.values)

In [None]:
f_df.shape

In [None]:
f_df.columns

In [None]:
# Name the columns: the article key, then topic_1 .. topic_32.
# NOTE(review): the label 'topic_27' is skipped, so 32 names are assigned in total —
# confirm this numbering is intended and matches the pivoted topic ids.
topic_labels = [f'topic_{idx}' for idx in range(1, 33) if idx != 27]
f_df.columns = ['article_id'] + topic_labels

In [None]:
f_df.head()

In [None]:
# Attach the per-article topic indicator columns to the article text and metadata.
# NOTE(review): how='left' keeps articles that have no row in f_df; their topic_* columns
# would then be missing — confirm there are none before these columns are used as labels.
r_df = pd.merge(articles_df, f_df, on='article_id', how='left')


In [None]:
r_df.head()


In [None]:
data = r_df

In [None]:
data['body']

****Defining Function to Parse the HTML Text body of the articles

In [None]:
def parse_text(html):
    """Strip the HTML markup from an article body and return its plain text.

    Parameters
    ----------
    html : str
        Raw HTML markup of a single article body.

    Returns
    -------
    str
        The text content of the document with all tags removed.
    """
    # Name the parser explicitly. Without it BeautifulSoup picks whichever parser
    # happens to be installed, so the extracted text can differ between
    # environments (and a GuessedAtParserWarning is emitted).
    soup = BeautifulSoup(html, "html.parser")
    return soup.get_text()

In [None]:
data.isna().sum()

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from bs4 import BeautifulSoup
import string
import csv
# Read the csv module's current per-field size limit (the return value is not displayed,
# because this is not the last expression of the cell).
csv.field_size_limit()
# Set the limit to 256<<10 == 262144.
# NOTE(review): the CSV files are read earlier in the notebook, so on a fresh kernel this
# setting cannot affect those reads — confirm whether it is still needed.
csv.field_size_limit(256<<10)
csv.field_size_limit()
# Convert every HTML article body to plain text with parse_text.
data['clean_text'] = data['body'].apply(lambda x: parse_text(x))

In [None]:
data.head()

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

In [None]:
df = data

In [None]:
# Normalise the parsed text: lower-case it, then delete every ASCII punctuation character.
_punct_table = str.maketrans('', '', string.punctuation)
df['clean_text'] = df['clean_text'].map(lambda raw: raw.lower().translate(_punct_table))

df.head()

In [None]:
import numpy as np
import pandas as pd
import re
import nltk
import spacy
import string
pd.options.mode.chained_assignment = None

****Lower Casing

Lower casing is a common text preprocessing technique. The idea is to convert the input text into same casing format so that 'text', 'Text' and 'TEXT' are treated the same way.

This is more helpful for text featurization techniques like frequency, tfidf as it helps to combine the same words together thereby reducing the duplication and get correct counts / tfidf values.

This may not be helpful when we do tasks like Part of Speech tagging (where proper casing gives some information about Nouns and so on) and Sentiment Analysis (where upper casing refers to anger and so on)

By default, lower casing is done by most modern vectorizers and tokenizers, such as sklearn's TfidfVectorizer and the Keras Tokenizer. So we need to disable their lower-casing option when our use case requires the original casing.



In [None]:
df["text_lower"] = df["clean_text"].str.lower()
df.head()

****Removal of Punctuations

Another common text preprocessing technique is to remove the punctuation from the text data. This is again a text standardization process that will help to treat 'hurray' and 'hurray!' in the same way.

We also need to carefully choose the list of punctuations to exclude depending on the use case. For example, the string.punctuation in python contains the following punctuation symbols

`` !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ ``

We can add or remove more punctuations as per our need.

In [None]:
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """Return ``text`` with every character listed in PUNCT_TO_REMOVE deleted."""
    deletion_table = str.maketrans('', '', PUNCT_TO_REMOVE)
    stripped = text.translate(deletion_table)
    return stripped

# Strip punctuation from each lower-cased document.
df["text_wo_punct"] = df["text_lower"].apply(remove_punctuation)
df.head()

In [None]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

****Removal of stopwords

Stopwords are commonly occurring words in a language, like 'the', 'a' and so on. They can be removed from the text most of the time, as they don't provide valuable information for downstream analysis. In cases like Part of Speech tagging, we should not remove them, as they provide very valuable information about the POS.

These stopword lists are already compiled for different languages and we can safely use them. For example, the stopword list for english language from the nltk package can be seen below.

In [None]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` that appears in STOPWORDS."""
    tokens = str(text).split()
    kept = (token for token in tokens if token not in STOPWORDS)
    return " ".join(kept)

# Remove English stopwords from the punctuation-free text.
df["text_wo_stop"] = df["text_wo_punct"].apply(remove_stopwords)
df.head()

In [None]:
from collections import Counter
# Corpus-wide token frequencies over the stopword-free text.
cnt = Counter()
for text in df["text_wo_stop"].values:
    cnt.update(text.split())

cnt.most_common(10)


****Removal of Frequent words

In the previous preprocessing step, we removed the stopwords based on language information. But if we have a domain-specific corpus, we might also have some frequent words which are not of much importance to us.

So this step is to remove the frequent words in the given corpus. If we use something like tfidf, this is automatically taken care of.

Let us get the most common words and then remove them in the next step

In [None]:
# The ten most frequent tokens in the corpus, taken from the frequency counter.
FREQWORDS = {token for token, _count in cnt.most_common(10)}
def remove_freqwords(text):
    """Drop every whitespace-separated token of ``text`` that appears in FREQWORDS."""
    kept = (token for token in str(text).split() if token not in FREQWORDS)
    return " ".join(kept)

# Remove the corpus' most frequent tokens from the stopword-free text.
df["text_wo_stopfreq"] = df["text_wo_stop"].apply(remove_freqwords)
df.head()

****Removal of Rare words

This is very similar to previous preprocessing step but we will remove the rare words from the corpus.

In [None]:
n_rare_words = 10
# Walk the frequency ranking backwards to collect the n_rare_words least common tokens.
RAREWORDS = {token for token, _count in cnt.most_common()[:-n_rare_words-1:-1]}
def remove_rarewords(text):
    """Drop every whitespace-separated token of ``text`` that appears in RAREWORDS."""
    survivors = filter(lambda token: token not in RAREWORDS, str(text).split())
    return " ".join(survivors)

# Remove the corpus' rarest tokens from the text that already lacks stop and frequent words.
df["text_wo_stopfreqrare"] = df["text_wo_stopfreq"].apply(remove_rarewords)
df.head()

****Stemming

Stemming is the process of reducing inflected (or sometimes derived) words to their word stem, base or root form (From Wikipedia)

For example, if there are two words in the corpus walks and walking, then stemming will stem the suffix to make them walk. But say in another example, we have two words console and consoling, the stemmer will remove the suffix and make them consol which is not a proper english word.

There are several types of stemming algorithms available; one of the best known is the Porter stemmer, which is widely used. We can use the nltk package for it.

In [None]:
from nltk.stem.porter import PorterStemmer 

stemmer = PorterStemmer()
def stem_words(text):
    """Stem each whitespace-separated token of ``text`` and re-join them with single spaces."""
    stems = map(stemmer.stem, text.split())
    return " ".join(stems)

# Stem the fully filtered text.
df["text_stemmed"] = df["text_wo_stopfreqrare"].apply(stem_words)
df.head()

****Lemmatization

Lemmatization is similar to stemming in that it reduces inflected words to their root form, but it differs in that it makes sure the root word (also called the lemma) belongs to the language.

As a result, this one is generally slower than stemming process. So depending on the speed requirement, we can choose to use either stemming or lemmatization.

Let us use the WordNetLemmatizer in nltk to lemmatize our sentences



In [None]:
import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# Maps the first letter of a POS tag to the matching WordNet part-of-speech constant.
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    """POS-tag the whitespace-separated tokens of ``text`` and lemmatize each one.

    Tags whose first letter is not in wordnet_map are lemmatized as nouns.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(text.split()):
        wn_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wn_pos))
    return " ".join(lemmas)

# Lemmatize the fully filtered text (the same input the stemmer received).
df["text_lemmatized"] = df["text_wo_stopfreqrare"].apply(lemmatize_words)
df.head()

In [None]:
df.head()

In [None]:
df.columns

In [None]:
# Every column that is not a topic indicator: article metadata plus the text variants.
non_label_columns = [
    'article_id', 'title', 'body', 'country_desc', 'category',
    'clean_text', 'text_lower', 'text_wo_punct', 'text_wo_stop',
    'text_wo_stopfreq', 'text_wo_stopfreqrare', 'text_stemmed',
    'text_lemmatized',
]
# NOTE(review): the features come from 'clean_text', not from the stemmed or lemmatized
# columns built above — confirm that this is intended.
X = df['clean_text']
# The remaining columns form the label matrix.
y = df.drop(columns=non_label_columns)

In [None]:
X.head()

In [None]:
y.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=144)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# stop_words must be given as a list: recent scikit-learn releases validate this
# parameter and reject a set. stopwords.words() already returns a list.
vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
# Learn the vocabulary and IDF weights from the training split only, so that no
# information from the held-out test documents leaks into the features.
vectorizer.fit(X_train)

In [None]:
x_train_tfidf_vec = vectorizer.transform(X_train)
x_test_tfidf_vec = vectorizer.transform(X_test)
print(x_train_tfidf_vec.shape, x_test_tfidf_vec.shape)

In [None]:
x_train_tfidf_vec.shape, x_test_tfidf_vec.shape

In [None]:
y.head()

In [None]:
def modelling(clf, x_train, y_train, x_test, y_test):
    """Fit ``clf`` on the training split and evaluate it on the test split.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    x_train, y_train
        Training features and labels.
    x_test, y_test
        Held-out features and labels used for evaluation.

    Returns
    -------
    tuple
        ``(y_train_pred, y_test_pred, precision, recall, f1, acc)``; precision,
        recall and f1 are macro-averaged over the test split, acc is the test accuracy.

    Prints the test accuracy, the macro F1 and the test confusion matrix.
    """
    clf.fit(x_train, y_train)
    y_train_pred = clf.predict(x_train)
    y_test_pred = clf.predict(x_test)

    # The fourth element of the result (support) is not used here.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_test_pred, average='macro'
    )
    acc = accuracy_score(y_test, y_test_pred)

    # Label each number: both values used to be printed under a single "accuracy" label.
    print("accuracy: ", acc, "macro f1: ", f1)
    # The matrix is computed on the test split, not on a cross-validation fold.
    print("confusion matrix for the test set is ")
    print(confusion_matrix(y_test, y_test_pred))

    return y_train_pred, y_test_pred, precision, recall, f1, acc

In [None]:
# Baseline: logistic regression predicting the single 'topic_2' indicator from the TF-IDF features.
topic_2_clf = LogisticRegression(C = 1)
y_train_pred, y_test_pred, precision, recall, f1, acc = modelling(
    topic_2_clf,
    x_train_tfidf_vec,
    y_train['topic_2'],
    x_test_tfidf_vec,
    y_test['topic_2'],
)