In [191]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


/kaggle/input/wec-rec/news_train.csv
/kaggle/input/wec-rec-test/test.csv


In [192]:
label = 'Category'
sample_size = 10000

# Integer class codes used for the task; a name's position in the list is its code.
category_names = ["Arts", "business", "humour", "politics", "sports", "tech"]
mapping = {name: code for code, name in enumerate(category_names)}

# Load the training set and replace each category name with its integer code.
# Series.map leaves NaN for any value that is not a key of `mapping`.
df = pd.read_csv('/kaggle/input/wec-rec/news_train.csv')
df[label] = df[label].map(mapping)

In [193]:
# Order the rows by class code, then keep at most `sample_size` rows per class.
sorted_by_label = df.sort_values(by=[label], ascending=True)
sample = sorted_by_label.groupby(label).head(sample_size)

#Later, this step turned out to be redundant.

In [194]:
#Preparing training data
# Work on an explicit copy so that adding a column does not write into a frame
# derived from `df` (avoids pandas' SettingWithCopyWarning).
sample = sample.copy()

# Fill missing titles/headlines BEFORE concatenating: `str + NaN` is NaN, so
# filling afterwards would blank the entire text whenever either field is
# missing and throw away the field that was present.
# .str.strip() removes the lone separator left when one side is empty.
sample['Text'] = (
    sample['News_title'].fillna('') + ' ' + sample['News_headline'].fillna('')
).str.strip()

# Only the label and the combined text are needed from here on.
sample = sample.drop(columns=['ID','News_headline','News_title'])

In [195]:
sample


Unnamed: 0,Category,Text
0,0,"Do men enjoy sex more, or women? The Mahabhara..."
11664,0,Why Disha Patani playing Sangamithra hurts my ...
14812,0,The designer behind Deepika's wedding sari who...
4135,0,"Sex, violence aur pyaar: The preposterous plea..."
4133,0,"Why, Vivek Oberoi, Why? Just when his career c..."
...,...,...
12241,5,What Google and Facebook are doing to tackle I...
7151,5,Samsung Galaxy C8: When will it hit Indian mar...
10634,5,"No, India isn't ready for Mumbai-Pune hyperloo..."
13591,5,The Planetary Health Diet: How to save yoursel...


In [196]:
#Splitting train data for accuracy verification
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation, stratified on the class label,
# with a fixed seed so the split is repeatable.
texts, labels = sample.Text, sample.Category
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=2022,
    stratify=labels,
)

In [197]:
# Report the size of each partition, then the per-class counts of each.
for split_name, split in (("X_train", X_train), ("X_test", X_test)):
    print(f"Shape of {split_name}: ", split.shape)
for targets in (y_train, y_test):
    print(targets.value_counts())

Shape of X_train:  (11245,)
Shape of X_test:  (2812,)
Category
3    8000
2     753
4     718
0     673
5     582
1     519
Name: count, dtype: int64
Category
3    2000
2     188
4     180
0     168
5     146
1     130
Name: count, dtype: int64


In [198]:
#First attempted to use KNeighborsClassifier
#RandomForestClassifier produced better results

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline


In [199]:
'''clf.fit(X_train, y_train)'''

'clf.fit(X_train, y_train)'

In [200]:
'''y_pred = clf.predict(X_test)'''

'y_pred = clf.predict(X_test)'

In [201]:
'''from sklearn.metrics import classification_report'''  

'from sklearn.metrics import classification_report'

In [202]:
'''print(classification_report(y_test, y_pred))'''

'print(classification_report(y_test, y_pred))'

In [203]:
#Attempted using RandomForestClassifier (the experiment is kept below as a quoted string, so none of it executes)
#Preprocessing and removing punctuation produced better results

'''from sklearn.ensemble import RandomForestClassifier


clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),        #using the ngram_range parameter 
     ('Random Forest', RandomForestClassifier())         
])


clf.fit(X_train, y_train)



y_pred = clf.predict(X_test)



print(classification_report(y_test, y_pred))'''

"from sklearn.ensemble import RandomForestClassifier\n\n\nclf = Pipeline([\n     ('vectorizer_tfidf',TfidfVectorizer()),        #using the ngram_range parameter \n     ('Random Forest', RandomForestClassifier())         \n])\n\n\nclf.fit(X_train, y_train)\n\n\n\ny_pred = clf.predict(X_test)\n\n\n\nprint(classification_report(y_test, y_pred))"

In [204]:
#preprocessing to remove punctuation

# utility function for pre-processing the text
import spacy

# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm") 

def preprocess(text):
    """Return `text` lemmatized, with stop words and punctuation removed.

    The text is run through the module-level spaCy pipeline `nlp`. Every token
    that is neither a stop word nor punctuation contributes its lemma, and the
    lemmas are joined with single spaces.
    """
    lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(lemmas)

In [205]:
# Clean every training text with `preprocess` and keep the result in a new column.
sample['preprocessed_txt'] = sample['Text'].map(preprocess)

In [206]:
sample

Unnamed: 0,Category,Text,preprocessed_txt
0,0,"Do men enjoy sex more, or women? The Mahabhara...",man enjoy sex woman Mahabharata answer Book Ex...
11664,0,Why Disha Patani playing Sangamithra hurts my ...,Disha Patani play Sangamithra hurt Tamil sensi...
14812,0,The designer behind Deepika's wedding sari who...,designer Deepika wedding sari correct Sabyasac...
4135,0,"Sex, violence aur pyaar: The preposterous plea...",sex violence aur pyaar preposterous pleasure M...
4133,0,"Why, Vivek Oberoi, Why? Just when his career c...",Vivek Oberoi career revive star Vivek Oberoi w...
...,...,...,...
12241,5,What Google and Facebook are doing to tackle I...,Google Facebook tackle ISIS propaganda machine...
7151,5,Samsung Galaxy C8: When will it hit Indian mar...,Samsung Galaxy C8 hit indian market tech giant...
10634,5,"No, India isn't ready for Mumbai-Pune hyperloo...",India ready Mumbai Pune hyperloop deny benefit...
13591,5,The Planetary Health Diet: How to save yoursel...,Planetary Health Diet save earth new food plan...


In [218]:
'''X_train, X_test, y_train, y_test = train_test_split(
    sample.preprocessed_txt, 
    sample.Category,
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=sample.Category
)'''

In [223]:

'''clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),         
     ('Random Forest', RandomForestClassifier())         
])


clf.fit(X_train, y_train)



y_pred = clf.predict(X_test)



print(classification_report(y_test, y_pred))

'''
### Got my first output here. Will attempt to train the model on the full
### training dataset to check whether accuracy improves.

"\nclf = Pipeline([\n     ('vectorizer_tfidf',TfidfVectorizer()),         \n     ('Random Forest', RandomForestClassifier())         \n])\n\n\nclf.fit(X_train, y_train)\n\n\n\ny_pred = clf.predict(X_test)\n\n\n\nprint(classification_report(y_test, y_pred))\n\n"

In [239]:
### Train on 100% of the labelled data (no hold-out split).
### This gave the best score.
X_train = sample['preprocessed_txt']
y_train = sample['Category']


In [225]:
# Imported here because no executable cell above brings RandomForestClassifier
# into scope; without this a fresh-kernel "Restart & Run All" stops with a
# NameError on the Pipeline definition below.
from sklearn.ensemble import RandomForestClassifier

# Final model: TF-IDF features feeding a random forest.
# random_state fixes the forest's randomness so that re-running the notebook
# reproduces the same fitted model and therefore the same submission.
clf = Pipeline([
    ('vectorizer_tfidf', TfidfVectorizer()),
    ('Random Forest', RandomForestClassifier(random_state=2022)),
])

# Fit on whatever X_train / y_train currently hold; the fitted pipeline is the
# cell's displayed value.
clf.fit(X_train, y_train)

In [227]:
### Test data processing: load the competition test set.
test_path = "/kaggle/input/wec-rec-test/test.csv"
test_file = pd.read_csv(test_path)

In [230]:
# Work on a copy: `test_test = test_file` would only alias the frame, so adding
# the 'Text' column would silently modify `test_file` as well.
test_test = test_file.copy()

# Fill missing titles/headlines BEFORE concatenating: `str + NaN` is NaN, so
# filling afterwards would blank the entire text whenever either field is
# missing and throw away the field that was present.
# .str.strip() removes the lone separator left when one side is empty.
test_test['Text'] = (
    test_test['News_title'].fillna('') + ' ' + test_test['News_headline'].fillna('')
).str.strip()

# Keep only the combined text; the IDs are still available in `test_file`.
test_test = test_test.drop(columns=['ID','News_headline','News_title'])


In [231]:
test_test


Unnamed: 0,Text
0,How Blockchain can usher land reforms in the c...
1,What Kamal Nath as president means for Congres...
2,Why BJP's sweep in Rajasthan civic polls shoul...
3,Harsha Bhogle got the boot for rising intolera...
4,Amit Shah fighting Lok Sabha in Gandhinagar is...
...,...
3479,What game are BJP and Opposition playing over ...
3480,Afghanistan represents victory of India's soft...
3481,Why warring AIADMK factions want a piece of Ja...
3482,Mahasweta Devi was mother to downtrodden and S...


In [232]:
# Clean the test text with the same `preprocess` function used for training.
test_test['preprocessed_txt'] = test_test['Text'].map(preprocess)

In [233]:
# Predict a class code for every preprocessed test text.
test_features = test_test['preprocessed_txt']
y_pred = clf.predict(test_features)

In [234]:
# Start the submission frame from the test IDs alone.
test_out = test_file['ID'].to_frame()

In [235]:
# Attach the predicted class codes next to their IDs.
test_out = test_out.assign(Category=y_pred)

In [236]:
test_out

Unnamed: 0,ID,Category
0,1,3
1,2,3
2,3,3
3,4,4
4,5,3
...,...,...
3479,3480,3
3480,3481,3
3481,3482,3
3482,3483,3


In [237]:
# How many test rows were assigned to each class code.
y = pd.Series(data=y_pred)
y.value_counts()

3    2932
4     165
5     135
0     131
2      64
1      57
Name: count, dtype: int64

In [238]:
# Save the submission (ID, Category) without the DataFrame index column.
submission_path = '/kaggle/working/outp_1.csv'
test_out.to_csv(submission_path, index=False)

In [1]:
##To improve the F1 score, we could tune the preprocessing so that it does not remove too many words.
##We could also try training for more epochs.
##Below, I tried combining a neural network (NN) with vectorization.
##That output scored 0.83, which was satisfactory, so I am moving on to the next task.

In [None]:
'''import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)'''

In [240]:
'''df=pd.read_csv('/kaggle/input/wec-rec/news_train.csv')

label = 'Category'
sample_size = 10000

#Mapping according the task
mapping = {
    "Arts": 0,
    "business": 1,
    "humour": 2,
    "politics": 3,
    "sports": 4,
    "tech": 5
}
df[label] = df[label].map(mapping)

sample = df.sort_values([label], ascending=True)\
               .groupby(label).head(sample_size)

sample['News_headline'] = sample['News_headline'].fillna('')
sample['News_title'] = sample['News_title'].fillna('')'''

In [241]:
# NOTE(review): the recorded output below lists ID / News_title / News_headline,
# columns that the executable cells earlier in this notebook drop from `sample`.
# Presumably it was captured while the quoted-out reload cell above was still
# live code, so a top-to-bottom re-run would display something different.
# TODO confirm by re-running.
sample


Unnamed: 0,ID,News_title,News_headline,Category
0,1,"Do men enjoy sex more, or women? The Mahabhara...","[Book Extract] From Anushasana Parva, translat...",0
11664,11665,Why Disha Patani playing Sangamithra hurts my ...,It is really unfortunate that Shruti Haasan wa...,0
14812,14813,The designer behind Deepika's wedding sari who...,It was not customised for Deepika. But all des...,0
4135,4136,"Sex, violence aur pyaar: The preposterous plea...",Anurag Kashyap's latest movie breaks new groun...,0
4133,4134,"Why, Vivek Oberoi, Why? Just when his career c...",Vivek Oberoi wasted his talent getting embroil...,0
...,...,...,...,...
12241,12242,What Google and Facebook are doing to tackle I...,The tech giants have joined hands with G-7 cou...,5
7151,7152,Samsung Galaxy C8: When will it hit Indian mar...,It's the tech giant's third dual camera phone ...,5
10634,10635,"No, India isn't ready for Mumbai-Pune hyperloop",There's no denying the benefits of the technol...,5
13591,13592,The Planetary Health Diet: How to save yoursel...,"Less meat, less pollution. A new study finds l...",5


In [242]:

'''import spacy


nlp = spacy.load("en_core_web_sm") 

def preprocess(text):

    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) '''

In [243]:
# NOTE(review): repeated display of `sample`; the recorded output still shows
# the raw ID / News_title / News_headline columns, so it presumably reflects
# state created by cells that are now quoted out. TODO confirm by re-running.
sample


Unnamed: 0,ID,News_title,News_headline,Category
0,1,"Do men enjoy sex more, or women? The Mahabhara...","[Book Extract] From Anushasana Parva, translat...",0
11664,11665,Why Disha Patani playing Sangamithra hurts my ...,It is really unfortunate that Shruti Haasan wa...,0
14812,14813,The designer behind Deepika's wedding sari who...,It was not customised for Deepika. But all des...,0
4135,4136,"Sex, violence aur pyaar: The preposterous plea...",Anurag Kashyap's latest movie breaks new groun...,0
4133,4134,"Why, Vivek Oberoi, Why? Just when his career c...",Vivek Oberoi wasted his talent getting embroil...,0
...,...,...,...,...
12241,12242,What Google and Facebook are doing to tackle I...,The tech giants have joined hands with G-7 cou...,5
7151,7152,Samsung Galaxy C8: When will it hit Indian mar...,It's the tech giant's third dual camera phone ...,5
10634,10635,"No, India isn't ready for Mumbai-Pune hyperloop",There's no denying the benefits of the technol...,5
13591,13592,The Planetary Health Diet: How to save yoursel...,"Less meat, less pollution. A new study finds l...",5


In [244]:
'''sample['pre_title'] = sample['News_title'].apply(preprocess) 
sample['pre_head'] = sample['News_headline'].apply(preprocess) '''

In [245]:
'''clf1 = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),         
     ('Random Forest', RandomForestClassifier())         
])

clf2 = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),         
     ('Random Forest', RandomForestClassifier())         
])'''

In [247]:
'''x_train1,y_train1 = sample['pre_title'], sample['Category']
x_train2,y_train2 = sample['pre_head'], sample['Category']'''

In [248]:
# NOTE(review): the recorded output below contains `pre_title` / `pre_head`
# columns, which are only created inside a quoted-out cell above. Presumably
# that cell was live when this output was captured; a top-to-bottom re-run
# would not show these columns. TODO confirm by re-running.
sample


Unnamed: 0,ID,News_title,News_headline,Category,pre_title,pre_head
0,1,"Do men enjoy sex more, or women? The Mahabhara...","[Book Extract] From Anushasana Parva, translat...",0,man enjoy sex woman Mahabharata answer,Book Extract Anushasana Parva translate Arshia...
11664,11665,Why Disha Patani playing Sangamithra hurts my ...,It is really unfortunate that Shruti Haasan wa...,0,Disha Patani play Sangamithra hurt Tamil sensi...,unfortunate Shruti Haasan walk movie
14812,14813,The designer behind Deepika's wedding sari who...,It was not customised for Deepika. But all des...,0,designer Deepika wedding sari correct Sabyasac...,customise Deepika design edition exclusive
4135,4136,"Sex, violence aur pyaar: The preposterous plea...",Anurag Kashyap's latest movie breaks new groun...,0,sex violence aur pyaar preposterous pleasure M...,Anurag Kashyap late movie break new ground pic...
4133,4134,"Why, Vivek Oberoi, Why? Just when his career c...",Vivek Oberoi wasted his talent getting embroil...,0,Vivek Oberoi career revive star,Vivek Oberoi waste talent getting embroil huge...
...,...,...,...,...,...,...
12241,12242,What Google and Facebook are doing to tackle I...,The tech giants have joined hands with G-7 cou...,5,Google Facebook tackle ISIS propaganda machinery,tech giant join hand g-7 country draw effectiv...
7151,7152,Samsung Galaxy C8: When will it hit Indian mar...,It's the tech giant's third dual camera phone ...,5,Samsung Galaxy C8 hit indian market,tech giant dual camera phone month
10634,10635,"No, India isn't ready for Mumbai-Pune hyperloop",There's no denying the benefits of the technol...,5,India ready Mumbai Pune hyperloop,deny benefit technology catch
13591,13592,The Planetary Health Diet: How to save yoursel...,"Less meat, less pollution. A new study finds l...",5,Planetary Health Diet save earth new food plan,meat pollution new study find meat consumption...


In [249]:
'''clf1.fit(x_train1, y_train1)
clf2.fit(x_train2, y_train2)'''

In [250]:
'''test_file= pd.read_csv("/kaggle/input/wec-rec-test/test.csv")'''

In [None]:
'''test_test = test_file
test_test['Text'] = test_test['News_title']+' '+test_test['News_headline']
test_test = test_test.drop(columns=['ID','News_headline','News_title'])
test_test['Text'] = test_test['Text'].fillna('')'''