# Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import re
import string
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import nltk
from tqdm import tqdm
from preprocess import clean_text

# Reading inputs

In [2]:
data_train = pd.read_excel("../data/Data_Train.xlsx")
data_test = pd.read_excel("../data/Data_Test.xlsx")

In [3]:
data_train.head(5)

Unnamed: 0,STORY,SECTION
0,But the most painful was the huge reversal in ...,3.0
1,How formidable is the opposition alliance amon...,0.0
2,Most Asian currencies were trading lower today...,3.0
3,"If you want to answer any question, click on ‘...",1.0
4,"In global markets, gold prices edged up today ...",3.0


In [4]:
data_test.head(5)

Unnamed: 0,STORY
0,2019 will see gadgets like gaming smartphones ...
1,It has also unleashed a wave of changes in the...
2,It can be confusing to pick the right smartpho...
3,The mobile application is integrated with a da...
4,We have rounded up some of the gadgets that sh...


## Data Cleaning

In [5]:
from preprocess import clean_text

## Train Test Split

In [36]:
from sklearn.model_selection import train_test_split

In [37]:
X = data_train.drop(columns = ['SECTION'])
y = data_train['SECTION']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = .2,random_state = 42,stratify = y)

In [38]:
X_train.shape

(6102, 7)

In [39]:
X_test.shape

(1526, 7)

In [40]:
y_train.value_counts(normalize=True)

1    0.363324
2    0.252212
0    0.221075
3    0.163389
Name: SECTION, dtype: float64

In [41]:
y_test.value_counts(normalize=True)

1    0.363696
2    0.252294
0    0.220839
3    0.163172
Name: SECTION, dtype: float64

# Vectorization

## TfIdf

In [42]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

#### TfIdf

In [44]:
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train['clean_story'])
X_test_tfidf = tfidf.transform(X_test['clean_story'])


# Model building 


#### Choice of metrics : Accuracy as dataset is balanced.
(However we will also look at individual precison and recall)

In [58]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

## Building Logistic Regression model for all six vectors.

In [71]:
from sklearn.linear_model import LogisticRegression

### TFIDF LR

In [74]:
lr_tfidf = LogisticRegression()
lr_tfidf.fit(X_train_tfidf,y_train)
lr_tfidf_pred = lr_tfidf.predict(X_test_tfidf)
lr_tfidf_acc = accuracy_score(y_test,lr_tfidf_pred)

In [75]:
lr_tfidf_acc

0.9665792922673656

## Performance metrics dataset

In [86]:
performance_metrics = performance_metrics.assign(max_Acc = lambda x : (x.max(axis = 1)))
performance_metrics

Unnamed: 0,bow,tfidf,avg_w2v,avg_w2v_eng_only,tfidf_w2v,tfidf_w2v_eng_only,max_Acc
KNN,0.75426,0.953473,0.942988,0.930537,0.936435,0.919397,0.953473
LR,0.963303,0.966579,0.951507,0.944954,0.949541,0.935125,0.966579


#### Gridsearch CV

In [87]:
from sklearn.model_selection import GridSearchCV

## Hyperparameter tuning for LR tfidf

In [94]:
lr_tfidf_cv = LogisticRegression()
param_grid_lr = { 'C' : [900,950,1000,1100,1150,1200],
     'penalty' : ['l1', 'l2']}

lr_search = GridSearchCV(estimator=lr_tfidf_cv, param_grid=param_grid_lr, scoring='accuracy', n_jobs=-1, cv=5)

lr_search.fit(X_train_tfidf,y_train)

        nan 0.97394517        nan 0.97394503        nan 0.97443697]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


GridSearchCV(cv=5, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': [900, 950, 1000, 1100, 1150, 1200],
                         'penalty': ['l1', 'l2']},
             scoring='accuracy')

In [95]:
lr_search.best_params_

{'C': 900, 'penalty': 'l2'}

In [96]:
lr_search.best_score_

0.9747645708301447

In [97]:
#best model
lr_search.best_estimator_

LogisticRegression(C=900)

In [98]:
best_model = LogisticRegression(C = 900 , penalty = 'l2')
best_model

LogisticRegression(C=900)

## Generating predictions on test data

In [99]:
data_test.head()

Unnamed: 0,STORY
0,2019 will see gadgets like gaming smartphones ...
1,It has also unleashed a wave of changes in the...
2,It can be confusing to pick the right smartpho...
3,The mobile application is integrated with a da...
4,We have rounded up some of the gadgets that sh...


In [100]:
data_test['clean_story'] = data_test['STORY'].apply(clean_text)

In [101]:
data_test.head()

Unnamed: 0,STORY,clean_story
0,2019 will see gadgets like gaming smartphones ...,see gadget like game smartphones wearable medi...
1,It has also unleashed a wave of changes in the...,also unleashed wave change mcu make sure futur...
2,It can be confusing to pick the right smartpho...,confuse pick right smartphone yourself segrega...
3,The mobile application is integrated with a da...,mobile application integrate dashboard confirm...
4,We have rounded up some of the gadgets that sh...,round gadget show left indelible mark on consu...


In [102]:
tfidf_data_test = tfidf.transform(data_test['clean_story'])

In [103]:
data_test_pred = lr_search.best_estimator_.predict(tfidf_data_test)

In [104]:
data_test_pred

array([1, 2, 1, ..., 1, 0, 1], dtype=int64)

In [105]:
data_test_pred_df = pd.DataFrame(data_test_pred,columns = ['SECTION'])

In [106]:
data_test_pred_df.to_excel('News_Category_Prediction.xlsx',index = False)

In [107]:
data_test_pred_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2748 entries, 0 to 2747
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   SECTION  2748 non-null   int64
dtypes: int64(1)
memory usage: 21.6 KB


In [108]:
data_test.shape

(2748, 2)

In [109]:
len(data_test_pred_df)

2748